[clang] [clang-tools-extra] [libc] [llvm] [libc][math] Added missing floating point exception for atanpif16 (PR #186597)
via cfe-commits
cfe-commits at lists.llvm.org
Tue Mar 17 09:55:03 PDT 2026
https://github.com/Sukumarsawant updated https://github.com/llvm/llvm-project/pull/186597
>From 0174c55f047cb20bd12c9a747c48861e19bff09c Mon Sep 17 00:00:00 2001
From: Sukumarsawant <sawantsukumar at gmail.com>
Date: Sat, 14 Mar 2026 18:49:20 +0530
Subject: [PATCH 1/7] fix: fp exception for underflow
---
.gitignore | 4 ++++
libc/src/math/generic/atanpif16.cpp | 13 ++++++++++++-
libc/src/math/generic/libatanpif16.a | Bin 0 -> 114088 bytes
3 files changed, 16 insertions(+), 1 deletion(-)
create mode 100644 libc/src/math/generic/libatanpif16.a
diff --git a/.gitignore b/.gitignore
index fa133b2d09834..9294f96ad0adf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -87,3 +87,7 @@ pythonenv*
/clang/utils/analyzer/projects/*/RefScanBuildResults
# automodapi puts generated documentation files here.
/lldb/docs/python_api/
+
+/libc/src/math/generic/wrapper.cpp
+/libc/src/math/generic/atanpif16.o
+/libc/src/math/generic/wrapper.o
\ No newline at end of file
diff --git a/libc/src/math/generic/atanpif16.cpp b/libc/src/math/generic/atanpif16.cpp
index c54087c7165fe..6682b886fa7a7 100644
--- a/libc/src/math/generic/atanpif16.cpp
+++ b/libc/src/math/generic/atanpif16.cpp
@@ -120,7 +120,18 @@ LLVM_LIBC_FUNCTION(float16, atanpif16, (float16 x)) {
// Case 1: |x| <= 0.5 - Direct polynomial evaluation
if (LIBC_LIKELY(x_abs <= 0.5)) {
double result = atanpi_eval(x_abs);
- return signed_result(result);
+ float16 s_result = signed_result(result);
+ // clear the underflow raised by casting
+ fputil::clear_except_if_required(FE_UNDERFLOW);
+ int rounding = fputil::quick_get_round();
+ // Inputs found by exhaustive testing whose rounding (up or down) would otherwise cause a spurious or missing underflow.
+ bool except_value = (rounding == FE_UPWARD && xbits.uintval() == 0x0a48) ||
+ (rounding == FE_DOWNWARD && xbits.uintval() == 0x8a48);
+
+ if (result != 0.0 && result < 0x1p-14 && !except_value)
+ fputil::raise_except_if_required(FE_UNDERFLOW);
+
+ return s_result;
}
// case 2: 0.5 < |x| <= 1 - use double-angle reduction
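
For context (not part of the patch): the hunk above clears the underflow flag that the double-to-float16 cast may set, then re-raises FE_UNDERFLOW only when the exact result is nonzero and below float16's smallest normal value (0x1p-14), skipping the two exhaustively-found inputs whose rounding crosses that boundary. A minimal standalone sketch of the same clear-then-conditionally-re-raise pattern, using standard <cfenv> and float (smallest normal 0x1p-126) in place of float16 and the libc fputil helpers, might look like this:

// Illustrative sketch only; names and the float threshold are not the libc internals.
#include <cfenv>
#include <cmath>
#include <cstdio>

#pragma STDC FENV_ACCESS ON

static float narrow_with_underflow_check(double result) {
  // The narrowing cast may raise a spurious (or miss a required) underflow.
  float narrowed = static_cast<float>(result);
  std::feclearexcept(FE_UNDERFLOW);

  // Re-raise underflow only when the exact result is nonzero and below the
  // smallest normal value of the destination format (0x1p-126 for float).
  if (result != 0.0 && std::fabs(result) < 0x1p-126)
    std::feraiseexcept(FE_UNDERFLOW);

  return narrowed;
}

int main() {
  std::feclearexcept(FE_ALL_EXCEPT);
  float y = narrow_with_underflow_check(0x1p-130); // subnormal in float
  std::printf("y = %a, underflow raised: %d\n", y,
              std::fetestexcept(FE_UNDERFLOW) != 0);
  return 0;
}
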
diff --git a/libc/src/math/generic/libatanpif16.a b/libc/src/math/generic/libatanpif16.a
new file mode 100644
index 0000000000000000000000000000000000000000..9ed17bd932e5928927ec005fa26d170c465af316
GIT binary patch
literal 114088
[base85-encoded binary data omitted]
literal 0
HcmV?d00001
>From d11c90c0f4371678d2f6b90a6ccb04c592981842 Mon Sep 17 00:00:00 2001
From: Sukumarsawant <sawantsukumar at gmail.com>
Date: Sat, 14 Mar 2026 19:54:34 +0530
Subject: [PATCH 2/7] nit
---
.gitignore | 4 ----
libc/src/math/generic/libatanpif16.a | Bin 114088 -> 0 bytes
2 files changed, 4 deletions(-)
delete mode 100644 libc/src/math/generic/libatanpif16.a
diff --git a/.gitignore b/.gitignore
index 9294f96ad0adf..fa133b2d09834 100644
--- a/.gitignore
+++ b/.gitignore
@@ -87,7 +87,3 @@ pythonenv*
/clang/utils/analyzer/projects/*/RefScanBuildResults
# automodapi puts generated documentation files here.
/lldb/docs/python_api/
-
-/libc/src/math/generic/wrapper.cpp
-/libc/src/math/generic/atanpif16.o
-/libc/src/math/generic/wrapper.o
\ No newline at end of file
diff --git a/libc/src/math/generic/libatanpif16.a b/libc/src/math/generic/libatanpif16.a
deleted file mode 100644
index 9ed17bd932e5928927ec005fa26d170c465af316..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001
literal 114088
[base85-encoded binary data omitted]
z0+;jNJ3_DIKVOtTN64YGjFB(>N8+c6@^ap55%_6BPOre_{2<5CKT*c$EfD49dMf=y
zu19&Iyu`!SuQ0uGJ(c#C_L1x97@=1_cRN+!@*YX<C!a?d)81*K{EfnH*9rWuqWl8_
zmvS}>{3api*8-Pv{=_)VVL!l+;d2n`QlNEqFMbS8V~_&jRL0=crxXY$S!RDudo2ot
zOP&u3T+UZN5x6|R3<|svX`}a0QieeCWqVH)xNPr50+)Jch2S>{{1TzJTi{a87X&WP
zcRv#NWFcphz@?uI30#){Z-LA5pNJ|m1ng|ivs7&eKgMpiiSp8JpBK2a+iJ%Ba-I_9
zrJR=pF6A6f9fjcUuTumr%U>#R(rwO-iv%w9-X(A;=LZ6payAQG`sZ$eOaDJeoHv_9
zKVBtpl5hOq{2eP2N4PnME)?bEI=)oklE0j{&k}l1hfE42{|JE}sC~u9m2 at p%-ZRVn
zs$4hazC!L>rVBZ;y>cFI7W_{U<>kIHA at CMa{!;>%{d<GJTSfU+fy?qU1%9(A-ym>V
z{xX5LiSly<F3Xn*yj_%+_L1eyb6e`)4pF{b$dTpc`qC-NUl>w8mvNG at K$JKBM)*ej
zn0cd6lz&>_)dJrn at NR+A+GOOc48aQoPP$Df;{;)O21H!VX~K;!5!6K=2!xwG0DWIg
z9|(khSf%twKc@*lmeUjsa++{s69vIm6bPpu^LOtG1a8haRF3XoDNtE+W~A at 5m!@Fk
zn7@<PC2+Inp>nGQZqCSr_Y2(YVF(`-IO#BdZ%--2cca(LQw0PFyo=|fGJ%`D4V9}F
zI3>;B;p_6@=I`*W7I*;(;LtB{N)EE#L4l7jg(>Fsg|Z6;ULf$(1YRa^GiOqEwZP91
z<+}uarodMV+{}%X-7oNwqWqx1&77>VX+R?o|Fcy}e-sGZ>^Ui?OyFjW6J9NFbJifd
zOW@{gNcd`jpGN?Peu0}iBFY*Rc+3>0nEQwL7Yn>V;AU at 3*<}JZYX{-g0v}5Nhc1De
zIgqke3w*pOjJRLm#7Vsjj%d(_oBLmrPi5)H{C&8BL+}N`;4ArinGavh->U^aQCC%;
zT>>{}D9SMPNWPhq3GWx>DbIW#6!=9{(1b&Eg36W|3~_<LFA;c|z$Xd3THuof-X(BX
z;Hw2bMd1Aczf|CZ0yk at x%I1ZK_*aPX1p+r~D`lI%TS4XN$JnJ>l)sz`a_ACxr2!GI
z7Wh<w_Y2&d#VC7F;8%$9=1+nWmnwl59Hzezex<<61a8)P%B~i;Ir|gdC2+GRA$+yK
zXA!`mU*P5rnX(22ezhq~@!|Rl at tm!b>XUvSlLFz_sFeOF6Zo|PuNJsDt5SBCz|CHW
z at YMppo&XO00yld$${G}S+!Utx2>pe4R_mntED(5&z{><aSK!qGuN8Qg!0QCQTHy5p
z?-zK3zy}3BPvGVkG>HFvffpR9zYv}fc$vU&6nM43ZxVQyz#9d=THxkxld}5--Ym)w
z3cN+&`afaReq#30lwEL?E=2rq)=BkQCh#_aR|~vd;9Uam5cq0=cM7~;;0pvkDDYbZ
zZvI3s at mwhIf)D90gqyoHLdpdGNnJ#JRtwymK`5h3;EP20)dF8E at aF`6o4~gVyi4Fm
z at +O#gE)n=Bfm5Et^KYfVZx`ho1>P<2I|aT};13CWnZTbD_#Fb at F7VF?{K$O$g*f&I
ze3Za at 1zsue<pOUMc%Q)U6!@J2e at Ngf1pb`BKP&L<0>4Y(NAh8V_^%ZBD1m=Y;FSXZ
zyucd;ez(By6!<*?e at Ni>3j8^Ne?j2e1^z{WA9;-aLj3O&_$YzjFYro%e at Wnt0{^nW
z?-ckdfj=bh2L%3{z`r8!?E+sd at FPF0zYzbg3Vf8nzb5cXf&ZJp8wLJ#f!`_cZwUM$
zfqzrr&k3A374W#*F7R)O@<$%4zfjpV0v{#te;0VAz`rf<MuC4v;CBlAA%Q<6 at b3!z
zIe~vq;M)cMeSsf&oc==m*9v@;!2d(wl>+~vz#9erBZ1#3 at P`HdkidT|@aF{nh`_fC
ze4W6LJYIhx{y!1;D1kpJ at JfOIRN##QUoY at G1^$@89}@WE0)I~6PY8Uwz=>-a`_+*j
z(S-=3pDxBn`S5<mD}8tY?`s=<xcR$rclz)$F8`3gpV1W5=g)JV&-q2pK+Z37a$m^#
zRnB2A=KLn-*ExqDapX%mM|~*&w>iJdc{%5wIlFRxpYwLkJ2_i(26O(HvnS_YIj`jW
zDd+go37<IsFFCK~eDw94uE>(e?UC-t(#W#NXCfE(Mpi_YNA8S#F>+tzfyh at PUy6J-
z^3}-7$mb$=NB%AH^~et*{}K5{<eQOiMb<>V9r^djcOpNGJQVqE<a?3tN7hCjj{G?C
zNaWGTPa^9gKaH%9+!J{$@_6Kl$fx at w8zN6eeinH;@>FDFWK(2wWP8ptk^hW57x{VQ
z`N(e~TOuz-ejE8^<b}wuBEO5g9C<OaE$6Q}|CRH%oWJM%KC(6PV9p;SgONW+UXA=E
z@>=Be$WUZk<WG^mM*b_ZJ at U868<8D3e~;{l{3G&a<gLig$Uh?=>(AMc^JLD?a-Pch
zN6t{r&YWi>r|gP2qw8uqYDUMey>?nG7LPYJE at +B3Cg#?TYiRB4NHmV0SKnOUmZ*)5
zy}2_{dsBQ~eMh{lrL(!tUEszRI-}d_8*4^))GzFSglU)CB*bEqZ>_0I)J|z^sp+Vw
zY#LYMy5k#a+B at Pk?G<&Eb7PR?&WtbES6RgePR!&fv2pQGMz+S4CYn3y+nQ?{CrqiD
z(b3XYGq1j)a at vfd__!%mGjDCJcZ*!N)Lq!x(p=x%F{Y_zp^J7JbEOCzQ`_3=NsCQr
z?rb8nqbr&c9qsW%Q)^>I6H#-Gl=PPO#DU!O`qm1PkwUK6WvZ7zM at 60My2UZ-;QEEF
zjfvWXN+{;U`T*!FO3|mRJ-kk1sf-;<^Y3h)o2Y58ucL<SBY$gW7Q1e2?E51ceqt3p
ztOaBThY>D?4dQdrztZ!p_Dm~{eEGmhBV!hmxc0=n14r;civ-Kk14=LhX(clf^P0yc
z+T*P))N7bnQ)UXDna3<F9bZ{oRNGiz(-ub>&R$#J+L35!R$ZMkTy6C;>F!vJs3aN^
zwKdIku`%RE at tVfQI0jWvNm{3}&O!D$VNA{3;hNswUSBzZW~!!zwe4-`W~WOM^D3Ho
z>8YGs<TknEVcbvFx3#3Jsn<ZyQl)Ahol)PEZW=wX^6e%H)v-^VO at 8k3Y*}J8%|3~h
z%ijSMOLK>G%>yW!CgS%=w6yV<rbKhV`}W(!MygpgQUqqP!_eG}l+=MuV#e0Awbk5;
z4M`I=BxC2+&$BcpwH8lW-cSnLFTqA7m~Vr&**9_M&3zG<@j#CiV<y4Q7*kPmO}3;%
zwAhFH`=Keg|D#pseQwCUsR-M`h8SxM+L35(h_|#IxaI_{6{EJ0x7IW!n&%xjO(Bi4
zYO at bg%W$)<zQHz)@5GU2K}{n~PWva at -U=x{WshPV;5bI{*#rmCjyUtmyq-Q|WLMXl
zqe8}K&}M3Ews+2LZfR at MmP^ktoff<G(r8MV?=3zMdPp!!|Ag>X?2~M{{-?^$*tbGC
z)A_oNHg*ZOc0nR5f3?aau9?d8w&>0E^8(Ab)swyc&Tb}Cvt`ygGWJ{&k9ji7CWMH$
zyKI<>ee$Q2EtHM;K1r2cIwn)8wtKA!dIm^*vi|+F=md0HL!vp+p|tGZKuU4?4%|U>
zw8Y!>MOM6nF0is$2CT!1y389^P4Byg8{ZENDOL*6kbcUyc}%Q`FIed|)VqIlaZwhh
zjrN54=9;;U_3=bQ#q64z_BO)Ym~sHRPNiK(YYP_a_=1X=aksIfz0ItcGseZ~f at R9|
zizm&jxMEto(w(O7UMZu*bnS4YgU9CLj+Tm!G~D#gtD}BiTTP>8)eKgOH_>jdX{rZ2
zV{Ck`>*8~Ns5E^bRBSWKFt_4i*EN-v8}zUpR^W&=rk*a`eRr at nVse!U8;vQ)+c8b;
za}SJUjFI$`hhG{))|KFf%XdxPRMXs%Xm76xG-RLD*_W|lRAxK_XSfBA6|1W)(Ww2?
zU~f=R%W#u{?<Q=R<9n>+5_S1*wv at FCrp+jcdzVXYmYo!)iQH{JH3cWQ+66P?G{gCa
zn$M3yDq=qJS4<tF!FrR5(^T=<J>wCQt>Ca-X>jaiD>;O+-6^vb9!7gO>1Hsc_KnZb
zVQz=rjOMssQsBJ4mS#D(TW1n`qhrOjZS^%BSaoX}JL_p<p$;!@M~5 at Iwxy{F4<?<a
z`lj~!j<BU`3|+TFjC#_bpE9`Nh3T<G#)Y-ywwgqHJwLyI<kUxv!*)|v&NU%<QJ1tr
zHrm3R!oT!Qswh at -p11AAltS&=U!O_tqdUldAaNd^#kV?mfZyDjXo!s)jqPEU4YGPt
zaMXu+QB%`!`X)4WHg+U#X=$q)+de<hfM+oDz<I*tL|c6=J*jeIu6iJ&?vc%&moJxR
z#^SSQ7Go3ZV#m6FT8a*YmT|$^JF(F96HB{N=sH$U2yFXO%chDBDLmX79&A8cd-e at z
z4A$9~AsTcs+wg>W*;KxVpa;&g+-aB3YHV`jwT)zpebkn*hox$hz4d-Ku!vi at KRUB*
zrcI~sVRq0d8)pQZa##loo0YLtryNKEvoiK;qRV}}GT#p*XXPfEmElCqEgkivFP%2)
z-1d%|+M9-7R%5rc;Ub40WZyX4WSYf&u7777X!?z4`T<d9*@lNHv(9;0G%9K58_CX~
zzh4r;JRmXTRSW-qCK1-9-toJ#xRlOt=>Lw+cD?h8H?*}hnN3cp1gJ{c4TTphE|D8D
z+z`dv>aoFTtFLp1t0bjoS}BIjVZ6Srt+^#ljj=8js+j=>S#08l8NlALG2m1Qc<=$y
zLVjS*s$p98%h6IUKojJ;;JsfGQ(V_bRBvw9)YY}&fh1eMem<NI7jAG@|LWpx8D0H|
zY}(~xPLId12G(U-7;he- at NnH6oZ83XMNzuSYMSeeZtrO8sG<1QCKb1}(2J;93Y=72
z(V<=+&AwjKKGsX5ucy}za}%o#OibxL*I|jKR;+`sRA7<Hz7k&L%&L+{Q72U8j6Agd
zX6)Ky@!>|WcL>6W#ZEp$*H0tZHv+U^xwEmZ=C%uXn7D3cv*o&-FReo_+ua1;T~{W+
zX8#x{vmYoH{zYtpZ>$gJfpn^f#cJ__cSoaoL#_grv7CI+V0Drck0;cl>A859(|a7@
zS!nv at xakjDGb>BlZ*J?TsDsa1dZ7j(T~7?3Uxeo(_3a(FgTx0OI3U3tN2G?Ezw~G<
zsl~hM`q4}n#o-CW-Qn{=a34w4 at o=@;ZFr^c2Ptp&)6$l1R*1=0?C3dA1|5;9c#OuN
zc>kc_eJ8eC>WZSZ at zyxjPI?F4VodU%neob|(v*%WiNlrfFU^eB+R}JyJ?`GJmmL@}
zG&(Cw at fL18>62Jg{3DUZOifMw%|seMMRDy4#q=b;qb;!@fme8Mjo0Eu;f}UW+{54j
zW5tXy at WQd`ijYgke5G4&r6-YD6vQ}Vi7rXk&m1uoz+(R1 at HMPXJ=w!Wk(gfo*Egz3
z`PTnqjq31bp#N<R8)o8<jm29~@mmrd^W$?{+S*!fp-q@&TF?W<jnO0T&SrWK5WjMw
z9>fcWhwJNq(-DPC6~R^4G606rW_BV<L-;NTd#dS|qaC*OJWW-++q$n+o9ok5wQe-W
z*VT8_BpTa`=~qN>S=^4UXlbNLt1dw|#}#;eVD8NLjN&kx#)_#ETNc!}H8koTq~fvo
zbo}@ho79U{i*=e3?iImmsh3j0!%iv#7$KbqzH&+~7?;)Qpmtk<_d$kPl_s}h*7vnC
zm2Q}>4^Gtwrs;!H^a1JlVASlR7F;xBe$#+$TGbVm*Tg4X;ZB({BNKPU(PG%&w!~G-
zxccVWmb!Y(*ClcPHK=<abYOe=zU!c?Yv0QaxHd|CkKzEeCA}X1W{tPb`uERD=$ey$
z`;4wS>*ICx?X_)*R=hyyPN!|Dw`Y%)&MqB4gUU=!776SFQ$&x`H{*8USjV!K9xI|>
zY}OZ<S-C>UQ|90!Ed6^(SYq`f_H4Q+M2G#E{s*NeR)k+!j`M5n2eMf~)2a7v52Oma
z<@R08Fn3(Z`G7m#I{7Ft<=mT%e}t(|nT|eq_338Z51`SOdH(&;>Tm1&+m~pTUxB(I
zr$4livX at hH&6uhyX3(5AT#;$+rNd5p$!F>~(loa=*3{PH7qgpNS}UriG$BlHa at BAA
z>9;ko!<-q%B(L_R{LTh0a%SVb4Qw9hv``wS_k8&!j^TV=C=MRINE>Sms4Iqax;|6t
zN}!Gar9VS;DOa8P*BRY}I_c1v%)E(Wv<xTIa%{ko?H9!=ypF1E?<kdDN}SP^ld39;
z;+%+;md3{zj1z<v8;lc##SF#?!iqE&8|Nj9fbDNsc((VaPg6vVdRl+oIVlBpa#FuK
zx1ZuD><}Mb#^0;>@4T37{_|JntFsvO55l)H57CKg{-a-|l!>qfis|pt7;z^k6n|qF
z{D>3(StmyEH?+8*-$JIoi61*P_3!)|0lEA~2Kmp`DgDnOlj(Tz;5poWA}2yWD>z?Z
zl=Qd%Wz<(Pef96-KT2l!Q~SxrhPP>d82 at 7}{C|%6e*Tx??|G5^4)N)C4g-iVN=M$O
zpTCRxHvXF}{7Gb{{0o at BDNpTx9`*hFmyEOXFJk`a-jv|yA7;Oi7W at 6eqM!aQqRoC)
ztiLctd$9jMh59!8?PmL#ex~-*pYAjEFnAdM5)1zU3;$C>_;)e?>(lT*A%uTE|M9$q
zT%Y*UpN`9vf0>2<FD?A9!rx=G+28z~(#z8DzY_Ir_U~c-a{SSsF3XgErG at _s7XJ5r
z%+9~)a7}zk8vb{qzK#Dn=D&vPll|#WZDq=TwuS$%P~SiP*1;)l_BVfj^`tcXe~9`v
z{?&KtipI~0|A|@fue0!f(ZatTf6vUu-~64}v(oUdMtvLqg%<u;>N2)}tA+n>Ed1|3
z&(1&p2u+;+j|Cb0^WQzFZ{xp``QOL&$^Hcpl&SwLvhe>k>ihlYVk~N;!oU7hVP2sy
zT1VcepFjOSi#GlvKdY-q|2Yj%ru=)DzbQ}ce+l*d{7(w;e_9tPd{jr?r=Ne={Jqj*
zzb%mB*MHy1cKfYn{prTv-KcM~Uzx>zBcb2e!`L;9|0)arH!b|n58>aRA^&k9{AXME
zpOXduHO$|%m-_!L3;*$E|Lfa74KjcJbDgNEL#F+|)?&ZkLWIBnAH+Ig8$S-#H42yO
z$our`{}Srk`oER!SHv}`{ggKLNZ0 at CE%y5z>ihYh8nXYaV*dQ+HbeP``QK9(`@L+@
zA4dn-?AOZrt31s*;kVzlsBg31YPMfB*C+dpMLJXa4Or~=Pt^C at Z+q0v-~9dReVu={
zSnRjUqW`T=*!7#glRc8#Zz8|_-avhu{hYgW4dZ8IKNQW_ey>{i{~q=I_S=Yqqpkn*
zKcw}iJ3nkdeH;H1Ec`FXg8vQ+|36sxKY;_LjeimIFVW5LKK=H86!mTVi<o~A*ChLw
zA)TrJ?y>mq+o<oi|7+M(+xWLK|9zc554v6x=V at 4--*;Q|{|pXl)4!7SkI-VgPrv;h
zM}3?9sxAHRLQ1Cg%V++kJlX%hE&LZ?gKFd7&;0q%{f6=nn?Fym=zqte|36_6oBl!8
z at 3)u#dzk(Li~e^l`hR++UH^{#Z2w4${;d}ME71A2_UGqmlke;NTg3XM{|;L8-%@7R
zU&Q*;t^aMPZ}Z=UmhoQ&{l*{7yd2hl6D|DzXyN|?1lsJ^#r#h}9U1)o^K;a<@n6II
z4Nn?BGax8a{^b__dr)8L67jT)?fh3W|8(ca3e>mp at 3-*3DhvKInZIc-wf|p&w`oUz
z$o^-5`O}?^41W8E_5U1;{r(I=Mz4tv!2%0vf%D%E){o(x68!peP~T?1Ef)J-2T8^r
z#xKJ7&$rm`J=8b6O?*=b{{kj at atiz4{a={=R*U^!vFM+PbH2 at fWvrjYcr5((y8`uX
z_S?bsTgWxZf2xtr)P9RB{Qrdd#%?A)0v%!FznX~_ppFcF{)eExjsN_Wx|Zo@;$I6W
zQ~o^`{(nJz!`sBSUu5Us&qS9<@<{vnFGhVE|1RcV&h?3Z1JarDzsJJ=Rn#}UO}wJW
z&cBKW{uQj>M1KA~sBh!H*}^}O1^?9+{;yg1UvQ3{|7zwRGsHRe^Djkx8~^;z=?c<+
z8UbZ$|A#F6Uq^jocP~EA&VP{kUq*vN1V8`1qwV}hGXE~FPyW*aIhpcbXW_pW^*!Fq
z={q6zFXD~+c)^{Le*WJKvHzOS>xxBOpZK>SohkpPEc|Jr@$-ND2)q4Tng6w{-$Z`?
zKS6yZ(|7-~h50WO{5!MYKVadXW8uFY8$`f?^>cvvr+fbKI_lf_SKY0NOZ!t>GPVC!
z=5NYV|3xhPFTn=c#=n3bwji9OBk$91|FHG*Rg3+i7X4Kr_h)6S|4L7 at PWbg-9&&%y
zWwGC4h%ojr^GX>19TxrvS@?es^MuWQtC at c~|NVE=xAp&8<}dxXD+~U6Ec_3)@LzSI
zoxl0daj(&4_dfmh|03$!_>a6tSCRa?v*4e5gRU>nUx!%uKR?ROzxp^`|9%~LpML(&
zqP~s)YUbb1HEI0Zfpn(+bF78`p%(u1ObKvc{&4u>Al>-;CF<JvSKo^a1mow#zZc&#
z<zHaopKIa&3JBWx7i7qPE9%?$FSPK#GYkHsEc}nN at b3xXUzQ>N?hyVfE&ORqpQ-&z
znZGGd{dc^D|37fRvDv?x`SUP<W*z+gA2$D$S at a)f(f<i_o=tyOhV~27UvBBYqb&M!
z%Ix}Av;Ip=!$aD?CuIM>hWoFG>yiDxfVAlkGp at q;&$RGA+`_*V_t!T2?O^_=rs01R
z>f6T87K{JUT$m~UY774(Ec~xN-_AduUUoo8xBr}u`ZoSMEcRaod71KWwD3RD!v7Ws
zwAsIk`KPmg8|vHm7m%|c7(XHZc at W<-<-gFv|05Rucj2Lojejfi$M!xY_{aZp)VJ|3
zwebI17W|i5_ at 7|m|F<$b|JBTYWD5J>`DF<8ZTzb&{OSLF%+&rXnZGGd{dbCm|5EJp
zZ1x{u{(gJ=zlY5q_gVCR%%Z<AME at Y`$M!NMWU7C)rT;!^(SJQU$ku-2_#;xZ2m5a}
z>f8EnzNP;jf-KV?W?Y5Y?;(r*PPOnqJ>>aK5i2-84gWCxKeFgQ(W3trbcoG<Rjl9N
zU;gi5`qx|ZpJdUWzy+^OfAxO0|0#?AoNUp*>U6vQF4liq8vB0{^=<ysW$~Yfv+$n*
zi~R~L{BzE*^IyySalB3mnfBini~T-i(f>xs`FVi#r#pXc3pqdE$M##u>}mdc6zNRI
z->Vk>BP{%XfO!gV;QUj>hJ$%ig5Q4h|CieOKmUum8k<2qe|`*5ru=tU_~%*pAB+Xo
z#=n~RuSsDaoc|ouxA7mz{L8sM*}p#v{(CI^^DX=z17_o&f1*~*+c0Jk<iF0y^xeO;
zGJoPsf%yLnai;dqjjL+TkvdZKk2bo1DVrg#M+e*ZuVwyoxW0+}{zLPe&HhFA=^E1h
z8v$j?|5ywEV=VlCezu)|KF*aCXw8$s&wn%O+xRcE at P7t^GUZ=j;s0R^|C4dgX|sPn
z^S at 4#N7~Q-BdBlV-*4gnTo(LCS@@r3;XgUV{ssKturCe&i$d(5ONT84=|9iod#3g;
zW&Wl-ji1vk{5wb3?Z1}!pQ2bKKm7I&+dq_9^v5jv+koL;VE!Ip{rnoAM<7%E<re+L
z7X2?CX4gN+`i=bzflT#RS^Vb=i~hf1-m|rT2kSrADCXF&e+cz${$IuZQ^fVC|F<A*
z+H(TqVf{D9!v9PQ|2)i-HvR?dIB at Tj;OBoh>f88tF at M98_`eKEnev}+;s0?9|ARw*
zf1rx_e>{bKaQ<;Zet+OT3;#c4!N0 at 8f24)~Ga>xDm_NIb*9yP=p9<l>*24dlEckaZ
zf74&oe`j0xpD at bqKdYHPFT<hy!~D0$V!yL2`k%_P>+fg%r+S)o!f(GPP~Yain=STx
zEereIW8r^}h5xx>^H+xaN20!s|Dc8cwk-Isw)FoP3;(Y~?e=#*rtQt!0R07 at e*1^@
z{~C+^=yz#UTOr59_k^6k3RwRLPqR+=_1}g1Hv8?i*zXO9Fyq0 at YhnC<WZ^%`!ha{;
zlLQ=CKdYGk)HMA6j`}wK`Crl%tGPan-+v&TDgS;;|DS8&KM3VE{;T)1|2JFgH`=2A
zKM%3%?`Qqp+<p`J`+p<q+w9lM_A3(h+m(g=UbOH(&%%E<HV8KRRh_C8ryD<Sp}vj(
zYUXeJl=}ajEcg#v_!n9DCr-EXZ)N_I*?uPS+rJL=ZT#0+`0vSr|90kY{GHlA*24e9
z!|nY0nLmbCO7Qa!8~-~k`o~%HFUI`^;K2AFVEuT!lM*u3zlZh9^YeI%{@-6>*S~}H
ze>z2baQ at zc`nLWXu#6vi|HJf$@yjs#<yLFOrablECoKG%$J+U~7U=pVI`Td<wO at X<
zt|IkMu;~Bl2)q8Z1-d>j<N6CeGu3}WwXP!juhgRdy9IXr1HdUXppFcF{c8&B{pWmH
z*IL7s$^P{G(%2(i|D9f~D at y+7TloJI7raE#|NEz9g}VMi9eJOA{yR|L=6`FLe-YOt
z{`CAfQ~pKOx}qsh^(W#-=|Yx?KNP~hpZW7L#4IxJ|5Da3?RTt2|J38`_8VmVv$4L(
z;J05n>f7wMh3&VS8%ph``zvFQbp1cs!v6vb|6hlU-?Fnb>t&KW(tiE}sBhyxuv%A?
z{ZH?IWXiv)TGyBMztF;e?ge)H4>130v>5Nx&;NSVxAAwrs;e0PC;Oj}1^+qKy1r>I
z)xQWo+?`&$Dq`nf#0UR<eg0L)`lbEKEc%}fv0oMIPiMcUL+rQkt3mtG^IKz&bpG3F
zvERiO{)-N_+ixxN$MT*M{QV!c{w-wv(tej%^dE|G4mdD>4zT`7DcXbMCxZGm|J(6Z
zT~UsoLMY1AeoHO<Ct3KvJI>C(0MA7z6zj<Q^xN;BsBhz6 at HJh<^fUQC-QQ)(e`U3<
zFYWJI_+K8vznb}TH-z$^62iaK!k_MsGUdO@!hf=be|`x6mCS#+$6P1;_CF$oe--m*
z8tVBC-G5}te~pFz6bt`fjR^6dQJQZ$|NjN*+x%xO^Oycl=g&;}|H#7sQVah-pJ?ab
z%KX#qAO3*)HvWSa|DpT)O!@a)_?KJwuQ<ide<ky;)7{{G`u(Q|^=<rjTl|Oae=_C&
zoQ3~(uu!^0d{oH$cMf0NU+*#32|xebkoWIq|C=r^*Kaz1Wy*g`wXT1pj at 0<6u-Lx<
zga8NT|7zyX$2gV}oc~WkeVhFUn7^Dq>HL)`|5vNEO3A;{!k^v=u<`H8kiYj%NyKZS
zN(@=})BZD4{yQxEr&{=L4O#zJGyj5AT7muROQ>(Nzw>onQQDvOPnq)HW8pu|!oPTu
z-TwW|zcda1b5Y;M{{#zvTK_ZUpIf69OaHmT!ap|N&fomuqSMmwABFlh{zVr4wEkqu
z|5ywEt1SGRFR}A48m)<^+dm{w-^Rb3`4 at 5fY5zm>N2dG>Ec|C%`2X`_JO66tPj`+o
z_~*|ZsBh!HkolK$W#UisN2dHo)#!@Sf38M-r3+am{ueHY00-yK^K|_t9eJOA{<~4%
z#=oEWOZ$`mX3D>m`J3|8{%i1Kc$@gikp0^L^B?6g*9kxWu=7V5>zDScvgn_6lHGpl
zj}<{>VH*8asBg314z{11|7JqJv4_FK_*Yu^Uuoe#WrUr75qDg=`Qu{LxAD*YhHkjw
zN&QFumnr|*HM+jE|8&$h{chq9kFfKv&XE6yP~XPCz{3Ap=+BgYT}_bx4AeKgO?+vA
zo&QSapYHzh64bZxFJ=B~G!3<Xx;_j3trq^&e#6_uOW+`Y1M`PDxGrS<Ci0KJMW}D%
zKgj&q42pks7W@}6f74&&f7jv1 at HX*k3{b!U{#|_V#`ZQP`1yy8zoo2S`Y(-Nqu0cd
z<L&xavwn^|4u1Xr3c0_`|E8`Y$6r148+#Z$jQ>gt|5+CPIcM7W4>Iu)9&?@W^WSrZ
zz5hpA_|MOR|0)ZAdj4<hZsPMp`age+=KHBM{Kuibk`+lUSjPP2_`4|!{%b7!|7zji
zg$HyH8}Q$i%pc23O7PqNQ>bs_KV-3ga~Ax6#QcrF(fIo>Ffn#F at z+0U=f9TuBReJd
z`G@)Mde$$;-`_0yCtPUPKfwCa-M^NgzRiBlw=``t|55wf at nh_fZv1Vw@PEU?|4N)w
zAU4qdMf~9Jeoe#s^xLlz^=<qYGJom63$oz<qJ{t8E&T5SX5&A={L}d_-Ot(h7cg;S
zXR`mTS at 0jU@Tc|D_^pYf7uosmVE)~n<#obu|Mx=tr<D1ZbA95!C=34EE&S(L_>V5K
z^Y6lQB?@KgGx_1?e-`T7?B8YKe_Iy(cU$=1VBx<2_q2cm{@>61`55Lkz|a3?)VJ~9
zZ1JDlv*3TwT&+^hKXD8HbFpsN`0rr;_j=5A!q0ys>f87aGXH+APyVwk3;y{Q{?!)#
zFJge%_^-sdjKbAA@;?3ipGSQg|JDDlt4ROv$%6mM7XCFB{`ca7!Nz|F^Z$XzTqpee
zSE9a+|B!`$Ul#mFTKLym_?JiR{MX`MhXSVel;Gz-3H5FKi at vR^6>)9ypU-B&zr at 17
z&cgrJko*5Z=Kt{&_QCtxj*$ERYUVHJ|IcN?zs$m)_Rq>z;AG;0T)X}AOEvLy?=PQ(
z`ZoJ7wD7+t3;vbNAL3P@{-d*`;ceoZN7?xoF at KmhCHVa(Z2vXWV!wG7{oRMz^;fa}
zbock4Mtz(8R$A<Le-`$uw(zI(hq0T9ug$abZ)N^fDb@@6?<~}}@n6IItGPb)->NM5
zH(L5XVc|deU_1YQ=D)A|?+%Oo=34YmLi+#**6%^q|6TBt!S8?AK3VOT`yE}YiaUzz
z_f`Cu at o)kQ4fEeF3;!D}{NI~s=f9GOA*4G$yo36-{-6DQU4NmD)cwcTv*3THg+HA?
zjo+I1|10hMU)-pI0KS(yuBEnKT2FeS=%OM~a82~?Xh9 at ryVrnK!j%Jmu;h|7si#Rc
zCY!1S>B(trDB}I#5B>pyPy~;b!hvEVJ)tPdA@~LA5B?(L2Za>+!y!k{nSC?4^<|c2
z at xj}9Z{EzC&(3?B?CzE3SGd4HJ&^w%@h?z(7JB}F0)L5rgZSAW^Z!`{{!aw{iopK>
zJRnH?S-3C9688LX5DX;#<f`)LeS-X#BJh7o{Jh`LzY3M}&hpwmnZHi__bQD3QO|$S
z{{39YA3ncxT$UdOynkAu_+js#E`Yx`SFDfdH8Mi}zKD>&HG#heGcz|!7w!{a+JXJM
zLHzIk7i-|}j}G`t{0qdNB!67L%Mtk31%3zonVTiv=SchuX%+Zv2>;Ch{uP1$Y6Sib
zfxj;BzW|@p68|5>Uk~A*1%E039f5x>0{`y<{~H4T)d2qNi~=u*@UI5&_XPegBk*?x
z{)WK+;9)uc9pZl_g#SM9m-27YgHMwDasPiEf&aF^kMF;D-?Ka)z;8aU0?&o;zZ<}x
z75Kl2!0-KMy%5{~{O$_;e+Td{5q~CxzZ<~6B=E0C;NM$Rf&Je<ErI_?O5Q&k#2 at zk
zvkW(IFzvwaA9l$<uAM)xJIkQ&uW{n{=l>SW#eOU&4$Axn-dIApqD23w^V|de-dwT1
ziPu=A{I8H2_y2bh@^1?H at 4{Tn&GOQW;*V+ZuMj`x)?f7gg(t}0Zv^cQ`DZn~e*QuE
zPYC?CVJ_xo`9lDI{HO}7^XmPe_(>r?zJKPpEO9=p!&?5vDL&gNnjL at cZ_&TsOR|FG
z1o<a4J at d0X0e`>Zw%;^O{(c>{b=#Ydd%C)YZ?OLj*q{Ew*E8#W0e?KlG%HA^FRRAd
zekF1L{tT7<C&=yw_zxf7cA&dNDo<=)SG`FBMt>|MR0obru>W*+96Utpmv}^(qTlyW
z_4sc#^7VSDVL4BkUP&t9Yb(w4<fNxdO-((mOcwT>vyaclhIWXCjOPaUFnV?vkE*0S
zkv8rE%lS<E&&={)@%B4g7wxUKz1hXVi!W}rcSAq3ydFP!y<9F^V-LOOd9i(Y?Awk}
zZrgXtW9#-E!}C(sU^ZW9)!dd at tmJE_%`>G&bJ3}psY6!Edd5toTHa_1k`86uJ3>y+
z&KW~*#fR?SyKDCfoDRM-!iwclJj~o3KOZ}?Z!`x6(8aN5Q4i%|Jn;BQHI6E&;Jcg=
z<JPt`HY}{-F)%#f(|atSdkm`nIf8TcprSorOCB5OK7uOQuY!KRw|k)hWkd<%N06xJ
zKCkcr8b>|Wt!-5 at KK-aqZ;~GOr+<&8H9hl=jQW0+*|3UEwOXpVhE- at b4c@2lQ_e;%
zS86nBPOjn<^DZo}VbvVBWF32H{>i4BFTQT*KdU`uqkaB_y=YjK%9&~|f9e!$jMEgZ
z^lGl$$X82-)pQ$fz5w#=DoTBx!+AGft1p&Q)ArHX)O2pZAeF2v7K#eb9vq$A8O=F`
z*KDIdA7$kKiF4rYyVUxV%~A6I>1$Mj*Ok?{cBn(?FKOz8-3y{G?hCdjH={>^c%Apa
z03zYFgoQr~@m>dY`Z;`zMo_d at QT_d?M%BF{hT=G^Xa58%ND@~6GYcN&`fb1ISN at K#
z&Hk)vK?8IH-z6O~c$p1J^7*s64h<=PCaoWz7v^slRQBim6+kC_mt at JGgn|B8bp8)O
U$2Oh;9VW=|H|k>-*GP~5FGLtLcmMzZ
>From 0adb7b7009b7d42e77efcdfb6e5af1f8d370485d Mon Sep 17 00:00:00 2001
From: Sukumarsawant <sawantsukumar at gmail.com>
Date: Sat, 14 Mar 2026 20:00:30 +0530
Subject: [PATCH 3/7] nit
---
libc/src/math/generic/atanpif16.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/libc/src/math/generic/atanpif16.cpp b/libc/src/math/generic/atanpif16.cpp
index 6682b886fa7a7..c60c0c2d75317 100644
--- a/libc/src/math/generic/atanpif16.cpp
+++ b/libc/src/math/generic/atanpif16.cpp
@@ -124,7 +124,8 @@ LLVM_LIBC_FUNCTION(float16, atanpif16, (float16 x)) {
// clear the underflow raised by casting
fputil::clear_except_if_required(FE_UNDERFLOW);
int rounding = fputil::quick_get_round();
- // values checked through exhaustive testing which rounded up/down and caused spurious or missing underflow
+ // values checked through exhaustive testing which rounded up/down and
+ // caused spurious or missing underflow
bool except_value = (rounding == FE_UPWARD && xbits.uintval() == 0x0a48) ||
(rounding == FE_DOWNWARD && xbits.uintval() == 0x8a48);
>From 8ee18a6bbef2be6b554eb9cb6c3e44b91e6db71f Mon Sep 17 00:00:00 2001
From: Sukumarsawant <sawantsukumar at gmail.com>
Date: Sun, 15 Mar 2026 16:04:15 +0530
Subject: [PATCH 4/7] chore: removed clear_Except and reflecting requested
changes
chore: removed clear_Except and reflecting requested changes
subnormal
nit
---
libc/src/math/generic/atanpif16.cpp | 40 +++++++++++++++--------------
1 file changed, 21 insertions(+), 19 deletions(-)
diff --git a/libc/src/math/generic/atanpif16.cpp b/libc/src/math/generic/atanpif16.cpp
index c60c0c2d75317..dfc58d3d8521c 100644
--- a/libc/src/math/generic/atanpif16.cpp
+++ b/libc/src/math/generic/atanpif16.cpp
@@ -89,14 +89,8 @@ LLVM_LIBC_FUNCTION(float16, atanpif16, (float16 x)) {
return signed_result(0.5);
}
- if (LIBC_UNLIKELY(xbits.is_zero()))
- return x;
-
double x_abs = fputil::cast<double>(xbits.abs().get_val());
- if (LIBC_UNLIKELY(x_abs == 1.0))
- return signed_result(0.25);
-
// evaluate atan(x)/pi using polynomial approximation, valid for |x| <= 0.5
constexpr auto atanpi_eval = [](double x) -> double {
// polynomial coefficients for atan(x)/pi taylor series
@@ -117,28 +111,36 @@ LLVM_LIBC_FUNCTION(float16, atanpif16, (float16 x)) {
POLY_COEFFS[5], POLY_COEFFS[6], POLY_COEFFS[7]);
};
- // Case 1: |x| <= 0.5 - Direct polynomial evaluation
+ // Case 1: |x| <= 0.5 - Direct polynomial evaluation
if (LIBC_LIKELY(x_abs <= 0.5)) {
+
+ if (LIBC_UNLIKELY(xbits.is_zero()))
+ return x;
+
+ if (LIBC_UNLIKELY(xbits.uintval() == 0x0a48 || xbits.uintval() == 0x8a48)) {
+ int rounding = fputil::quick_get_round();
+
+ if (rounding == FE_UPWARD && xbits.uintval() == 0x0a48)
+ return fputil::FPBits<float16>(uint16_t(0x0400)).get_val();
+
+ if (rounding == FE_DOWNWARD && xbits.uintval() == 0x8a48)
+ return fputil::FPBits<float16>(uint16_t(0x8400)).get_val();
+ }
+
double result = atanpi_eval(x_abs);
float16 s_result = signed_result(result);
- // clear the underflow raised by casting
- fputil::clear_except_if_required(FE_UNDERFLOW);
- int rounding = fputil::quick_get_round();
- // values checked through exhaustive testing which rounded up/down and
- // caused spurious or missing underflow
- bool except_value = (rounding == FE_UPWARD && xbits.uintval() == 0x0a48) ||
- (rounding == FE_DOWNWARD && xbits.uintval() == 0x8a48);
-
- if (result != 0.0 && result < 0x1p-14 && !except_value)
+ if (FPBits(s_result).is_subnormal())
fputil::raise_except_if_required(FE_UNDERFLOW);
-
return s_result;
}
- // case 2: 0.5 < |x| <= 1 - use double-angle reduction
+ if (LIBC_UNLIKELY(x_abs == 1.0))
+ return signed_result(0.25);
+
+ // case 2: 0.5 < |x| < 1 - use double-angle reduction
// atan(x) = 2 * atan(x / (1 + sqrt(1 + x^2)))
// so atanpi(x) = 2 * atanpi(x') where x' = x / (1 + sqrt(1 + x^2))
- if (x_abs <= 1.0) {
+ if (x_abs < 1.0) {
double x_abs_sq = x_abs * x_abs;
double sqrt_term = fputil::sqrt<double>(1.0 + x_abs_sq);
double x_prime = x_abs / (1.0 + sqrt_term);
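For reference, the reduction quoted in the hunk above follows from the tangent half-angle identity; a brief derivation (not part of the patch itself):

$$
\theta=\arctan x\ \Rightarrow\ \sin\theta=\frac{x}{\sqrt{1+x^{2}}},\quad \cos\theta=\frac{1}{\sqrt{1+x^{2}}},\quad
\tan\frac{\theta}{2}=\frac{\sin\theta}{1+\cos\theta}=\frac{x}{1+\sqrt{1+x^{2}}}
$$
$$
\Rightarrow\ \arctan x=2\arctan\frac{x}{1+\sqrt{1+x^{2}}}
\quad\text{and hence}\quad
\operatorname{atanpi}(x)=2\,\operatorname{atanpi}\!\Big(\frac{x}{1+\sqrt{1+x^{2}}}\Big).
$$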
>From 3598ee762cd09a55d8a84410b275a1b9719a0749 Mon Sep 17 00:00:00 2001
From: Sukumarsawant <sawantsukumar at gmail.com>
Date: Sun, 15 Mar 2026 16:09:57 +0530
Subject: [PATCH 5/7] nit
---
libc/src/math/generic/atanpif16.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libc/src/math/generic/atanpif16.cpp b/libc/src/math/generic/atanpif16.cpp
index dfc58d3d8521c..6d254df064004 100644
--- a/libc/src/math/generic/atanpif16.cpp
+++ b/libc/src/math/generic/atanpif16.cpp
@@ -111,7 +111,7 @@ LLVM_LIBC_FUNCTION(float16, atanpif16, (float16 x)) {
POLY_COEFFS[5], POLY_COEFFS[6], POLY_COEFFS[7]);
};
- // Case 1: |x| <= 0.5 - Direct polynomial evaluation
+ // Case 1: |x| <= 0.5 - Direct polynomial evaluation
if (LIBC_LIKELY(x_abs <= 0.5)) {
if (LIBC_UNLIKELY(xbits.is_zero()))
>From 0897d467235eed88ccfd1dbf54d46ce2548bedd6 Mon Sep 17 00:00:00 2001
From: Sukumarsawant <sawantsukumar at gmail.com>
Date: Mon, 16 Mar 2026 17:06:39 +0530
Subject: [PATCH 6/7] temporary
---
libc/src/math/generic/atanpif16.cpp | 20 ++++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/libc/src/math/generic/atanpif16.cpp b/libc/src/math/generic/atanpif16.cpp
index 6d254df064004..52cff4606608c 100644
--- a/libc/src/math/generic/atanpif16.cpp
+++ b/libc/src/math/generic/atanpif16.cpp
@@ -117,20 +117,20 @@ LLVM_LIBC_FUNCTION(float16, atanpif16, (float16 x)) {
if (LIBC_UNLIKELY(xbits.is_zero()))
return x;
- if (LIBC_UNLIKELY(xbits.uintval() == 0x0a48 || xbits.uintval() == 0x8a48)) {
+ if (LIBC_UNLIKELY(xbits.abs().uintval() == 0x0a48)) {
int rounding = fputil::quick_get_round();
-
- if (rounding == FE_UPWARD && xbits.uintval() == 0x0a48)
- return fputil::FPBits<float16>(uint16_t(0x0400)).get_val();
-
- if (rounding == FE_DOWNWARD && xbits.uintval() == 0x8a48)
- return fputil::FPBits<float16>(uint16_t(0x8400)).get_val();
+ if (!is_neg) {
+ if (rounding == FE_UPWARD)
+ return fputil::cast<float16>(0x1p-14f);
+ return fputil::cast<float16>(0x1.ffd7ap-15f);
+ } else {
+ if (rounding == FE_DOWNWARD)
+ return fputil::cast<float16>(-0x1p-14f);
+ return fputil::cast<float16>(-0x1.ffd7ap-15f);
+ }
}
-
double result = atanpi_eval(x_abs);
float16 s_result = signed_result(result);
- if (FPBits(s_result).is_subnormal())
- fputil::raise_except_if_required(FE_UNDERFLOW);
return s_result;
}
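The special case for input bits 0x0a48 above can be sanity-checked numerically: decoding 0x0a48 as a float16 and evaluating atan(x)/pi lands just below 0x1p-14, the smallest normal float16, which is why the rounded result straddles the normal/subnormal boundary depending on the rounding mode. A minimal standalone sketch (plain C++, not the libc implementation):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  // float16 bit pattern 0x0a48: sign 0, exponent field 2, mantissa 0x248.
  double x = std::ldexp(1.0 + 0x248 / 1024.0, 2 - 15);
  const double pi = 3.141592653589793;
  double r = std::atan(x) / pi;
  // Expect r to print as roughly 0x1.ffd7...p-15, i.e. just below 0x1p-14.
  std::printf("x = %a\natanpi(x) = %a\n0x1p-14   = %a\n", x, r, 0x1p-14);
  return 0;
}
```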
>From 94090ce38e001df95201e381dcc710452c07e4a8 Mon Sep 17 00:00:00 2001
From: Sukumarsawant <sawantsukumar at gmail.com>
Date: Tue, 17 Mar 2026 21:52:02 +0530
Subject: [PATCH 7/7] Added tests and resolved conflicts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Added tests and resolved conflicts
added a smoke test
[clang-format] Fix incorrect trailing comment and escaped newlines when AlignArrayOfStructures is enabled (#180305)
This change fixes how the spaces are modified during alignment.
Previously it was inconsistent whether the `StartOfTokenColumn` and
`PreviousEndOfTokenColumn` members of `WhitespaceManager::Change`s were
also updated when their `Spaces` member was changed to align tokens.
A new function has been added that properly maintains the relationship
between these members, and all places that directly modified `Spaces`
have been replaced with calls to this new function.
Fixes https://github.com/llvm/llvm-project/issues/138151. Fixes
https://github.com/llvm/llvm-project/issues/85937. Fixes
https://github.com/llvm/llvm-project/issues/53442. Tests have been added
to ensure they stay fixed.
Attribution Note - I have been authorized to contribute this change on
behalf of my company: ArenaNet LLC
libclc: Disable contract in trig reductions (#186432)
libclc: Remove attempt at subnormal flush from trig functions (#186424)
[clang-format] Ignore imports in comments for Java import sorting (#177326)
Java source files can contain apparent import statements inside block
comments (e.g., showing a code example). These can get mixed up with
real import statements when run through clang-format.
This patch tracks block comments (/* ... */) so that we skip lines that
are inside them.
Fixes #176771
---------
Co-authored-by: Natalia Kokoromyti <knatalia at yost-cm-01-imme.stanford.edu>
Co-authored-by: owenca <owenpiano at gmail.com>
[lldb/test] Fix MTC dylib path for newer Darwin embedded devices (NFC)
Signed-off-by: Med Ismail Bennani <ismail at bennani.ma>
[clang-tidy] Fix virtual inheritance FP in misc-multiple-inheritance (#186103)
Avoid double-counting concrete bases introduced through virtual
inheritance in `misc-multiple-inheritance`.
AI-usage note: Gemini 3 was used for pre-commit review.
Closes https://github.com/llvm/llvm-project/issues/186059
[SPIRV][NFC] Drop uses of BranchInst (#186514)
Also simplify the code to use successors().
[lldb][NativePDB] Require `target-windows` for MSVC test (#186578)
Fixes the failure on the lldb-remote-linux-win buildbot
(https://github.com/llvm/llvm-project/pull/186124#issuecomment-4060098881).
The test runs MSVC to produce an executable that only runs on Windows.
[lldb] Fix heap.py crashes on recent Darwin embedded targets
Two fixes for the ptr_refs/cstr_refs/find_variable heap commands:
1. Move the `task` variable declaration into the common expression
preamble. Previously it was only declared inside the `search_heap`
code path, causing compilation errors when using `--ignore-heap`
with stack or segment scanning.
2. On recent iOS, some shared cache __DATA_CONST pages are remapped to
non-accessible at runtime, even though the Mach-O section metadata
still marks them as readable. The segment scan would crash with
EXC_BAD_ACCESS when reading these pages. Fix by querying actual
VM region permissions via SBProcess.GetMemoryRegionInfo() and
splitting sections at region boundaries to only scan readable
portions.
rdar://172543652
Signed-off-by: Med Ismail Bennani <ismail at bennani.ma>
[Transforms][NFC] Drop uses of BranchInst in headers (#186580)
Replace BranchInst with CondBrInst/UncondBrInst/Instruction in headers
and handle the related fall out.
The removed code in simplifyUncondBranch was made dead in
0895b836d74ed333468ddece2102140494eb33b6, where FoldBranchToCommonDest
was changed to only handle conditional branches.
[Transforms/Utils][NFC] Drop uses of BranchInst (#186586)
[x86][GlobalISel] Select MOV32ri64 for unsigned 32-bit i64 constants (#185182)
x86 GlobalISel currently selects `MOV64ri32` for signed 32-bit `i64`
constants and falls back to `MOV64ri` otherwise.
That misses the unsigned 32-bit case, where `MOV32ri64` is a better
match.
FastISel already handles this case by using `MOV32ri64` for
zero-extended
32-bit values.
Update `X86InstructionSelector::selectConstant()` to select `MOV32ri64`
for `i64` constants that fit in `uint32_t`, while keeping `MOV64ri32`
for signed 32-bit values and `MOV64ri` for larger constants.
This reduces the encoding size for these constants and fixes the
`0xffffffff` boundary case to use the correct zero-extending move.
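A rough sketch of the described selection order (illustrative only; the real code lives in X86InstructionSelector::selectConstant, and the enum below stands in for the X86:: opcode enumerators):

```cpp
#include <cstdint>

enum class MovOpc { MOV32ri64, MOV64ri32, MOV64ri };

// Prefer the zero-extending 32-bit form when it applies, since it has the
// smallest encoding; fall back to the sign-extending form, then the full
// 64-bit immediate.
MovOpc chooseMovOpcodeForI64(uint64_t Val) {
  if (Val <= UINT32_MAX)
    return MovOpc::MOV32ri64; // unsigned 32-bit: zero-extends into the i64
  if (static_cast<int64_t>(Val) >= INT32_MIN &&
      static_cast<int64_t>(Val) <= INT32_MAX)
    return MovOpc::MOV64ri32; // signed 32-bit: sign-extends
  return MovOpc::MOV64ri;     // needs the full 64-bit immediate
}
```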
[X86] apply mulx optimization for two-wide mul instruction (mull, mulq) (#185127)
References: https://github.com/llvm/llvm-project/pull/184462
In the discussion for the linked PR, which removes unnecessary register
to register moves when one operand is in %rdx for mulx, the point was
brought up that this pattern also happens for mull and mulq.
The IR below:
```llvm
declare i32 @foo32()
declare i64 @foo64()
define i32 @mul32_no_implicit_copy(i32 %a0) {
%a1 = call i32 @foo32()
%a2 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a0, i32 %a1)
%a3 = extractvalue { i32, i1 } %a2, 0
ret i32 %a3
}
define i64 @mul64_no_implicit_copy(i64 %a0) {
%a1 = call i64 @foo64()
%a2 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %a0, i64 %a1)
%a3 = extractvalue { i64, i1 } %a2, 0
ret i64 %a3
}
```
Generates this code on current HEAD:
```asm
mul32_no_implicit_copy: # @mul32_no_implicit_copy
push rbx
mov ebx, edi
call foo32 at PLT
mov ecx, eax
mov eax, ebx
mul ecx
pop rbx
ret
mul64_no_implicit_copy: # @mul64_no_implicit_copy
push rbx
mov rbx, rdi
call foo64 at PLT
mov rcx, rax
mov rax, rbx
mul rcx
pop rbx
ret
```
Where the register shuffling before the mul is the same pattern as for
mulx in the previous PR.
With this branch it generates this code now:
```asm
mul32_no_implicit_copy:
pushq %rbx
movl %edi, %ebx
callq foo32 at PLT
mull %ebx
popq %rbx
retq
mul64_no_implicit_copy:
pushq %rbx
movq %rdi, %rbx
callq foo64 at PLT
mulq %rbx
popq %rbx
retq
```
[StructurizeCFG] Fix incorrect zero-cost hoisting in nested control flow (#183792)
hoistZeroCostElseBlockPhiValues() hoists zero-cost instructions from
else blocks to their common dominator with the then block. When the
merge point has additional predecessors beyond the simple if-else
pattern, the hoisted instruction ends up in a dominator that feeds
a Flow phi on every edge, including edges where the else block was
never taken. simplifyHoistedPhis() then replaces poison entries in
those Flow phis with the hoisted value, causing it to leak into
unrelated paths.
This manifests as miscompilation in sorting kernels compiled with
code coverage: the PGO counter blocks create deeply nested CFGs
where the hoisted shufflevector (used for swapping sort keys)
reaches the no-swap path, corrupting sort results.
Fix by requiring a simple if-else CFG shape before hoisting: ThenBB
must branch directly to ElseSucc and ElseSucc must have exactly 2
predecessors. This matches the structure that simplifyHoistedPhis
assumes.
[RISCV] Add more extensions to spacemit-x100 (#186351)
[RISCV][NFC] Move extension test for spacemit-x60 to a separate file (#186357)
[CIR] Add Commutative/Idempotent traits to binary ops (#185163)
Add missing MLIR traits to CIR binary operations:
- AndOp, OrOp: Commutative, Idempotent
- AddOp, MulOp, XorOp, MaxOp: Commutative
Add these ops to the CIRCanonicalize pass op list so trait-based
folding is exercised by applyOpPatternsGreedily.
[clang-tidy] Fix false positive in `readability-else-after-return` on `return` jumped over by `goto` (#186370)
Given this code:
```cpp
if (...) {
goto skip_over_return;
return;
skip_over_return:
foo();
} else {
...
}
```
...the check suggests removing the `else`, which is not a valid
transformation. This is because it looks at *all* the substatements of
the then-branch for interrupting statements. This PR changes it to only
look at the *final* substatement.
Technically, this introduces a false negative on code like this:
```cpp
if (...) {
return;
dead_code();
} else { // <-- Could in theory remove this 'else'
...
}
```
But, that code is objectively bad, so I don't think we're losing
anything.
This change has the side effect of making the check a bit more general;
it now recognizes attributed interrupting statements (e.g.
`[[clang::musttail]] return f();`).
[Transforms/Scalar][NFC] Drop uses of BranchInst (#186592)
I ended up relaxing some of the checks that LoopInterchange made, the
assumptions that certain instructions were branches seemed to not be
used at all.
[LV] Move predication, early exit & region handling to VPlan0 (NFCI) (#185305)
Move handleEarlyExits, predication and region creation to operate
directly on VPlan0. This means they only have to run once, reducing
compile time a bit; the relative order remains unchanged.
Introducing the regions at this point in particular unlocks performing
more transforms once, on the initial VPlan, instead of running them for
each VF.
Whether a scalar epilogue is required is still determined by legacy cost
model, so we need to still account for that in the VF specific VPlan
logic.
PR: https://github.com/llvm/llvm-project/pull/185305
[IPO][InstCombine][Vectorize][NFCI] Drop uses of BranchInst (#186596)
Refactor remaining parts of Transforms apart from Scalar and Utils.
[IR][NFC] Remove BranchInst successor functions (#186604)
The efficient access is now handled by UncondBrInst/CondBrInst,
Instruction functions handle the more generic cases. These functions are
largely unused now that most uses of BranchInst are gone.
Preliminary work for making the CondBrInst operand order consistent.
[WebAssembly][NFC] Rename and test FastISel selectBr (#186577)
selectBr only handles conditional branches and also wasn't tested.
Clarify the name and add test that enforces that there's no fallback.
[X86] Reject 'p' constraint without 'a' modifier in inline asm (#185799)
The 'p' constraint produces an address operand that should only be
printed with the 'a' modifier (e.g., %a0). Without it, GCC and Clang
produce different and arguably incorrect output
https://github.com/llvm/llvm-project/issues/185343#issuecomment-4029670370
Reject the combination to catch misuse early.
[llvm-mc] Default output assembly variant to AssemblerDialect (#186317)
Previously, llvm-mc always defaulted to output assembly variant 0
regardless of the target's AssemblerDialect. This was inconsistent:
llvm-mc -x86-asm-syntax=intel changed the input parser to Intel syntax
but output stayed AT&T, unlike clang's -masm=intel which affects both.
When --output-asm-variant is not explicitly specified, fall back to
MAI->getAssemblerDialect() instead of hardcoding variant 0. This
makes the output match the target's configured dialect:
- X86: -x86-asm-syntax=intel now produces Intel output
- AArch64: Apple triples default to Apple syntax output
- SystemZ: z/OS triples default to HLASM syntax output
Tests that relied on a specific output variant now use explicit
--output-asm-variant=0.
[lldb] Rename Status variables to avoid confusion (NFC) (#186486)
Rename Status variables that are named `error` to `status` to avoid
confusion with llvm::Error as the latter becomes more and more
prevalent.
[lldb] Skip tests that are incompatible with MTE (#186043)
Skip tests that are incompatible with MTE.
Depends on:
- https://github.com/llvm/llvm-project/pull/185780
[IR] Add Instruction::successors() (#186606)
Nowadays all terminators store all successor operands consecutively, so
we can expose the range of successors through a unified interface.
Rename succ_op_iterator to succ_iterator for consistency, also with
Machine IR.
Preliminary work for replacing the succ_iterator in CFG.h with an
iterator that iterates directly over the uses.
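A minimal usage sketch of the new interface (assuming the range yields BasicBlock pointers, which is the natural reading of the description above):

```cpp
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static unsigned countSuccessors(Instruction &Term) {
  unsigned N = 0;
  for (BasicBlock *Succ : Term.successors()) { // unified successor range
    (void)Succ;
    ++N;
  }
  return N;
}
```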
[msan][NFCI] Replace unnecessary shadow cast with assertion (#186498)
Fabian Wolff pointed out that #176031 made the output of CreateIntCast()
unused in handleBitwiseAnd().
Upon closer inspection, the CreateIntCast()s are unnecessary, because the
arguments to handleBitwiseAnd() (and visitOr()) are integers or vectors of
integers, for which the shadow types are the same as the original types.
This patch removes the unnecessary if and shadow cast, and adds
assertions.
[CIR] Add cir.min op and refactor cir.max lowering (#185276)
Add cir.min operation for integer minimum computation. Refactor cir.max
lowering into a shared lowerMinMaxOp template reused by both ops.
[IR] Make BranchInst operand order consistent (#186609)
Ensure that successors are always reported in the same order in which
they are stored in the operand list.
Improved ISD::SRL handling in isKnownToBeAPowerOfTwo (#182562)
Fixes #181651
Added a DemandedElts argument to isConstOrConstSplat and to
isKnownToBeAPowerOfTwo calls, and OrZero || isKnownNeverZero(Val, Depth) is
now checked before calling isKnownToBeAPowerOfTwo. Also added unit tests.
[X86] lowerV4F32Shuffle - prefer INSERTPS over SHUFPS when zeroing upper/lower v2f32 (#186612)
Followup to #186468 - use INSERTPS over SHUFPS if the implicit zeroing doesn't cross the 64-bit halves
[LLVM] Change IRBuilder::CreateAggregateRet to accept an ArrayRef (#186605)
Change `IRBuilder::CreateAggregateRet()` to accept an `ArrayRef` instead
of a pointer and size, and extend IRBuilder unit test to exercise it.
[PhaseOrdering][X86] Add average round tests based off #128424 (#186615)
[CIR] Remove cir.unary(plus, ...) and emit nothing for unary plus (#185278)
Traditional codegen never emits any operation for unary plus — it just
visits the subexpression as a pure identity at the codegen level. Align
CIRGen with this behavior by removing Plus from UnaryOpKind entirely
and having VisitUnaryPlus directly visit the subexpression with the
appropriate promotion/demotion handling.
[VPlan] Add hasPredecessors and hasSuccessors to VPBlockBase (NFC).
Add/move helpers to VPBlockBase, and use in a few more places.
Split off from https://github.com/llvm/llvm-project/pull/156262 as
suggested.
[clang-format] Fix a crash on fuzzer-generated invalid C++ code (#186566)
Fixes #185421
[VPlan] Consolidate VPRegionBlock constructors (NFC).
Unify VPRegionBlock constructors into a single one, in preparation for
https://github.com/llvm/llvm-project/pull/156262. Split off as
suggested.
[X86] isSplatValueForTargetNode - test source value for vector uniform shift ops (#186619)
For old SSE style vector shifts, we just need to check the shifted value is a splat as the shift amount is uniform
Avoids an unnecessary variable shuffle in i512 ashr expansion
[IR] Implement successors as Use iterators (#186616)
This is possible since now all successor operands are stored
consecutively.
There is just one out-of-line function call instead of one call to
getSuccessor() per operand.
[VPlan] Remove special handling for canonical increment (NFC).
The canonical IV increment should be proven as uniform-across-VF-and-UF
by the existing logic. Remove explicit handling, in preparation for
https://github.com/llvm/llvm-project/pull/156262. Split off as
suggested.
[VPlan] Create zero resume value for CanIV directly (NFC).
The start value of the canonical IV is always 0. Assert and generate
zero VPValue manually in preparation for
https://github.com/llvm/llvm-project/pull/156262. Split off as
suggested.
[Docs] typo settting -> setting (#178665)
[libc++][Android] Update Compiler for Android CI (#186531)
Upgrade Android compiler from r563880 to r584948b because libc++ does
not support LLVM 20 anymore
[clang][Driver][Darwin] Optionally use xcselect to find macOS SDK (#119670)
This is a scaled down version of https://reviews.llvm.org/D136315.
The intent is largely the same as before[^1], but I've scaled down the
scope to try to avoid the issues that the previous patch caused:
- the changes are now opt-in based on enabling `CLANG_USE_XCSELECT`
- this only works when targeting macOS on a macOS host (this is the only
case supported by `libxcselect`[^2])
- calling `libxcselect` is done only when the target is `*-apple-macos*`
to avoid breaking many tests
Another reason to leave this as opt-in for now is that there are some
bugs in libxcselect that need fixing before it is safe to use by default
for all users. This has been reported to Apple as FB16081077.
[^1]: See also https://reviews.llvm.org/D109460 and #45225.
[^2]: https://developer.apple.com/documentation/xcselect?language=objc
[clang-tidy] Add redundant qualified alias check (#180404)
Introduce `readability-redundant-qualified-alias` to flag identity type
aliases that repeat a qualified name and suggest using-declarations when
safe. The check is conservative: it skips macros, elaborated keywords,
dependent types, and templates. `OnlyNamespaceScope` controls whether
local/class scopes are included (default `false`).
Depends on: #183940 #183941
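An illustrative (hypothetical) instance of what the check targets:

```cpp
namespace detail {
struct Widget {};
} // namespace detail

// Identity alias that just repeats the qualified name -- flagged:
using Widget = detail::Widget;

// Suggested fix when safe: a using-declaration instead.
// using detail::Widget;
```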
[CIR] Split CIR_UnaryOp into individual operations (#185280)
Split the monolithic cir.unary operation (which dispatched on a
UnaryOpKind enum) into four separate operations: cir.inc, cir.dec,
cir.minus, and cir.not.
Changes:
- Add CIR_UnaryOpInterface with getInput()/getResult() methods
- Add CIR_UnaryOp and CIR_UnaryOpWithOverflowFlag base classes
- Define IncOp, DecOp, MinusOp, NotOp with per-op folds
- Add Involution trait to NotOp for not(not(x)) -> x folding
- Replace createUnaryOp() with createInc/Dec/Minus/Not builders
- Split LLVM lowering into four separate patterns
- Split LoweringPrepare complex-type handling per unary op
- Update CIRCanonicalize and CIRSimplify for new op types
- Update all codegen files to use bool params instead of UnaryOpKind
- Remove CIR_UnaryOpKind enum and old CIR_UnaryOp definition
Assembly format change:
cir.unary(inc, %x) nsw : !s32i, !s32i -> cir.inc nsw %x : !s32i
cir.unary(not, %x) : !u32i, !u32i -> cir.not %x : !u32i
[AggressiveInstCombine] Recognize table based log2 and replace with ctlz+sub. (#185160)
Recognize table based log2 implementations like
```
unsigned log2(unsigned v) {
static const unsigned char table[] = {
0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
};
v |= v >> 1;
v |= v >> 2;
v |= v >> 4;
v |= v >> 8;
v |= v >> 16;
return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
}
```
and replaces with 31 - llvm.ctlz(v).
Similar for i64 log2. Other sizes can be supported with correct multiply
constant and table values, but I have not found examples yet.
This code is based on the existing tryToRecognizeTableBasedCttz. Like
that function, we support
any combination of multiply constant and table values that produce the
correct result.
It handles the same pattern as #177110, but does not match the outer
subtract from that patch. It is assumed that InstCombine or other
optimizations can combine (sub 31 (sub 31, cttz V)) later.
I have limited this to targets that have a fast ctlz. The backend does
not yet have a table based lowering for ctlz so this reduces the chance
of regressions.
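For intuition, the equivalence the transform relies on can be stated directly in C++ (a plain sketch, not the AggressiveInstCombine code): after the bit-smearing shifts, the table lookup computes floor(log2(v)), which for a 32-bit v != 0 is exactly 31 - ctlz(v).

```cpp
#include <bit>

// Precondition: v != 0, matching the domain of the table-based version.
unsigned log2_via_ctlz(unsigned v) {
  return 31u - static_cast<unsigned>(std::countl_zero(v));
}
```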
[MLIR][Python] Refine the behavior of Python-defined dialect reloading (#186128)
This includes several changes:
- `Dialect.load(reload=False)` will fail if the dialect was already
loaded in a different context, to prevent the program from aborting later.
- `Dialect.load(reload=True)` implies `replace=True` in
dialect/operation registering.
- `PyGlobals::registerDialectImpl` now has a parameter `replace`.
- `register_dialect` and `register_operation` are no longer exposed in
`mlir.dialects.ext`.
This should solve the registering problem found in writing transform
test cases by @rolfmorel.
[libc++][test] Use loop with compare_exchange_weak calls (#185953)
On AIX, this test sometimes fails with error `Assertion failed: y ==
true`. The test assumes `compare_exchange_weak` should succeed on a
single call; however, according to the standard:
> A weak compare-and-exchange operation may fail spuriously. That is,
even when the contents of memory referred to by expected and ptr are
equal, it may return false and store back to expected the same memory
contents that were originally there.
This spurious failure enables implementation of compare-and-exchange on
a broader class of machines, e.g., load-locked store-conditional
machines. A consequence of spurious failure is that nearly all uses of
weak compare-and-exchange will be in a loop.
[atomics.ref.ops]/27
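A minimal sketch of the loop form described above (not the actual libc++ test): retry on spurious failure rather than asserting success of a single call.

```cpp
#include <atomic>
#include <cassert>

int main() {
  std::atomic<int> a{1};
  int expected = 1;
  // compare_exchange_weak may fail spuriously even when a == expected, so
  // wrap it in a loop; expected is rewritten on failure (still 1 here,
  // since no other thread touches a).
  while (!a.compare_exchange_weak(expected, 2)) {
  }
  assert(a.load() == 2);
  return 0;
}
```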
[orc-rt] Rename "ResourceManager" to "Service". NFCI. (#186639)
The name "Service" better reflects the general purpose of this class: It
provides *something* (often resource management) to the Session, is
owned by the Session, and receives notifications from the Session when
the controller detaches / is detached, and when the Session is shut
down.
An example of a non-resource-managing Service (to be added in an
upcoming patch) is a detach / shutdown notification service: Clients can
add this service to register arbitrary callbacks to be run on detach /
shutdown. The advantage of this over the current Session detach /
shutdown callback system is that clients can control both the order of
the callbacks, and their order relative to notification of other
services.
[orc-rt] Return ref from Session::addService, add createService. (#186640)
Session::addService now returns a reference to the added Service. This
allows clients to hold a reference for further direct interaction with
the Service object.
This commit also introduces a new Session::createService convenience
method that creates the service and returns a reference to it.
[mlir] Fix op comparisons in extensible dialects (#186637)
The extensible dialect system defined `compareProperties` as false
because it doesn't use properties. However, this should have been
`true`, as the empty properties are trivially always equal to
themselves. Doing otherwise means that no operations in extensible
dialects that aren't the exact same operation will ever compare equal
for the purposes of operations like CSE.
[clang-format] Upgrade ShortFunctionStyle to a struct (#134337)
The current clang-format configuration
option AllowShortFunctionsOnASingleLine uses a single enum
(ShortFunctionStyle) to control when short function definitions can be
merged onto a single line. This enum provides predefined combinations of
conditions
(e.g., None, Empty only, Inline only, Inline including Empty, All).
This approach has limitations:
1. **Lack of Granularity:** Users cannot specify arbitrary combinations
of conditions. For example, a user might want to allow merging
for both empty functions and short top-level functions, but not for
short functions defined within classes. This is not possible with the
current enum options except by choosing All, which might merge more than
desired.
2. **Inflexibility:** Adding new conditions for merging (e.g.,
distinguishing between member functions and constructors, handling
lambdas specifically) would require adding many new combined enum
values, leading to a combinatorial explosion and making the
configuration complex.
3. **Implicit Behavior:** Some options imply others
(e.g., Inline implies Empty), which might not always be intuitive or
desired.
The goal is to replace this single-choice enum with a more flexible
mechanism allowing users to specify a set of conditions that must be met
for a short function to be merged onto a single line.
---------
Co-authored-by: owenca <owenpiano at gmail.com>
[clang][bytecode] Remove unused members from `EvalEmitter` (#186601)
Remove the DenseMap handling lambda parameter mappings from
`EvalEmitter`; it was always unused. Use `if constexpr`
to keep things compiling.
[CMake] Disable PCH reuse for plugins in non-PIC builds (#186643)
Plugins are always PIC and therefore cannot reuse non-PIC PCH.
[Analysis][NFC] Move BranchProbabilityInfo constr to cpp (#186648)
The implementation details of the analysis are irrelevant for users,
therefore move these to the .cpp file.
[clang-format] Add option AllowShortRecordOnASingleLine (#154580)
This patch supersedes PR #151970 by adding the option
``AllowShortRecordOnASingleLine`` that allows the following formatting:
```c++
struct foo {};
struct bar { int i; };
struct baz
{
int i;
int j;
int k;
};
```
---------
Co-authored-by: owenca <owenpiano at gmail.com>
[clang][ssaf][NFC] Prefix ssaf-{linker,format} dirs with 'clang-' (#186610)
Addresses:
https://github.com/llvm/llvm-project/pull/185631#issuecomment-4054586633
[X86] Add missing VPSRAQ broadcast-from-mem patterns for non-VLX targets (#186654)
[clang-tidy] Adds do-while support to performance-inefficient-string-concatenation (#186607)
Closes #186362
---------
Co-authored-by: Victor Chernyakin <chernyakin.victor.j at outlook.com>
Co-authored-by: EugeneZelenko <eugene.zelenko at gmail.com>
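A hypothetical snippet of the kind now diagnosed inside do-while loops (illustrative only):

```cpp
#include <string>

void build(std::string &s, const std::string &piece, int n) {
  int i = 0;
  do {
    s = s + piece; // flagged: builds a temporary each iteration; prefer s += piece
    ++i;
  } while (i < n);
}
```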
[X86] known-never-zero.ll - add vector test coverage for #186335 (#186660)
Support float8_e3m4 and float8_e4m3 in np_to_memref (#186453)
This patch adds support for `float8_e3m4` and `float8_e4m3` in
`np_to_memref.py` by adding the appropriate ctypes structures
[Transforms/Utils][NFC] Replace SmallPtrSet with vector (#186664)
Typically most blocks in a function are reachable, so use a vector
indexed by block number instead of a SmallPtrSet.
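A rough sketch of the pattern (illustrative; the block-numbering member function names here are assumptions, not quoted from the patch):

```cpp
#include "llvm/ADT/BitVector.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Dense bit-vector keyed by block number in place of a SmallPtrSet<BasicBlock*>.
static BitVector visitAllBlocks(const Function &F) {
  BitVector Seen(F.getMaxBlockNumber());
  for (const BasicBlock &BB : F)
    Seen.set(BB.getNumber()); // O(1) membership, contiguous storage
  return Seen;
}
```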
[SimplifyCFG][NFC] Renumber blocks when changing func (#186666)
Keep numbering dense when changing the function. SimplifyCFG is a good
candidate, because it is likely to remove blocks and preserves few
analyses.
[CFG][InstCombine][NFC] Use block numbers when finding backedges (#186668)
The functions traverse all basic blocks, so SmallPtrSets use a single
vector indexed by block number.
[CodeGenPrepare][NFC] Get BPI/BFI from pass/analysis manager (#186651)
BranchProbabilityInfo will compute its own dominator tree and
post-dominator tree if none is specified; avoid this by using the
analysis manager/pass manager to get the analysis, which will reuse the
previously computed DomTree.
[X86] combineConcatVectorOps - concat(vtruncs(x),vtruncs(y)) -> packss(shuffle(x,y),shuffle(x,y)) (#186678)
Although at worst this isn't a reduction in instruction count, the shuffle/packss sequence is much easier for further folds / shuffle combining
Revert "[CI] Try lowering max parallel link jobs on Windows (#185255)"
This reverts commit af22b50fac2311ff3f859e4e8bdec552c7aa8d5a.
This seems to have had no noticeable effect on the frequency of failures
so likely was not the issue.
Revert "Support float8_e3m4 and float8_e4m3 in np_to_memref (#186453)" (#186677)
This reverts commit 57427f84fe5fdda71aef4be257ed28d7b4f55d05.
For some reason mlir-nvidia CI is failing to import `float8_e3m4` from
`ml_dtypes`. See
https://lab.llvm.org/buildbot/#/builders/138/builds/27095.
[X86] combineConcatVectorOps - concat(vtruncus(smax(x,0)),vtruncus(smax(y,0))) -> packus(shuffle(x,y),shuffle(x,y)) (#186681)
Followup to vtruncs/packss handling
Update GitHub Artifact Actions (major) (#184052)
This PR contains the following updates:
| Package | Type | Update | Change |
|---|---|---|---|
|
[actions/download-artifact](https://redirect.github.com/actions/download-artifact)
| action | major | `v7.0.0` → `v8.0.1` |
|
[actions/upload-artifact](https://redirect.github.com/actions/upload-artifact)
| action | major | `v6.0.0` → `v7.0.0` |
|
[actions/upload-artifact](https://redirect.github.com/actions/upload-artifact)
| action | major | `6.0.0` → `7.0.0` |
[BPF] Use ".L" local prefix label for basic blocks (#95103)
Previously, PrivateLabelPrefix was default-initialized to "L", so basic
block labels were added to the symbol table. This seems like an
oversight, so use ".L" for all private labels.
[clang-tidy][NFC] Use universal type_traits mock (#186652)
[Utils] Format git-llvm-push
Use single quotes for string arguments inside f-strings; otherwise the
version of black that we use fails to parse. Also reformat the file,
given that formatting hasn't been working for a while (wholesale or
incrementally) due to the above issue.
[clang][doc] Improve error handling for `LibTooling` example code avoiding core dump (#98129)
Resolves #97983
[Clang][Docs] Clarify [[unlikely]] example in compound statement (#186590)
The first code example in the "confusing standard behavior" section
had a comment claiming `[[unlikely]]` makes the branch unlikely,
contradicting a later example showing the same placement being ignored.
Rewords the comment to clarify this is the C++ Standard's
recommendation that Clang does not follow, since the attribute is not on
the substatement.
Continues the work from #126372.
Fixes #126362.
[libc][Github] Bump clang in libc container to v23 (#186697)
Back to HEAD now that apt.llvm.org is working again for ToT.
[gn] port 629edaf67844c01db37 (CLANG_USE_XCSELECT)
[gn] port f002fc0ee8734283
[IR] Don't allow successors() over block without terminators (#186646)
There's no point constructing a dominator tree or similar on
known-broken IR. Generally, functions should be able to assume that IR
is valid (i.e., passes the verifier). Users of this "feature" were:
- Verifier, fixed by verifying existence of terminators first.
- FuzzMutate, worked around by temporarily inserting terminators.
- OpenMP to run analyses while building the IR, worked around by
temporarily inserting terminators.
- Polly to work with an empty dominator tree, fixed by temporarily
adding an unreachable inst.
- MergeBlockIntoPredecessor, inadvertently, fixed by adding terminator
before updating MemorySSA.
- Some sloppily written unit tests.
[IR] Add initial support for the byte type (#178666)
Following the [byte type RFC](https://discourse.llvm.org/t/rfc-add-a-new-byte-type-to-llvm-ir/89522)
and the discussions within the [LLVM IR Formal Specification WG](https://discourse.llvm.org/t/rfc-forming-a-working-group-on-formal-specification-for-llvm/89056), this PR introduces initial support for the byte type in LLVM. This PR:
- Adds the byte type to LLVM's type system
- Extends the `bitcast` instruction to accept byte operands
- Adds parsing tests for all new functionality
- Fixes failing regression tests (IR2Vec and IRNormalizer)
---------
Co-authored-by: George Mitenkov <georgemitenk0v at gmail.com>
[orc-rt] Don't return Error in Service::OnComplete. (#186708)
The Session can't do anything useful with these errors; it can only
report them. It's cleaner if the Service objects just report the error
directly.
[clang-tidy][NFC] Use universal memory mock for smart ptrs (#186649)
[orc-rt] Fix unittests after 53a1e056f38. (#186711)
Updates unittests to reflect Service interface changes.
Revert "[IR] Add initial support for the byte type" (#186713)
Reverts llvm/llvm-project#178666 to unblock CI.
`CodeGen/X86/byte-constants.ll` is at fault.
Will look into it and hopefully fix it by tomorrow.
[NFC] Delete `MCPseudoProbeDecoder`'s move constructor (#186698)
`MCPseudoProbeDecoder` cannot be copied/moved because it depends on the
address of its DummyInlineRoot member. Explicitly delete the move constructor.
[RISCV] Add `sifive-x160` and `sifive-x180` processor definitions (#186264)
This PR adds new processor definitions for two SiFive cores:
- X160
(https://www.sifive.com/document-file/sifive-intelligence-x160-gen2-product-brief):
A RV32 core with Zve32f
- X180
(https://www.sifive.com/document-file/sifive-intelligence-x180-gen2-product-brief):
A RVV-capable RV64 core
Both of them have VLEN=128.
Scheduling model supports will be added in follow-up patches.
[orc-rt] Add a simple iterator_range class. (#186720)
This will be used to simplify operations on iterator ranges in the ORC
runtime.
[LoongArch] Remove unreachable Value check in fixupLeb128 (#186297)
Value is guaranteed to be zero after the loop:
for (I = 0; Value; ++I, Value >>= 7)
Therefore the subsequent `if (Value)` condition is always false.
Remove the unreachable code. Reported by PVS-Studio.
Fixed: #170122
[lld][ELF] Fix crash when relaxation pass encounters synthetic sections
In LoongArch and RISC-V, the relaxation pass iterates over input sections
within executable output sections. When a linker script places a synthetic
section (e.g., .got) into such an output section, the linker would crash
because synthetic sections do not have the relaxAux field initialized.
The relaxAux data structure is only allocated for non-synthetic sections
in initSymbolAnchors. This patch adds the necessary null checks in the
relaxation loops (relaxOnce and finalizeRelax) to skip sections that
do not require relaxation.
A null check is also added to elf::initSymbolAnchors to ensure the
subsequent sorting of anchors is safe.
Fixes: #184757
Reviewers: MaskRay
Pull Request: https://github.com/llvm/llvm-project/pull/184758
[clang] Skip dllexport of inherited constructors with unsatisfied constraints (#186497)
When a class is marked `__declspec(dllexport)`, Clang eagerly creates
inherited constructors via `findInheritingConstructor` and propagates
the dllexport attribute to all members. This bypasses overload
resolution, which would normally filter out constructors whose requires
clause is not satisfied. As a result, Clang attempted to instantiate
constructor bodies that should never be available, causing spurious
compilation errors.
Add constraint satisfaction checks in `checkClassLevelDLLAttribute` to
match MSVC behavior:
1. Before eagerly creating inherited constructors, verify that the base
constructor's `requires` clause is satisfied. Skip creation otherwise.
2. Before applying dllexport to non-inherited methods of class template
specializations, verify constraint satisfaction. This handles the case
where `dllexport` propagates to a base template specialization whose own
members have unsatisfied constraints.
Inherited constructors skip the second check since their constraints
were already verified at creation time.
Fixes #185924
Followup to https://github.com/llvm/llvm-project/pull/182706
Assisted by: Cursor // Claude Opus 4.6
[orc-rt] Add LockedAccess utility. (#186737)
LockedAccess provides pointer-like access to a value while holding a
lock. All accessors are rvalue-ref-qualified, restricting usage to
temporaries to prevent accidental lock lifetime extension. A with_ref
method is provided for multi-statement critical sections.
[CIR] Add Pure trait to IsFPClassOp (#186625)
IsFPClassOp is a pure classification check on a floating-point value
with no memory effects.
[clangd] Report reference to UsingType's target decl at the correct location (#186310)
Fixes https://github.com/clangd/clangd/issues/2617
[SelectionDAG] Add CTTZ_ELTS[_ZERO_POISON] nodes. NFCI (#185600)
Currently llvm.experimental.cttz.elts are directly lowered from the
intrinsic.
If the type isn't legal then the target tells SelectionDAGBuilder to
expand it into a reduction, but this means we can't split the operation.
E.g. it's possible to split a cttz.elts nxv32i1 into two nxv16i1,
instead of expanding it into a nxv32i64 reduction.
vp.cttz.elts can be split because it has a dedicated SelectionDAG node.
This adds CTTZ_ELTS and CTTZ_ELTS[_ZERO_POISON] nodes and just enough
legalization to get tests passing. A follow up patch will add splitting
and move the expansion into LegalizeDAG.
[mlir][linalg] Use inferConvolutionDims for generic convolution downscaling (#180586)
The goal of this PR is to implement a generic, structure-aware
convolution downscaling transformation that works for any
convolution-like operation regardless of its specific layout or naming,
rather than relying on pattern-matching against specific named
operations.
Each pattern we currently have has hardcoded dimension indices
specific to its layout (e.g., NHWC vs NCHW).
This approach:
1. Requires maintaining many similar patterns.
2. Is brittle when new layouts are introduced.
3. Cannot handle batchless versions of the conv variants.
This PR thus creates a single downscaleSizeOneWindowedConvolution
function that uses `inferConvolutionDims` to semantically understand the
convolution structure (batch dims, output image dims, filter loop dims,
etc.) rather than hardcoding indices.
It works with any layout - NHWC, NCHW, or any other - because it reasons
about the meaning of dimensions, not their positions.
If the input to the downscaling pattern is a named op, the output will
be a named op; otherwise both input and output are generic ops.
For this reason we now remove the second RUN line, as the infrastructure
tests both named and generic ops.
Signed-off-by: Abhishek Varma <abhvarma at amd.com>
[clang-tidy] Fix an edge case in readability-implicit-bool-conversion (#186234)
Fix a FP for condition expressions wrapped by `ExprWithCleanups`.
Co-authored-by: EugeneZelenko <eugene.zelenko at gmail.com>
Co-authored-by: Zeyi Xu <zeyi2 at nekoarch.cc>
[X86][APX] Combine MOVABS+JMP to JMPABS when in no-PIC large code model (#186402)
[CodeGen] Call getMCPU once instead of commonly twice (NFC) (#186581)
[ARM] Try to lower sign bit SELECT_CC to shift (#186349)
Lower a `x < 0 ? 1 : 0` style SELECT_CC to `x>>(bw-1)`. This will become
more important with an upcoming change, but also appears to be somewhat
useful by itself.
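As a scalar illustration of the identity behind the lowering (hypothetical helper names, not the ISel code):
```
// Both forms compute the same value for 32-bit integers; selectCC/shifted are
// invented names for this sketch, not part of the ARM backend.
#include <cassert>
#include <cstdint>

static int selectCC(int32_t x) { return x < 0 ? 1 : 0; }    // SELECT_CC-style form
static int shifted(int32_t x) { return uint32_t(x) >> 31; } // x >> (bw - 1), logical shift

int main() {
  for (int32_t x : {INT32_MIN, -7, -1, 0, 1, 42, INT32_MAX})
    assert(selectCC(x) == shifted(x));
}
```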
[C++20] [Modules] Don't add discardable variables to module initializers (#186752)
Close https://github.com/llvm/llvm-project/issues/170099
The root cause of the problem is, we shouldn't add the inline variable
(which is discardable in linker's point of view) to the module's
initializers.
I verified with GCC's generated code to make the behavior consistent.
This is also a small optimization by the way.
[LV] Add more tests for blend masks. NFC (#186751)
To be used in #184838
[LangRef] Fix typo in signatures for rounding intrinsics (#186709)
Fixes #186536
[lldb-dap] Mark return value as readonly (#186329)
Marked return value as readonly to give VS Code a hint that this
variable doesn't support `setVariable` request.
[orc-rt] Add Controller Interface (CI) symbol table to Session. (#186747)
The Controller Interface is the extended set of symbols (mostly wrapper
functions) that the controller can call prior to loading any JIT'd code.
It is expected that it will be used to inspect the process and create /
configure services to enable JITing.
[AArch64] Add extra test coverage to legalize-shuffle-1x.ll. NFC
[AMDGPU] Initialize more fields in the SIInsertWaitcnts constructor. NFC. (#186394)
ST, TII, TRI and MRI can all be initialized in the constructor and hence
be references instead of pointers.
[AVR] Optimize expansion of pseudo instruction SPWRITE for no SPH devices (#152905)
fixes https://github.com/llvm/llvm-project/issues/148560
[AMDGPU] Simplify state clearing in SIInsertWaitcnts. NFC. (#186399)
There is no need to clear state at the start or end of the run method,
because a fresh instance of SIInsertWaitcnts is constructed for each run
on a MachineFunction.
[flang][NFC] Converted five tests from old lowering to new lowering (part 31) (#186299)
Tests converted from test/Lower/Intrinsics: iall.f90, iand.f90,
iany.f90, ibclr.f90, ibits.f90
[libc++] Avoid including <cmath> in <format> (#186332)
This reduces the time to parse `<format>` a bit.
[X86] Blocklist instructions that are unsafe for masked-load folding. (#178888)
This PR blocklists instructions that are unsafe for masked-load folding.
Folding with the same mask is only safe if every active destination
element reads only from source elements that are also active under the
same mask. These instructions perform element rearrangement or
broadcasting, which may cause active destination elements to read from
masked-off source elements.
VPERMILPD and VPERMILPS are safe only in the rrk form; the rik form
needs to be blocklisted. In the rrk form, the masked source operand is a
control mask, while in the rik form the masked source operand is the
data/value. This is also why VPSHUFB is safe to fold, while other
shuffles such as VSHUFPS are not.
Examples:
```
EVEX.128.66.0F.WIG 67 /r VPACKUSWB xmm1{k1}{z}, xmm2, xmm3/m128
A: 00010203 7F000001 80000002 DEADBEEF
E : 00000000 00000001 00000002 00000003
D: 11111111 22222222 33333333 44444444
k = 0x0400
Masked_e = 00000000 00000000 00000000 00000000 (vmovdqu8{k}{z} Masked_e E)
res1 = 00000000 00000000 00010000 00000000 (VPACKUSWB D{k}{z}, A, E)
res2 = 00000000 00000000 00000000 00000000 (VPACKUSWB D{k}{z}, A, Masked_e)
EVEX.128.66.0F38.W0 C4 /r VPCONFLICTD xmm1 {k1}{z}, xmm2/m128/m32bcst
A: DAA66D2B FFFFFFFC FFFFFFFC D9A0643C
E : 7DDF743F 00000000 5FD99E73 4ED634C9
D: 2629AB38 9E37782F 67BB800F AD66764A
k = 0x0002
Masked_e = (vmovdqu32 {k}{z} Masked_e E)
res1 = 00000000 00000000 00000000 00000000 (VPCONFLICTD D{k}{z}, E)
res2 = 00000000 00000001 00000000 00000000 (VPCONFLICTD D{k}{z}, Masked_e)
EVEX.128.66.0F38.W1 8D /r VPERMW xmm1 {k1}{z}, xmm2, xmm3/m128
A: 00010203 7F000001 80000002 DEADBEEF
E : 00000000 00000001 00000002 00000003
D: 11111111 22222222 33333333 44444444
k = 0x0010
Masked_e = 00000000 00000000 00000002 00000000 (vmovdqu16 {k}{z} Masked_e E)
res1 = 00000000 00000000 00000001 00000000 (vpermw D{k}{z}, A, E)
res2 = 00000000 00000000 00000000 00000000 (vpermw D{k}{z}, A, Masked_e)
EVEX.128.66.0F38.W0 78 /r VPBROADCASTB xmm1{k1}{z}, xmm2/m8
E : 7F4A7C15 6E490933 5D4C9659 4C433CE3
D: F63F9D36 97F6E2B2 9432E8E6 FAEE7A3E
k = 0x0002
Masked_e = 00007C00 00000000 00000000 00000000 (vmovdqu8{k}{z} Masked_e E)
res = 00001500 00000000 00000000 00000000 (vpbroadcastb D{k}{z}, E)
res = 00000000 00000000 00000000 00000000 (vpbroadcastb D{k}{z}, Masked_e)
```
Baseline: https://github.com/llvm/llvm-project/pull/178411
[flang][OpenMP] Implement nest depth calculation in LoopSequence (#186477)
Calculate two depths, a semantic one and a perfect one. The former is
the depth of a loop nest taking into account any loop- or
sequence-transforming OpenMP constructs. The latter is the maximum level
to which the semantic nest is a perfect nest.
Issue: https://github.com/llvm/llvm-project/issues/185287
Reinstate PR185298 after a fix has been merged in PR186416. Includes a
testcase that triggered failures before.
[clang][bytecode] Remove FunctionPointer class (#186757)
It's been mostly living inside `Pointer` for a long time now, so remove
the leftovers.
[SPIR-V] Address comments on SPV_INTEL_masked_gather_scatter extension implementation (#186336)
Address comments left after merge of #185418
[libc] Fix build failures in fuzzing tests (#185017)
The tests:
- __support/freelist_heap_fuzz.cpp
- fuzzing/string/strlen_fuzz.cpp
had build failures for different reasons. This patch fixes these
failures.
freelist_heap_fuzz.cpp had this error:
```
llvm-project/libc/fuzzing/__support/freelist_heap_fuzz.cpp:150:26: error: use of undeclared identifier 'Block'; did you mean '__llvm_libc_23_0_0_git::Block'?
150 | size_t alignment = Block::MIN_ALIGN;
| ^~~~~
| __llvm_libc_23_0_0_git::Block
```
The issue stems from the fact that Block was not available in scope. It
needs to be referenced via LIBC_NAMESPACE.
strlen_fuzz.cpp had this error:
```
In file included from Workspace/llvm-project/libc/fuzzing/string/strlen_fuzz.cpp:14:
In file included from /usr/lib/gcc/x86_64-linux-gnu/13/../../../../include/c++/13/cstdint:38:
In file included from /usr/lib/gcc/x86_64-linux-gnu/13/../../../../include/x86_64-linux-gnu/c++/13/bits/c++config.h:679:
/usr/lib/gcc/x86_64-linux-gnu/13/../../../../include/x86_64-linux-gnu/c++/13/bits/os_defines.h:44:5: error: function-like macro '__GLIBC_PREREQ' is not defined
44 | #if __GLIBC_PREREQ(2,15) && defined(_GNU_SOURCE)
```
This issue is more cryptic to me, but I managed to fix it by changing
the includes from cstdint and cstring to stdint.h and string.h.
[LifetimeSafety] Extract Sema helper implementation to separate header (#186492)
Improves code organization by separating lifetime safety Sema-specific
functionality into its own header file.
[clang][AArch64] Update label in test (nfc) (#186759)
[clang-tidy] Fix performance-use-std-move when moving a forward decl (#186704)
This fixes running clang-tidy on top-of-tree with that check on.
[clang][bytecode][NFC] Pre-commit a test case (#186773)
Make sure we get the `expand()` during `computeOffsetForComparison()`
right.
[Analysis][NFC] Use block numbers for BranchProbabilityInfo (#186658)
Instead of a hash map mapping pairs of blocks and successor index to the
probability, store the probabilities as flat array and start indices
into this array in a per-block information vector.
Also drop value handles: no stored pointers => no stale pointers. If a
block is removed, the block number is not reused unless the function is
renumbered, and BPI doesn't support renumbering.
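A minimal sketch of that storage scheme (invented names, not the actual BranchProbabilityInfo code):
```
// Flat probability array plus a per-block record holding the start index.
#include <cstdint>
#include <vector>

struct BlockProbRecord {
  unsigned FirstIndex = 0; // start of this block's successor probabilities in Probs
  unsigned NumSuccs = 0;   // number of successors of the block
};

struct FlatBranchProbs {
  std::vector<BlockProbRecord> PerBlock; // indexed by block number
  std::vector<uint32_t> Probs;           // all probabilities, stored contiguously per block

  // Probability of the edge from BlockNo to its SuccIdx-th successor.
  uint32_t getEdgeProb(unsigned BlockNo, unsigned SuccIdx) const {
    const BlockProbRecord &R = PerBlock[BlockNo];
    return Probs[R.FirstIndex + SuccIdx];
  }
};
```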
[WebAssembly] Lower wide vector shifts by constant to extmul pairs (#184007)
Wide vector multiplications by power-of-2 constants were
canonicalized to v8i32 shl nodes. Generic legalizers then split these
into separate 128-bit extend and shift operations, bypassing
WebAssembly's native extended multiplication patterns.
Before:
mul v8i32:t1, <4096, ...>
=> shl v8i32:t1, <12, ...>
=> split into independent 128-bit extend + shift sequences
WebAssembly SIMD has no native wide vector shifts, but it does
support 128-bit extended multiplications. Lowering these nodes
directly to extmul_low/extmul_high pairs keeps them in native 128-bit
form and improves DAG matching.
After:
mul v8i32:t1, <4096, ...>
=> concat_vectors (extmul_low t1, c), (extmul_high t1, c)
This preserves the original vector width while utilizing the native
128-bit SIMD pipeline.
Fixed: https://github.com/llvm/llvm-project/issues/179143
[LSR] Remove unnecessary WidestFixupType (NFC) (#185013)
The purpose of WidestFixupType is to prevent FindUseWithSimilarFormula
from matching a formula with different widest fixup type, but this never
happens:
* FindUseWithSimilarFormula is only called by
NarrowSearchSpaceByCollapsingUnrolledCode
* That function only considers Address and ICmpZero kinds, as they're
the only ones that allow a nonzero BaseOffset
* In an Address use all fixups have pointer type
* FindUseWithSimilarFormula already excludes ICmpZero uses
[AMDGPU] Make WaitcntBrackets::Limits a reference. NFC. (#186782)
Reland [VPlan] Extend interleave-group-narrowing to WidenCast (#186454)
The patch was initially landed as bd5f9384, but then reverted due to an
underlying issue in narrowInterleaveGroups, described in #185860. The
issue has since been fixed. The reland is simply a conflict-resolved
version of the original patch, which includes an additional test update.
WidenCast is very similar to Widen recipes.
Fixes #128062.
[IR] Drop BasicBlockEdge::isSingleEdge (#186767)
This was only called on CondBr instructions, where it is always faster
to access the successors directly than to use successors().
Multi-edges don't dominate anything, so this rare case is often already
handled by dominates().
There is also a very small (hardly measurable) performance
improvement here (it did show up in profiles at 0.03% or so).
[C2y] Update the C Status Page from the recent meetings (#186487)
The Feb and Mar 2026 virtual meetings are now concluded, these are the
adopted papers which could potentially impact the compiler.
[libclc] Add generic clc_mem_fence instruction (#185889)
Summary:
This can be made generic, which works as expected on NVPTX and SPIR-V.
We do not replace this for AMDGPU because the dedicated built-in has an
extra argument that controls whether or not local memory or global
memory will be invalidated. It would be correct to use this generic
operation there, but we'd lose that minor optimization so we likely
should not regress.
[NFC][analyzer] Eliminate NodeBuilder::getContext() (#186201)
This is a step towards the removal of the type `NodeBuilderContext`.
The few remaining locations that used `NodeBuilder::getContext()` were
changed to use the methods `getCurrBlock()` and `getNumVisitedCurrent()`
of `ExprEngine`.
The new code is equivalent to the old one because the `NodeBuilder`s
were constructed with `ExprEngine::currBldrCtx` as their context, which
is currently the "backend" behind `getCurrBlock()` and
`getNumVisitedCurrent()` -- but these methods will remain valid after
the removal of `NodeBuilderContext` and `currBldrCtx`.
[libc][Github] Bump libc-fullbuild-tests.yml to clang 23 (#186699)
Do this now that it is available in the container.
[LifetimeSafety] Add user documentation (#183058)
[LLVM][CodeGen][SVE] insert_subvector(undef, splat(C), 0) -> splat(C). (#186090)
When converting a fixed-length constant splats to scalable vector we can
instead regenerate the splat using the target type.
[ADT] Add `Repeated<T>` for memory-efficient repeated-value ranges (#186721)
Introduce a lightweight range representing N copies of the same value
without materializing a dynamic array. The range owns this value.
I plan to use it with MLIR APIs that often end up requiring N copies of
the same thing. Currently, we use `SmallVector<T>(N, Val)` for these,
which is wasteful.
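A minimal sketch of the idea, for illustration only (not the actual `Repeated<T>` implementation added here):
```
// A range that yields N copies of a single owned value without allocating N elements.
#include <cstddef>
#include <iterator>
#include <utility>

template <typename T> class RepeatedRange {
  T Value;
  std::size_t Count;

public:
  RepeatedRange(T V, std::size_t N) : Value(std::move(V)), Count(N) {}

  struct iterator {
    using iterator_category = std::forward_iterator_tag;
    using value_type = T;
    using difference_type = std::ptrdiff_t;
    using pointer = const T *;
    using reference = const T &;

    const T *Val;
    std::size_t Idx;

    const T &operator*() const { return *Val; }
    iterator &operator++() { ++Idx; return *this; }
    bool operator==(const iterator &O) const { return Idx == O.Idx; }
    bool operator!=(const iterator &O) const { return Idx != O.Idx; }
  };

  iterator begin() const { return {&Value, 0}; }
  iterator end() const { return {&Value, Count}; }
  std::size_t size() const { return Count; }
};

// Usage: iterate 8 copies of 42 while storing only one int.
// for (int V : RepeatedRange<int>(42, 8)) { /* ... */ }
```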
---------
Co-authored-by: Claude Opus 4.6 <noreply at anthropic.com>
[NFC][analyzer] Refactor ExprEngine::processCallExit (#186182)
This commit converts `ExprEngine::processCallExit` to the new paradigm
introduced in 1c424bfb03d6dd4b994a0d549e1f3e23852f1e16 where the current
`LocationContext` and `Block` is populated near the beginning of the
`dispatchWorkItem` call (= elementary analysis step) and remains
available during the whole step.
Unfortunately the first half of the `CallExit` procedure (`removeDead`)
happens within the callee context, while the second half (`PostCall` and
similar callbacks) happen in the caller context -- so I need to change
the current `LocationContext` and `Block` at the middle of this big
method.
This means that I need to discard my invariant that
`setCurrLocationContextAndBlock` is only called once per each
`dispatchWorkItem`; but I think this exceptional case (first half in
callee, second half in caller) is still clear enough.
In addition to this main goal, I perform many small changes to clarify
and modernize the code of this old method.
[IR][NFC] Hot-cold splitting in PatternMatch (#186777)
ConstantAggregates are rare, therefore split that check into a separate
function so that the fast path can be inlined.
Likewise for vectors, which occur much less frequently than scalar
values.
[AArch64] Add partial reduce patterns for new sve dot variants (#184649)
This patch enables generation of new dot instruction added in 2025 arm
extension from partial reduce nodes.
Update docker/login-action action to v4 (#186719)
This PR contains the following updates:
| Package | Type | Update | Change |
|---|---|---|---|
| [docker/login-action](https://redirect.github.com/docker/login-action)
| action | major | `v3.6.0` → `v4.0.0` |
AMDGPU: Don't limit VGPR usage based on occupancy in dVGPR mode (#185981)
The maximum VGPR usage of a shader is limited based on the target occupancy,
ensuring that the targeted number of waves actually fit onto a CU/WGP.
However, in dynamic VGPR mode, we should not do that, because VGPRs are
allocated dynamically at runtime, and there are no static constraints based
on occupancy. Fix that in this patch.
Also fixup the getMinNumVGPRs helper to behave consistently by always
returning zero in dVGPR mode.
This also fixes a problem where AMDGPUAsmPrinter bumps the VGPR usage to at
least the result of getMinNumVGPRs, per my understanding in order to avoid
an occupancy that is higher than the occupancy target. That was causing
incorrect (too high) VGPR usages in dVGPR mode with medium-sized workgroups
(say 768).
[VPlan] Assert CanIV is the first header phi, drop begin (NFC).
Split off as suggested in https://github.com/llvm/llvm-project/pull/156262/.
[DWARFVerifier] Fix infinite loop in verifyDebugInfoCallSite (#186413)
When attempting to find the callsite for a DwarfDie to see if it was
valid or not, there was a while loop that incorrectly attempted to walk
up the Die parent hierarchy. It set `curr` to the parent, but then kept
setting `curr` back to that same original parent instead of `curr.getParent()`.
This caused an infinite loop during validation of some kernel binaries by
llvm-dwarfdump where DW_TAG_call_site was nested inside a
DW_TAG_lexical_block (or any non-subprogram, non-inlined_subroutine
tag).
Fix by changing Die.getParent() to Curr.getParent() so the loop
correctly walks up the DIE tree.
Add a new test that validates this scenario. Without this change, that
test hangs rather than succeeding.
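The shape of the bug and the fix, reduced to a self-contained sketch with a hypothetical Node type standing in for DWARFDie:
```
// Reduced illustration of the loop bug; Node is a stand-in, not the verifier's code.
struct Node {
  const Node *Parent = nullptr;
  bool IsSubprogramLike = false; // subprogram or inlined subroutine
  const Node *getParent() const { return Parent; }
};

// Walk up from Die looking for the enclosing subprogram-like ancestor.
const Node *findEnclosingSubprogram(const Node &Die) {
  const Node *Curr = Die.getParent();
  while (Curr && !Curr->IsSubprogramLike) {
    // Buggy version: Curr = Die.getParent();  // never advances, so the loop spins forever
    Curr = Curr->getParent();                  // fixed: walk up one level per iteration
  }
  return Curr;
}
```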
[IR][NFC] Inline CmpInst::isSigned/isUnsigned (#186791)
These are small helper functions that compile to three instructions each
and are called somewhat regularly, so inlining them is beneficial.
A very minor improvement nonetheless.
[Utils] Modernize type annotations in git-llvm-push
Import annotations from __future__ so we can start using more modern
annotations now rather than once we move to Python 3.10 while still
preserving Python 3.8 compatibility. Also fix a couple typing issues
while here.
Reviewers: ilovepi, petrhosek
Pull Request: https://github.com/llvm/llvm-project/pull/186690
[CodeGen] Fix C++ global dtor for non-zero program AS targets (#186484)
In codegen for C++ global destructors, we pass a pointer to the
destructor to be called at program exit as the first arg to the
`__cxa_atexit` function.
If the target's default program AS and default AS are not equal, we need
to emit an addrspacecast from the program AS to the generic AS (which is
used as the argument type for the first arg of `__cxa_atexit`) in the
function call.
---------
Signed-off-by: Nick Sarnie <nick.sarnie at intel.com>
[NFC][LLVM] Fix indentation issue in AArch64ExpandPseudo::expandMI (#186375)
[lldb][NativePDB] Compile `vbases.test` without default libraries (#186510)
The test is compiled with `--target=x86_64-windows-msvc`, which causes the
final executable to be linked to `libcmt.lib`. That doesn't work on ARM, so this PR changes
the command line to link without the default libraries. They're not
needed if we disable `/GS` (buffer security check) like in other tests.
We use `%clang_cl` over `%build` to be able to compile with DWARF as
well.
[lit] Stop holding subprocess objects open in TimeoutHelper (#186712)
Tweak TestRunner's TimeoutHelper storage to hold only PIDs rather
than the whole process object. Holding the object causes many pipes to
stay open, when all we need is the pid.
Addresses #185941
[SPIR-V] Fix llvm.spv.gep return type for vector-indexed GEPs (#185931)
The `int_spv_gep` intrinsic was defined with `llvm_anyptr_ty` which
forced it to return a scalar pointer. Change the return type to
`llvm_any_ty` to allow the intrinsic to match the actual result type of
the original GEP, whether scalar or vector
[Flang][OpenMP] Provide option to use heap allocation for private adjustable arrays (#186795)
The size of adjustable Fortran arrays is not known at compilation time.
Using limited GPU stack memory may cause hard-to-debug errors. On the
other hand, switching to heap memory allocation may lead to missed
optimization opportunities and significantly increased kernel execution
time.
Adding the option `-mmlir --enable-gpu-heap-alloc` allows the user to
generate valid code for adjustable Fortran arrays. The flag is off by
default, so there is no efficiency penalty for code that does not use
adjustable arrays.
[libc] Fix llvm-gpu-loader passing uninitialized device memory (#186804)
Summary:
The return value was not zeroed; the zeroing was accidentally dropped when
we did the port, and since the value is zero "almost always" I didn't notice.
Hopefully this makes the test suite no longer flaky.
[mlir][linalg][elementwise] Fold broadcast into new elementwise (#167626)
Fold broadcast into new elementwise Op which has affine-map attached.
Merging on behalf of @someoneinjd
[DomTree] Assert non-null block for pre-dom tree (#186790)
In a pre-dominator tree, blocks should never be null.
[mlir][llvmir][OpenMP] Translate affinity clause in task construct to llvmir (#182223)
Translate affinity entries to LLVMIR by passing affinity information to
createTask (__kmpc_omp_reg_task_with_affinity is created inside
PostOutlineCB).
3/3 in stack for implementing affinity clause with iterator modifier
1/3 #182218
2/3 #182222
3/3 #182223
[lldb][Module] Remove feedback_stream parameter from LoadScriptingResources (#186787)
I'm in the process of making `LoadScriptingResources` interactively ask
a user whether to load a script. I'd like to turn the existing warning
into the prompt. The simplest way to achieve this is to not print into a
`feedback_stream` parameter, and instead create a prompt right there.
This patch removes the `feedback_stream` parameter and emits a
`ReportWarning` instead. If we get around to adding the prompt instead
of the warning, those changes will be simpler to review. But even if we
don't end up replacing the warning with a prompt, moving away from
output parameters and towards more structured error reporting is a
nice-to-have (e.g., the `warning` prefix is now colored, IDEs have more
flexibility on how to present the warning, etc.).
For a command-line user nothing should change with this patch (apart
from `warning:` being highlighted).
[PowerPC] Use lxvp/stxvp for mcpu=future v256i1 types (#184447)
For `-mcpu=future`, add patterns to use paired vector instructions
(lxvp/lxvpx/stxvp/stxvpx)
for v256i1 operations instead of splitting into two separate vector
operations.
Assisted by AI.
[VPlan] Simplify&clarify skipping VPValues in calculateRegisterUse (NFC)
Split off as suggested in https://github.com/llvm/llvm-project/pull/156262/.
This refactors the code to clarify comments and code, in preparation for #156262.
[OpenMP][AMDGPU] Enable omptest build (#161649)
This enables building the omptest library across the AMD buildbots that
rely on this CMake cache.
[flang][NFC] Converted five tests from old lowering to new lowering (part 32) (#186730)
Tests converted from test/Lower/Intrinsics: ibset.f90, ichar.f90,
ieee_class.f90, ieee_copy_sign.f90, ieee_is_finite.f90
[SLP]Fix legality checks for bswap-based transformations
Fix the checks for the non-power-of-2 base bswaps by checking whether the
source type, rather than the target scalar type, is a power of 2. Also add
cost estimation for zext when the source type does not match the scalar type.
Fixes https://github.com/llvm/llvm-project/pull/184018#issuecomment-4053477562
[VPlan] Check isa<VPRecipeValue> directly, remove unused variable (NFC).
[MLIR][Presburger] Add support for Smith normal form (#185328)
FPL already has support for computing Hermite normal form for integer
matrices. Here we add support to computing Smith normal form.
This is a preparation for Barvinok's algorithm. Given a polyhedron $P =
\{ x | Ax + b = 0, Cx + d \leq 0 \}$, we must find a particular solution
$x_0$ of $Ax + b = 0$ in order to project lower-dimensional polyhedra
into full-dimensional ones. This requires the Smith normal form of the
integer matrix $A$.
The implementation here follows the algorithm in
[wikipedia](https://en.wikipedia.org/wiki/Smith_normal_form#Algorithm).
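For reference, the standard statement of the Smith normal form (textbook definition, not a description of the FPL API):
```
% For any integer matrix A of rank r there exist unimodular integer
% matrices U and V such that
\[
  U A V = \operatorname{diag}(d_1, d_2, \dots, d_r, 0, \dots, 0),
  \qquad d_i \mid d_{i+1} \quad \text{for } 1 \le i < r,
\]
% where each d_i is a positive integer; the d_i are the invariant factors of A.
```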
AMDGPU/GlobalISel: RegBankLegalize rules for s_barrier/wave_barrier (#186512)
[X86] Move getMaskNode to avoid unnecessary forward declarations. (#186815)
I've also improved the assertions on the source / bool mask types to
catch bad use cases.
Cleanup pre-work to allow the i512 codegen to eventually use getMaskNode
instead of manual bool mask creations
Revert "[SLP]Fix legality checks for bswap-based transformations"
This reverts commit 2d4daea3b66469420fc164e76c15558b34e44c75 to fix
a buildbot failure: https://lab.llvm.org/buildbot/#/builders/164/builds/19737
[RISCV] Fold waddau/wsubau to waddu/wsubu when possible (#186635)
If the wide input is zero extended and only one narrow input is
used, we can fold to waddu/wsubu.
[WebAssembly] Support acquire-release atomics in CodeGen (#184900)
Set the correct memory ordering for relaxed atomics after ISel. This allows
SelectionDAG to keep the simple generic selection for target-independent
AtomicLoad nodes, but keeps the ordering immediate correct in the MIR.
Notably, the MachineMemOperand still has the original memory ordering,
and MIR passes would use that rather than the ordering immediate to make
their code motion decisions (if we had any for Wasm, which we don't).
Revert "[DomTree] Assert non-null block for pre-dom tree" (#186831)
Reverts llvm/llvm-project#186790
Breaks buildbots, there are more SLPVectorizer problems.
https://lab.llvm.org/buildbot/#/builders/52/builds/15810
[CIR][AArch64] Lower BF16 vduph lane builtins (#185852)
Part of #185382.
Lower `__builtin_neon_vduph_lane_bf16` and
`__builtin_neon_vduph_laneq_bf16` in ClangIR to `cir.vec.extract`,
and add dedicated AArch64 Neon BF16 tests.
This is my first LLVM PR, so I'd really appreciate any suggestions on
the implementation, test structure, or general LLVM contribution style.
[flang][parser] Add a feature flag for multiple program units on one line. (#186533)
This PR adds a feature flag `MultipleProgramUnitsOnSameLine` that by
default allows program units to be terminated by semicolons, and then
allows the next program unit to follow on the same line.
It also adds some test programs that demonstrate multiple program units
on one line and show the portability warning with "-pedantic".
[X86] Add test showing failure to fold compress(splat(x),splat(x),mask) -> splat(x) (#186823)
Noticed while working on i512 shift expansion - if we end up with repeated splat args, we fail to remove the compress node
[libc][math] Refactored atanpif16 to header only (#184316)
Fixes #178105
Reapply "[clang][ssaf] Add --ssaf-extract-summaries= and --ssaf-tu-summary-file= options" (#186463)
This reverts commit 3548ec95178c00a2895a65b435945ce318396c8e and adapts
the code to the new ScalableStaticAnalysisFramework/ directory layout.
Re-adds:
- `TUSummaryExtractorFrontendAction` and its integration into `ExecuteCompilerInvocation`
- `--ssaf-extract-summaries=` and `--ssaf-tu-summary-file=` CLI options
- SSAFForceLinker / SSAFBuiltinForceLinker headers and anchor symbols
- Diagnostics under -Wscalable-static-analysis-framework
- Lit tests for the CLI and unit tests for the frontend action
- Changes the Formats to be lowercase - and match their spellings in the file paths.
[libc][math] Refactor bf16fma to Header Only (#182572)
Fixes #181625
[MIR][NFC] Test verbalising INLINEASM extra-info flags. (#186796)
Exposes the bug in printing `inteldialect`.
[libc][math] Refactor log_bf16 to Header (#186618)
AMDGPU/GlobalISel: RegBankLegalize rules for ds_read_tr* (#186006)
AMDGPU/GlobalISel: RegBankLegalize rules for ctlz/cttz_zero_undef (#186546)
[X86] known-pow2.ll - add min/max vector test coverage for #182369 (#186841)
AMDGPU/GlobalISel: RegBankLegalize rules for s_wait intrinsics (#186254)
[InstCombine] Support disjoint or in add-sub reassociation fold (#186827)
[lldb] Include stdio.h in synthetic subscript test (#186847)
The [lldb-aarch64-windows](https://lab.llvm.org/buildbot/#/builders/141)
buildbot failed with:
```
lld-link: error: undefined symbol: printf
>>> referenced by main.o:(main)
```
I'm assuming that's because of the use of `__builtin_printf`. In other
tests, we use `printf` from `stdio.h` and these build fine, so I added
an include and used `printf`.
[AMDGPU][GlobalIsel] Add register bank legalization rules for amdgcn_wqm amdgcn_softwqm amdgcn_strict_wqm (#186214)
This patch adds register bank legalization rules for amdgcn_wqm
amdgcn_softwqm amdgcn_strict_wqm in the AMDGPU GlobalISel pipeline.
[flang] Reorder messages wrt line number before diff(actual, expect) (#186812)
When messages are attached together, the source locations to which they
refer are not necessarily monotonically increasing. For example
```
error: foo.f90:10: There is a problem here # line 10
because: foo.f90:12: This thing is invalid # line 12 (attached)
error: foo.f90:11: There is another problem here # line 11
```
There is no way to represent that in the source file via ERROR
annotations, so before running unified_diff "canonicalize" the list of
messages into an order that corresponds to the line numbers.
---------
Co-authored-by: Michael Kruse <llvm-project at meinersbur.de>
[ForceFunctionAttrs] Fix handling of conflicts for more attributes (#186304)
Fixes #185277
ForceFunctionAttrs currently only checks the `alwaysinline`/`noinline`
conflict when forcing function attributes. This is incomplete, because
LLVM verifier rules define additional incompatible function attribute
combinations.
Extend hasConflictingFnAttr() to reject more conflicting function
attributes, including combinations involving `optnone`, `minsize`,
`optsize`, and `optdebug`.
Also add required companion attributes when forcing function attributes:
`optnone` requires `noinline`, so forceattrs now adds `noinline`
automatically when needed.
[lldb] When LLDB_ENABLE_MTE is ON always run the driver with MTE (#186322)
When LLDB_ENABLE_MTE is set to ON, we should always run the driver with
MTE by signing with the checked-allocations entitlement.
[clang][bytecode][NFC] Add Function::dump() taking no arguments (#186819)
Instead of relying on the default value. That one doesn't work properly
in lldb and I have to pass the `{}` explicitly every time.
[analyzer][NFC] Reorg and add clang::suppress tests (#186447)
This reorganizes the current clang suppression tests; and adds quite a
few more cases to the corpus.
This prepares the ground for #183727
While the test cases were generated by AI, I've personally checked every
single line and expectation.
Assisted-by: claude
[libc] Add myself as maintainer for Math, Threading and Runtime Safety (#186595)
[gn] port c5e5d5b282ded4
[flang][acc] Handle deduplicated use_device in ACCUseDeviceCanonicalizer (#186855)
The ACCUseDeviceCanonicalizer was attempting to remove the `acc.use_device`
operation even when it was used in multiple constructs. This updates the
pass to remove it only when no longer used, which for the attached
example is after the handling of the second `acc.host_data` construct.
[sanitizer_common] Define SANITIZER_WEAK_IMPORT for Go race detector (#186525)
Currently, when building the Go race detector (when SANITIZER_GO
is set), SANITIZER_WEAK_IMPORT is a no-op. It is perfectly fine to
define SANITIZER_WEAK_IMPORT for Go just like other cases. That
will tell the Go linker to treat _dyld_get_dyld_header as a weak
import.
Perhaps SANITIZER_WEAK_ATTRIBUTE can also be defined for Go. That
would be a separate patch.
[SPIRV][Matrix] Add support for Array Vector memory layout (#186215)
- fixes #179879
- The change is threefold:
1. Look for the Matrix Memory layout.
2. Refactor out the common pieces of loadVectorFromArray into a helper
that can be shared with the matrix case.
3. The matrix case needs special indexing so we can do vector GEPs
instead of scalar GEPs that would require a 2D loop.
[OpenMP][NFC] Fix stale DeviceRTL header path in OpenMPIRBuilder (#185563)
The `\see` comment in `OpenMPIRBuilder.h` references
`openmp/libomptarget/deviceRTLs/common/include/target.h`. This file no
longer exists.
This patch updates the comment to point to the current correct header:
`openmp/device/include/Interface.h`.
[flang][OpenMP] Identify affected loops, provide reason (#185299)
Implement utility functions to calculate the number of affected loops in
a sequence or in a nest. Provide a reason for the returned value to be
used in an explanatory message.
Issue: https://github.com/llvm/llvm-project/issues/185287
[SLP]Fix legality checks for bswap-based transformations
Fix the checks for the non-power-of-2 base bswaps by checking whether the
source type, rather than the target scalar type, is a power of 2. Also add
cost estimation for zext when the source type does not match the scalar type, and fix the final bitcasting for the reduced values.
Fixes https://github.com/llvm/llvm-project/pull/184018#issuecomment-4053477562
[llvm][Support] Fix an off-by-1 bug in YAML parser (#186731)
Closes #171620
Revert "[Format] Configure ASSIGN_OR_RETURN macros for Google style" (#186445)
Reverts llvm/llvm-project#169037
The change breaks formatting of real code containing ASSIGN_OR_RETURN
macros nested into lambdas. See
https://github.com/llvm/llvm-project/pull/169037#issuecomment-4056423543
for the test case.
[clang-format] Correctly annotate binary stars in braced init lists (#186732)
Fixes #175241
[clang-format] Identify include guard #endif followed by comments (#186848)
Fixes #176321
[SPIR-V] Fix scalarization of 1-element vector (#185529)
Previous patch #180735 didn't handle 1-element vectors nested in
aggregates.
[libc] Build fuzzing tests in pre-merge CI tests (#185018)
At the moment, no CI job tests whether the fuzzing tests build
correctly.
This patch adds the build of fuzzing tests to the pre-merge CI job.
Only two configurations have it enabled for now. The none-eabi
configurations seemingly do not support it because in their cmake
configs compiler-rt is not enabled, hence libFuzzer isn't built. I did
not dig too much to understand why that is, preferring to just leave it
disabled for these configurations. For the remaining ones that seem to
support it, I selected one x86 and one aarch64.
In addition, it removes one outdated comment about the build type used
and changes the action to run on all branches, not only on PRs that
target main.
If we limit it to run only on PRs to the main branch, it will not run on
stacked PRs. I believe it is also okay to run it on PRs to release
branches. Therefore it is just easier to remove the limit altogether.
[clang-format] Fix a bug in indenting lambda comments with only tabs (#186862)
Fixes #175151
[PowerPC][NFC] Refactor Register class and operand definitions (#185647)
Created a comprehensive base class system in PPCRegisterClasses.td to
eliminate repetitive RegisterOperand definitions across PowerPC register
files, and introduced a PPCRegOperand multiclass to automatically
generate AsmOperandClass and RegisterOperand definitions, eliminating
~50 lines of boilerplate.
Assisted by AI.
[LLVM] [SeparateConstOffsetFromGEP] patch PR 183402 to handle negative C correctly (#186858)
A small typo in the negative C threshold calculation resulted in a
threshold that is too conservative due to overflow.
[Clang][Docs] Discontinue documenting the GCC -I- and --include-barrier options. (#184941)
Clang has never implemented the GCC `-I-` and `--include-barrier`
options. An error is issued if they are used. GCC deprecated these
options in GCC 4. Advertising their availability in documentation and
help text is misleading.
[AVR] Improve expansion of pseudo instruction SPREAD (#186780)
It would be better to set the higher register to zero for devices
without `SPH`.
[HLSL] Use 0 to represent unbounded resources (#186022)
SPIRV backend uses 0 to represent unbounded arrays. This patch makes
unbounded resources be represented with 0 when binding them, as well as
makes sure the backend uses OpTypeRuntimeArray to represent such cases.
Fix: https://github.com/llvm/llvm-project/issues/183367
[AArch64] Allocate two emergency spill slots for MTE to fix register scavenger crash (#186505)
When `-sanitize=memtag-stack` is enabled and the compiler optimizes
contiguous ST2Gi instructions into an MTE loop (via
`TagStoreEdit::emitLoop`), it spawns two new post-RA virtual registers
simultaneously:
1. `BaseReg`
2. `SizeReg`
Under extremely high register pressure (such as in Swift async
continuation thunks, where almost all registers are kept live), the
Register Scavenger must fall back to using emergency spill slots to
assign physical registers to `BaseReg` and `SizeReg`.
Prior to this patch, `determineCalleeSaves` assumed that a maximum of
one register would ever need to be scavenged at a time. It either
allocated a single emergency spill slot, or bypassed the allocation
entirely if it found an unused Callee-Saved Register (`ExtraCSSpill`) to
use as a scratch register.
When the MTE loop asks for two registers simultaneously, the scavenger
crashes with:
`LLVM ERROR: Error while trying to spill LR from class GPR64: Cannot
scavenge register without an emergency spill slot!`
This patch fixes the crash by explicitly searching for mergeable STG
instructions during `determineCalleeSaves`. If found, it allocates a
second emergency spill slot. This guarantees the scavenger has enough
slots to successfully resolve both post-RA virtual registers, regardless
of whether the first scratch space came from an `ExtraCSSpill` or the
baseline emergency spill slot.
Added `memtag-emergency-spill-slot.mir` to test both the `ExtraCSSpill`
bypass path and the baseline zero-free-register path.
Assisted-by: claude
rdar://172501087
[Inliner] Fix return attribute propagation across multiple return sites (#186076)
Fixes #185159
This patch fixes a bug in `AddReturnAttributes()` where propagated
return attributes could incorrectly leak across multiple return sites in
the callee being inlined.
`AddReturnAttributes()` walks the callee's return instructions and tries
to backward-propagate return attributes from the callsite to the
returned call when the callee directly returns a call result. However,
the propagated attribute builders were updated in-place while iterating
over return sites. As a result, attributes refined for one return site
could be reused when
processing a later return site. This is incorrect because each return
site should be handled independently, starting from the original
callsite attributes.
This patch ensures that propagated return attributes are reinitialized
for each return site, so propagation is computed independently per
returned call.
[Clang][AArch64] Update comments in tests (nfc) (#186885)
[LV] Simplify and unify resume value handling for epilogue vec. (#185969)
This patch tries to drastically simplify resume value handling for the
scalar loop when vectorizing the epilogue.
It uses a simpler, uniform approach for updating all resume values in
the scalar loop:
1. Create ResumeForEpilogue recipes for all scalar resume phis in the
main loop (the epilogue plan will have exactly the same scalar resume
phis, in exactly the same order)
2. Update ::execute for ResumeForEpilogue to set the underlying value
when executing. This is not super clean, but allows easy lookup of the
generated IR value when we update the resume phis in the epilogue. Once
we connect the 2 plans together explicitly, this can be removed.
3. Use the list of ResumeForEpilogue VPInstructions from the main loop
to update the resume/bypass values from the epilogue.
This simplifies the code quite a bit, makes it more robust (should fix
https://github.com/llvm/llvm-project/issues/179407) and also fixes a
mis-compile in the existing tests (see change in
llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-epilogue-vec.ll,
where previously we would incorrectly resume using the start value when
the epilogue iteration check failed)
In some cases, we get simpler code, due to additional CSE, in some cases
the induction end value computations get moved from the epilogue
iteration check to the vector preheader. We could try to sink the
instructions as cleanup, but it is probably not worth the trouble.
Fixes https://github.com/llvm/llvm-project/issues/179407.
[lldb] Default LLDB_ENABLE_MTE to OFF when Sanitizers are enabled. (#186884)
The MTE launcher complicates injecting the sanitizer runtime libraries.
[clang][CMake] Fix ODR violation with LLVM_LINK_LLVM_DYLIB (#186689)
After 42b638c6b40d ("Propagate dependencies to OBJECT libraries in
add_llvm_library"), obj.clangSupport now inherits clangSupport's
LINK_LIBRARIES via target_link_libraries, which includes libLLVM.so when
LLVM_LINK_LLVM_DYLIB is enabled.
Previously the obj.clangSupport alias path was harmless because the
OBJECT library carried no link dependencies. Now, aliasing
clangSupport_tablegen to obj.clangSupport in DYLIB mode causes
clang-tblgen to transitively link libLLVM.so, while also having LLVM
symbols compiled in statically — triggering an ASan ODR violation on
globals like llvm::vfs::FileSystem::ID.
Fix by only propagating parts of the compile interface instead of the
full link interface - INTERFACE_INCLUDE_DIRECTORIES and
INTERFACE_SYSTEM_INCLUDE_DIRECTORIES. Also add a TODO to consider
replacing with target_link_libraries($<COMPILE_ONLY:tgt>) once minimum
CMake version is 3.27 or higher.
Revert "[flang][OpenMP] Use cuf.alloc for privatization of CUDA Fortr… (#186891)
…an device arrays (#185984)"
This reverts commit fb18d570b0466ca2a401aba11d6e58b206aebc1a.
This PR caused compilation failures with allocatable arrays, reverting
now for more investigation.
[SimplifyCFG] Allow phi folding for boolean logic over non-equality (#185124)
Phi folding is suppressed over binary operation inputs in order to avoid
interfering with switch formation.
After #183692, code (for example, Rust's ASCII character classification)
may get an `or` hoisted up into it, which suppresses
`foldTwoEntryPHINode`. This then produces branching code where
previously we generated straightline code.
To maintain switch formation while preventing any binops from breaking
phi folding, restrict the scenario in which Phi folding is suppressed to
binops of *equality* ops. This should mesh with switch statements, which
require an explicit list of values, while not breaking optimization over
> / < etc. which would never have been promoted to switches in the first
place.
Fixes: rust-lang/rust#153504
[Clang][Sema] Fix crash in CheckNonTypeTemplateParameterType with invalid type (#186200)
When a non-type template parameter has a type containing an undeduced
placeholder type that is invalid (e.g., a function returning a
function), `SubstAutoTypeSourceInfoDependent` can return null if the
type is invalid. `CheckNonTypeTemplateParameterType` was not handling
this case and would dereference the null pointer.
Fixes #177545
Revert "[AArch64] Allocate two emergency spill slots for MTE to fix register …" (#186900)
Reverts llvm/llvm-project#186505
Breaks buildbot
[clang] Fixed 'implicitly deleted' diagnostic for explicitly deleted candidate function (#186634)
When an explicit function template specialization is deleted, the
overload candidate `Fn` may be a non-canonical `FunctionDecl` where
`IsDeleted` is not set, even though the canonical decl has it set.
`isDeletedAsWritten()` reads `this` while `isDeleted()` reads
`getCanonicalDecl()`, causing the mismatch. Fix by using
`getCanonicalDecl()` consistently in the diagnostic.
Fixes #185693
AMDGPU: Annotate grid_dims ABI load with range metadata (#185610)
Also substitute with a constant for the reqd_work_group_size case.
[TableGen] Add new line to end of TreePatternNode::dump. (#186865)
[MLIR][Linalg] Add matchers to specialize more unary ops (#183259)
Add missing matchers to the `linalg.generic` specialization patterns to
handle the remaining named unary elementwise ops.
[bazel] Add libraries, binaries, and tests for ScalableStaticAnalysisFramework. (#186905)
[Clang][OpenMP] Move declare simd codegen into OMPIRBuilder (#186030)
Refactor declare simd codegen by moving logic that does not depend on
Clang declarations into OpenMPIRBuilder.
[AMDGPU] Include TRANS instructions in WMMA coexecution hazard checking (#186269)
[clang][ssaf] Workaround gcc-7 NRVO bug (#186897)
Addresses:
https://github.com/llvm/llvm-project/pull/186156#issuecomment-4070258854
Avoid assert in substqualifier (#182707)
What’s the problem:
Clang diagnoses the invalid static data member in a local class, but the
later out-of-line definition can still be instantiated and hit an
assertion.
What this PR does:
Mark the enclosing local class invalid when diagnosing the ill-formed
static
data member, so the invalid definition no longer survives to the
assertion path.
Fixes #176152
[bazel] Update lldb/BUILD.bazel for c5e5d5b (#186911)
[Reland][IR] Add initial support for the byte type (#186888)
This patch relands https://github.com/llvm/llvm-project/pull/178666. The
original version caused CI failures due to the missing target triple in
`llvm/test/CodeGen/X86/byte-constants.ll`. CI should be green now.
[MLIR][LLVM] add metadata attrs and `llvm.named_metadata` op (#186703)
This PR adds some LLVM metadata attributes and an `llvm.named_metadata`
container op (similar to `llvm.module_flags`) for those attributes.
Summary:
- Add MLIR attributes modeling LLVM IR metadata: `#llvm.md_string`,
`#llvm.md_const`, `#llvm.md_func`, and `#llvm.md_node`;
- Add `llvm.named_metadata` container op for module-level named metadata
nodes;
- Add MLIR-to-LLVM-IR translation for the new attributes and op;
- Add C API functions (`mlirLLVMMDStringAttrGet`,
`mlirLLVMMDNodeAttrGet`, etc.);
- Add Python bindings (`llvm.MDStringAttr`, `llvm.MDConstantAttr`,
`llvm.MDFuncAttr`, `llvm.MDNodeAttr`, `llvm.FunctionType`).
[TargetLowering][X86] Directly emit FSHR from expandDIVREMByConstant when Legal. (#186863)
[LLDB] Replace file+filecheck with test in test (NFC) (#186875)
The "file" utility may not be installed:
https://green.lab.llvm.org/job/llvm.org/view/LLDB/job/lldb-ubuntu-aarch64/job/main/109/consoleText
[libclc] fix compiler check with --target=spirv64 and -disable-llvm-passes (#185376)
Fix "unknown target triple" errors when LLVM_TARGETS_TO_BUILD is empty.
Adding -disable-llvm-passes reduces this to a very basic sanity check
of Clang frontend. This allows the test to pass even if SPIR-V backend
is not enabled, as the frontend can still generate IR for the target.
[Support] Add option to use Windows vendored ICU (#186371)
Windows 10 provided [ICU C
API](https://learn.microsoft.com/en-us/windows/win32/intl/international-components-for-unicode--icu-)
since 1703, this PR adds support for it.
[X86] Improve illegal return type handling in FastISel (#186723)
Previously, FastISel would fall back to DAG ISel for any illegal return
type. This change adds a more precise check to determine if the ABI
requires a type conversion that FastISel cannot handle.
For example, bfloat is returned as f16 in XMM0, but FastISel would
assign f32 register type and store it in FuncInfo.ValueMap, causing DAG
to incorrectly perform type conversion from f32 to bfloat later.
However, i1 is promoted to i8 and returned as i8 per the ABI, so FastISel
can safely lower it without switching to DAGISel. This change enables
FastISel to handle such cases properly.
---------
Co-authored-by: Yuanke Luo <ykluo at birentech.com>
[Clang] Fix a concept subsumption bug when template depths are adjusted (#186735)
We cannot reuse the cached normalization results if any template depth
adjustments (in subsumption checking) are involved.
Fixes https://github.com/llvm/llvm-project/issues/186624
[NFC] Remove unused InlineFunctionInfo::InlinedCalls (#186920)
Looks to be a relic of the legacy pass manager.
[llvm] [NFC] [test] Add ubsan feature for tests
Reviewers: boomanaiden154
Pull Request: https://github.com/llvm/llvm-project/pull/186918
[NFC] [MIR] mark inline-asm-extra-info as failing on UBSan
Reviewers: boomanaiden154
Reviewed By: boomanaiden154
Pull Request: https://github.com/llvm/llvm-project/pull/186919
[lldb] Add a decorator for arm64e (#186909)
[clang] use canonical arguments for checking function template constraints (#186889)
This is a partial revert of #161671, restoring the original behaviour
where the canonical template arguments are used for function template
constraint checking in diagnostics.
This reverts the fix from #183010, which attempted to fix #182344
but it causes regressions. These regressions now have test cases
included.
The attempt at #183010 is flawed because in the general case we can't
check satisfaction for constraints which have unsubstituted template
arguments, even if they don't affect the canonical type (i.e. they are
purely syntactical), because these types can still turn out to be invalid after
substitution.
This is a problem when directly evaluating a concept specialization, but
it's not a problem with other template specializations because the
as-written types are preserved, and will be later substituted, and any
failures here will cause the program to be ill-formed anyway.
The only downside of this revert is the loss of sugar in some
diagnostics.
This could be improved in the future by adding a new flag which would
allow ignoring this instantiation dependence in such cases where a
substitution failure will be handled later anyway.
This is not done in this patch because we want the safest thing
possible, to help backporting this patch to Clang 22.
This preserves the tests from #183010 and also adds the tests from
Since this fixes a regression since Clang 21 and will be backported to
Clang 22, there are no release notes.
[flang][PPC] Update vector tests with nuw nsw (NFC) (#186879)
[Clang][docs][test] Add N3517, N3652, and N3715 according to N3783 (#185566)
And test that these papers are not yet implemented.
[libclc][CMake] Use clang/llvm-ar on Windows (#186726)
When LLVM_TARGETS_TO_BUILD contains host target, runtime build sets
CMAKE_C_COMPILER to clang-cl on Windows.
Changes to fix build on Windows:
- libclc struggles to pass specific flags to clang-cl MSVC-like interface.
- compile flag handling will be consistent across all host systems.
- libclc build is cross-compilation for offloading targets.
[AMDGPU] Add s_sethalt to hasUnwantedEffectsWhenEXECEmpty (#186745)
We don't want to execute s_sethalt when all lanes want to skip it.
Co-authored-by: Junda Liu <Junda.Liu at amd.com>
[clang][OpenMP] Parse/Sema for OpenMP 6.0 declare_target 'local' clause (#186281)
Parse and perform semantic checks for declare_target 'local' clause.
When compiling for device offloading, generate a warning that 'local' is
not yet fully supported. On the host, 'local' is/will be a no-op, so no
warning is generated.
NOTE: The minimal CodeGen changes allow 'local' to flow through as
equivalent to the 'enter' clause after warning is generated.
Testing:
- Updated messages and ast tests for declare target/declare_target
- ninja check-all.
[llvm-ir2vec] Refactoring the ir2vec python bindings testing (#180664)
This is in order to make it more thorough and to cover the API and possible
exceptions better.
[mlir][Interfaces][NFC] Improve time complexity of RegionBranchOpInterface canonicalization patterns (#186114)
Optimize RemoveDuplicateSuccessorInputUses in
`ControlFlowInterfaces.cpp`:
- Replace O(n² * k) pairwise comparison of successor inputs with O(n * k
* max(log k, log n)) signature-based grouping using `std::map`, where
_n_ is the number of successor inputs and _k_ is the number of
predecessors per input.
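A generic sketch of the signature-based grouping idea (invented names and key type, not the MLIR code): build one key per successor input and group equal keys in a map instead of comparing all pairs.
```
#include <map>
#include <utility>
#include <vector>

using Signature = std::vector<int>; // stand-in for "value fed by each predecessor"

// Inputs with identical signatures receive the same forwarded value from every
// predecessor and can therefore be deduplicated.
std::vector<std::vector<unsigned>>
groupDuplicates(const std::vector<Signature> &Inputs) {
  std::map<Signature, std::vector<unsigned>> Groups;
  for (unsigned I = 0, E = Inputs.size(); I != E; ++I)
    Groups[Inputs[I]].push_back(I);

  std::vector<std::vector<unsigned>> Result;
  for (auto &KV : Groups)
    Result.push_back(std::move(KV.second));
  return Result;
}
```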
Assisted-by: Claude Code
---------
Co-authored-by: Yang Bai <yangb at nvidia.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply at anthropic.com>
[ValueTracking] frem in computeKnownFPClass can not return +/-Inf (#186748)
`frem` only produces finite numbers or NaN, never +/-Inf. Before the
patch `computeKnownFPClass` failed to clear the `fcInf` mask for
`Instruction::FRem`, causing potential missed optimizations.
Fix #186746.
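A quick sanity check of the claim using `fmod`, the libm analogue of `frem` (illustrative test, not part of the patch):
```
// fmod, like frem, yields a finite value or NaN, never +/-Inf: fmod(x, 0) and
// fmod(+/-Inf, y) are NaN; for finite nonzero y the result magnitude is below |y|,
// and for y = +/-Inf the result is x itself.
#include <cassert>
#include <cmath>
#include <limits>

int main() {
  const double Inf = std::numeric_limits<double>::infinity();
  for (double A : {0.0, 1.5, -3.0, Inf, -Inf})
    for (double B : {0.0, 2.0, -0.5, Inf}) {
      double R = std::fmod(A, B);
      assert(!std::isinf(R)); // never +/-Inf
    }
}
```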
[RISCV] Fold (WADDAU -C, -1, rs1, 0) -> (WSUBU rs1, C) where C > 0 (#186638)
AArch64: Look through copies in CCMP converter.
The usual IR pattern for llvm.ptrauth.auth involves creating an
inttoptr for the auth operation to convert the result into a pointer.
CodeGenPrepare will copy these inttoptr operations into the user basic
blocks as this generally results in more efficient code. However, this is
not the case for the CCMP converter as it will encounter a COPY created
by the inttoptr and inhibit the optimization. Fix it by looking through
copies in the CCMP converter pass.
Assisted-by: gemini (wrote test)
Reviewers: davemgreen, fmayer, atrosinenko
Reviewed By: fmayer
Pull Request: https://github.com/llvm/llvm-project/pull/186842
[orc-rt] Add ControllerInterface symbol table. (#186947)
ControllerInterface holds the set of named symbols that the ORC runtime
exposes to the controller for bootstrapping the ExecutionSession.
Insertion is checked: duplicate symbols are rejected with an error.
Session is updated to own a ControllerInterface instance, pre-populated
with an orc_rt_SessionInstance entry pointing to the Session object.
[LV] Optimize x && (x && y) -> x && y (#185806)
This patch simplifies `x && (x && y)` and `x && (y && x)` to `x && y`, removing the extra logical-and.
This helps to simplify mask calculation in the FindLast reduction and
exposes more opportunities to replace to EVL.
PR link: https://github.com/llvm/llvm-project/pull/185806
[clang][bytecode] Clean up CondScope after while loop (#186816)
Similar to what we already do for for loops.
[RISCV][NFC] Remove duplicate setTargetDAGCombine registrations (#186928)
ISD::SRA and ISD::MUL are already registered unconditionally in the
constructor, so remove the redundant registrations from the
hasVInstructions() block. Fold the standalone SRA call into the existing
brace-initializer list.
---------
Co-authored-by: Claude Opus 4.6 <noreply at anthropic.com>
[CI][libclc] Enable libclc in premerge CI with single target (#186104)
Enable libclc build and test in the Linux premerge CI when libclc or .ci
files are modified.
To minimize build time, only build the amdgcn-amd-amdhsa-llvm target.
[orc-rt] Hold `const void*` rather than `void*` in ControllerInterface. (#186954)
We only care about addresses in ControllerInterface, not the underlying
memory.
[C++20] [Modules] Diagnose for duplicated definition in the same module (#186959)
Close https://github.com/llvm/llvm-project/issues/186603
[DA] Refactor the signature of the Exact SIV test (NFCI) (#186386)
Change the signature of `exactSIVtest` to directly pass addrecs instead
of passing their operands separately. I *think* this change is not
mandatory, but it will simplify the code, especially because we will be
checking the presence of nsw flags on the addrecs.
[lldb] Use clang_cl_host to build `vbases.test` (#186857)
[VPlan] Create header phis once regions have been created (NFC).
Since 1b29ac1d1857ea42273fc7862ea019a74a55195d, regions are constructed
as part of the scalar transforms; moving header phi creation after
region creation slightly simplifies the code.
Add zlib to Windows release build (#186630)
This PR adds zlib to the Windows release build script to enable zlib
support in LLVM.
Part 1 of https://github.com/llvm/llvm-project/issues/184177.
[NFC][SPIRV] New test for untested SPIRVInstructionSelector case (#186069)
[This
line](https://github.com/ambergorzynski/llvm-project/blob/main/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp#L2454)
is untested by the existing test suite (checked using coverage and
inserting an `abort` at that line).
We propose a new test for the untested logical `eq` case, similarly to
the `neq` case added
[here](https://github.com/llvm/llvm-project/commit/e45c8b6555c866cd0412b42fce0439e927ca3ba2).
[orc-rt] Fix stale file comment. NFC.
Reland "[lldb] Initial plugin and test for SymbolLocatorSymStore" (#185658)
Minimal infrastructure for the SymbolLocator plugin that fetches debug
info from Microsoft SymStore repositories. This can work cross-platform
and for various debug info formats in principle, but the current plan is
focussed on PE/COFF on Windows with debug info in PDB files. Once we
have a stable first version, we'd like to add features like download,
environment variables, caching and progress feedback for users.
SymbolVendorPECOFF was tailored towards DWARF debug info so far. I added
code to load the PDB path from the executable (it only checked
gnu_debuglink so far) and not bail out if DWARF sections are missing, so
that in the PDB case we still call AddSymbolFileRepresentation() at the
very end of CreateInstance().
The API test in this patch mocks the directory layout from SymStore, so
it doesn't depend on SymStore.exe from the Windows SDK. It runs on all
platforms that link debug info in a PDB file, which is still just
Windows, but it could be cross-platform in principle.
-----
Relands with minor fixes: API tests create a mocked SymStore in the test's
build directory. One log statement was moved. One more object
access goes through the module in SymbolFile.
libclc: Use struct for ep pair (#186973)
This will enable use with vector types
[llvm-link] Add more detail to `--internalize` description (#170397)
While the functionality of this flag is obvious in the implementation,
tool users may not know what it does with the short description
provided. Notably, it is not obvious from the short description that:
* Functions provided will be converted to internal linkage (and thus
discarded if unused) even if unreferenced.
* Functions in the first file will not be internalized, even if
referenced by a later one.
The Rust for Linux project has [found use for this
flag](https://lore.kernel.org/all/20251202-inline-helpers-v1-0-879dae33a66a@google.com/)
to support inlining `static inline` functions in C into code compiled by
Rust when `rustc` and `clang` share an LLVM.
[clang][ssaf] Add --ssaf-list-{extractor,format} flags (#185428)
These flags only work with the `clang` driver; the `cc1` driver ignores them.
It could probably be implemented differently, but it's already better
than having nothing.
[DA] Add test for the Weak Crossing SIV test misses dependency (NFC) (#186355)
Add a test case where the value of `Delta` in the Weak Crossing SIV test
becomes the signed minimum, causing the test to miss the dependency.
libclc: Use fshr builtin in sincos helpers (#186427)
[OpenMP][OMPT] Add missing `error` entry to device tracing record union (#185683)
While `omp-tools.h` already includes the `ompt_record_error_t` struct,
the corresponding union entry was missing from `ompt_record_ompt_t`.
This commit adds the missing entry.
Note that this does not enable any functionality for device tracing
records.
This only aligns the struct with OpenMP v5.1 and newer. OpenMP v5.0 did
not contain the `error` directive.
CC: @jprotze
Signed-off-by: Jan André Reuter <j.reuter at fz-juelich.de>
[libc][docs] Update clang-tidy checks page (#185923)
Document layered .clang-tidy config, update llvmlibc-* check names, and
drop stale TODO
[VPlan] Factor collectGroupedReplicateMemOps (NFC) (#186820)
Factor out a collectGroupedReplicateMemOps from
collectComplementaryPredicatedMemOps, so it can be re-used in other
places.
libclc: Move edge case handling of trig functions (#186429)
The explicit handling of nan is unnecessary. Clamp infinities
to nan at the input. This allows the implementation code that follows
to be optimized with the knowledge that it
does not need to handle infinities.
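A minimal sketch of the "clamp at the input" step, assuming a scalar float path (not the libclc source):
```cpp
#include <cassert>
#include <cmath>

// Map +/-inf to NaN before the trig implementation proper runs, so the rest
// of the code may assume the input is finite or NaN.
static float sanitize_trig_input(float x) {
  return std::isinf(x) ? std::nanf("") : x;
}

int main() {
  assert(std::isnan(sanitize_trig_input(HUGE_VALF)));
  assert(sanitize_trig_input(1.5f) == 1.5f);
  return 0;
}
```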
[AMDGPU] Use AMDGPULaneMaskUtils in SILowerI1Copies (#186170)
Use lane mask utils in lower i1 copies
No Functionality Change
libclc: Update pow functions (#186890)
The 4 flavors of pow were originally ported from rocm
device libs between c45ec604f593fcb03d770f4398142d2446017f68,
cc5c65b2c25e0a82fbad95f0ce3bb5262e29eeee, and
fe8e00bc3c65115b2e3d2a43cf3d0d756a934a52. Update to a newer
version. Additionally expose fast variants for use by the
libcall optimizer (e.g., __pow_fast) for float types.
libclc: Use frexp and ldexp in trig reduction instead of bit hacking (#186982)
libclc: Use small trig reduction for nan (#186983)
Nan should work on either path, but the small reduction
path is smaller. There are also possible codegen benefits to
knowing the large reduction will not need to handle nans.
libclc: Improve large float trig reduction (#186984)
[LV] Add select instruction to VPReplicateRecipe::computeCost (#186825)
I've added the Instruction::Select opcode to the existing list of
opcodes that call getCostForRecipeWithOpcode. There are currently 5
tests that ask for the cost of the select:
Transforms/LoopVectorize/AArch64/widen-gep-all-indices-invariant.ll
Transforms/LoopVectorize/first-order-recurrence-with-uniform-ops.ll
Transforms/LoopVectorize/narrow-to-single-scalar.ll
Transforms/LoopVectorize/replicate_fneg.ll
Transforms/LoopVectorize/single-scalar-cast-minbw.ll
The fact they all pass with this change is hopefully proof enough that
the costs are correct.
[Clang] Make members with exclude_from_explicit_instantiation never be exported or imported (#185140)
This patch extends the `exclude_from_explicit_instantiation` attribute
to work in non-template contexts, despite its name.
The attribute now has simple semantics: "Excluded members will never
be exported or imported."
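A hedged sketch of what that reads like for a plain (non-template) class; `Widget` and its members are illustrative, and the dllexport keyword assumes a Windows-style DLL interface.
```cpp
// Illustrative only: per the patch, the marked member is never part of the
// DLL interface, even though the enclosing class is dllexport'ed.
struct __declspec(dllexport) Widget {
  void exported();      // exported/imported as before

  [[clang::exclude_from_explicit_instantiation]]
  void local_only();    // never exported or imported
};
```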
[clang][win] Define vector deleting dtor body for declared-only dtor if needed (#185653)
Currently vector deleting destructor body emission is triggered if new[]
is called for the type and if the destructor or the whole class is
marked with the dllexport attribute. The problem is that it is not emitted
if new[] is called, the destructor is not exported, and it is only declared
in the TU and defined in another. That leaves the vector deleting destructor
body missing, which leads to runtime failures in delete[].
This change forces vector deleting destructor body emission if new[] is
called even if the destructor is only declared but not defined. Doing
that replicates MSVC behavior and fixes runtime issues. Since vector
deleting destructors have weak linkage, it should be safe to do so.
Fixes https://github.com/llvm/llvm-project/issues/183255
AI usage: Claude was used to create LIT test cases which then were
reviewed and reworked by me.
[MIR] Fix printing INLINEASM dialects. (#186797)
[libclc] Fix check-libclc dependency on llvm-dis (#186978)
Add llvm-dis to libclc runtime dependencies.
[lldb][PlatformDarwin] Reword warning when locating scripting resources from dSYM (#185666)
This patch makes the warning message more concise (in my opinion). We
would duplicate the file path multiple times in the message. I'm
planning on factoring this logic into a standalone function, and having
it rely on fewer parameters helps with that.
Before:
```
warning: the symbol file '/path/to/.dSYM/Contents/Resources/DWARF/import' contains a debug script.
However, its name '/path/to/.dSYM/Contents/Resources/DWARF/../Python/import.py' conflicts with a keyword
and as such cannot be loaded. LLDB will load '/path/to/.dSYM/Contents/Resources/DWARF/../Python/_import.py' instead.
Consider removing the file with the malformed name to eliminate this warning.
```
After:
```
warning: debug script '/path/to/.dSYM/Contents/Resources/DWARF/../Python/import.py' cannot be loaded
because 'import.py' conflicts with the keyword 'import'. Ignoring 'import.py' and loading '_import.py' instead.
```
Before:
```
warning: the symbol file '/path/to/.dSYM/Contents/Resources/DWARF/import' contains a debug script.
However, its name conflicts with a keyword and as such cannot be loaded.
If you intend to have this script loaded, please rename '/path/to/.dSYM/Contents/Resources/DWARF/../Python/import.py'
to '/path/to/.dSYM/Contents/Resources/DWARF/../Python/_import.py' and retry.
```
After:
```
warning: debug script '/path/to/.dSYM/Contents/Resources/DWARF/../Python/import.py' cannot be loaded
because 'import.py' conflicts with the keyword 'import'. If you intend to have this script loaded, please rename
it to '_import.py' and retry.
```
[AArch64] Remove promotion cost for fixed-length bfloat arith with +sve-b16b16 (#186378)
These operations can be trivially promoted to SVE (with the addition of
a ptrue).
[flang][NFC] Converted five tests from old lowering to new lowering (part 33) (#186943)
Tests converted from test/Lower/Intrinsics: ieee_operator_eq.f90,
ieee_signbit.f90, index.f90, iparity.f90, is_contiguous.f90
[mlir][bytecode] Fix crashes when reading bytecode with unsupported types (#186354)
When using test-kind=2 in the bytecode roundtrip test, integer types
(i32) are replaced by a custom type (TestI32Type) via a type callback.
This exposed two crash scenarios:
1. Reading IntegerAttr with an unsupported type: `getIntegerBitWidth`
returns 0 for unsupported types and emits an error, but
`readAPIntWithKnownWidth` would proceed to call
`reader.readAPIntWithKnownWidth(0)`, creating a zero-width APInt with a
potentially non-zero value. Fix: early-return failure when `bitWidth ==
0`.
2. Reading VectorType with an unsupported element type:
`VectorType::get` asserts that the element type implements
VectorElementTypeInterface. When the element type is replaced by a
custom type that doesn't implement this interface, the program crashes.
Fix: use `VectorType::getChecked` with a diagnostic emitter lambda
instead of `get<VectorType>` in the bytecode builder.
Fixes #128312
[orc-rt] Move SPS controller interface funcs into their own headers. (#186991)
This provides clean separation between the ORC runtime code that
implements runtime functionality and the wrapper functions that permit
this code to be called from the controller via the
ExecutorProcessControl API.
Separating the controller interface from the implementation functions
should allow clients to introduce alternative serialization schemes if
they want (e.g. JSON).
In particular, this commit adds a new orc-rt/include/orc-rt/sps-ci
directory and moves SimpleNativeMemoryMap SPS controller interface into
a new header in that directory. This commit also splits the
implementation and testing of the SPS controller interface for
SimpleNativeMemoryMap into separate files.
[orc-rt] Update SPS wrapper names to reflect new namespace. NFCI. (#186994)
[mlir][arith][NFC] Use type parser instead of hard-coding type keywords (#186753)
Parse type literals instead of hard-coding them in a switch-case
statement.
Assisted by: claude-opus-4.6
[LLVM] Make -use-constant-fp-for-scalable-splat the default. (#186422)
Includes a trivial fix to ReassociatePass::OptimizeAdd to force the type of
new ConstantFP nodes.
[MIR] Support symbolic INLINEASM extra-info flags (#186818)
[mlir][gpu] Add SymbolUserOpInterface to launch_func op (#173277)
The gpu.launch_func is an operation that performs symbol references.
Currently, its symbol validation logic is implemented within
GPUDialect::verifyOperationAttribute. To improve the clarity and
structure of the validation logic, this PR makes LaunchFuncOp implement
the SymbolUserOpInterface. In addition, implementing this interface
allows the operation to benefit from various symbol-usage analysis
passes.
[WebAssembly] combine `bitmask` with `setcc <X>, 0, setlt` (#179065)
The rust `simd_bitmask` intrinsic is UB when the lanes of its input are
not either `0` or `!0`, presumably so that the implementation can be
more efficient because it could look at any bit. To get the "mask of
MSB" behavior of webassembly's `bitmask`, we would like to simply first
compare with a zero vector.
```llvm
define i32 @example(<16 x i8> noundef %v) {
entry:
  %0 = icmp slt <16 x i8> %v, zeroinitializer
  %1 = bitcast <16 x i1> %0 to i16
  %2 = zext i16 %1 to i32
  ret i32 %2
}
```
On x86_64, this additional comparison optimizes away, but for wasm it
does not.
https://godbolt.org/z/T5sPejocs
This PR adds a new combine, so that instead of emitting
```asm
example:
local.get 0
v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
i8x16.lt_s
i8x16.bitmask
end_function
```
we just emit
```asm
example:
local.get 0
i8x16.bitmask
end_function
```
[mlir] Fix crash in diagnostic verifier for unmatched @unknown expectations (#186148)
When an expected-* directive uses the @unknown location specifier, the
associated ExpectedDiag record has an invalid (null) SMLoc as its
fileLoc. If the expected diagnostic is never produced, emitError() is
called to report the unmatched expectation, but it unconditionally
constructs an SMRange from fileLoc, triggering a null-pointer
dereference (UBSan) and an assertion failure in SMRange's constructor
which requires both endpoints to have equal validity.
Fix by guarding the SMRange construction with a fileLoc.isValid() check.
When fileLoc is invalid, call PrintMessage without a source range.
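A minimal sketch of the guard's shape (a stand-in function, not the verifier code), using the usual LLVM `SourceMgr` API:
```cpp
#include "llvm/ADT/Twine.h"
#include "llvm/Support/SourceMgr.h"

// Report an unmatched expectation; only attach a range when the recorded
// location is valid (an @unknown expectation has a null SMLoc).
static void reportUnmatched(llvm::SourceMgr &srcMgr, llvm::SMLoc fileLoc,
                            const llvm::Twine &msg) {
  if (fileLoc.isValid())
    srcMgr.PrintMessage(fileLoc, llvm::SourceMgr::DK_Error, msg,
                        llvm::SMRange(fileLoc, fileLoc));
  else
    srcMgr.PrintMessage(fileLoc, llvm::SourceMgr::DK_Error, msg);
}
```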
Fixes #163343
Assisted-by: Claude Code
[NFC][NVPTX] Fix tcgen05.mma PTX instruction encoding (#186602)
.ashift should be before .collector::a::* according to PTX ISA.
ptxas accepts both orderings, but the spec-correct order is used now.
[mlir][bufferization] Fix crash with copy-before-write + bufferize-function-boundaries (#186446)
When `copy-before-write=1` is combined with
`bufferize-function-boundaries=1`, `bufferizeOp` creates a plain
`AnalysisState` (not `OneShotAnalysisState`) and passes it to
`insertTensorCopies`. Walking `CallOp`s during conflict resolution
called `getCalledFunction(callOp, state)`, which unconditionally cast
the `AnalysisState` to `OneShotAnalysisState` via `static_cast`, causing
UB and a stack overflow crash.
Fix by guarding the cast with `isa<OneShotAnalysisState>()` so that when
the state is a plain `AnalysisState`, the function falls through to
building a fresh `SymbolTableCollection` — the same safe fallback
already present.
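A hypothetical, stripped-down sketch of the fix's shape (the class names below are stand-ins, not the real bufferization types): downcast only after a `dyn_cast` check, and otherwise take the safe fallback path.
```cpp
#include "llvm/Support/Casting.h"

// Stand-in class hierarchy with LLVM-style RTTI.
struct StateBase {
  enum Kind { Plain, OneShot } kind;
  explicit StateBase(Kind k) : kind(k) {}
};
struct OneShotState : StateBase {
  OneShotState() : StateBase(OneShot) {}
  static bool classof(const StateBase *s) { return s->kind == OneShot; }
};

int resolveCallee(const StateBase &state) {
  if (const auto *oneShot = llvm::dyn_cast<OneShotState>(&state)) {
    (void)oneShot;
    return 0; // fast path: reuse caches held by the one-shot analysis state
  }
  return 1;   // plain state: build a fresh symbol table instead of casting
}
```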
Fixes https://github.com/llvm/llvm-project/issues/163052
Assisted-by: Claude Code
[mlir][vector] Add missing tests (nfc) (#186990)
Currently, `ConvertVectorToLLVM` rejects strided memrefs when lowering
`vector.gather` and `vector.scatter`. This PR adds tests to document
that behavior.
Supporting strided memrefs in the lowering is left as future work.
However, it is still unclear whether gather/scatter on strided memrefs
should be supported at all (see the Discourse discussion [1]).
This PR also adds tests for `vector.load` and `vector.store` in
`invalid.mlir` to document that these ops do not support strided
memrefs.
[1] https://discourse.llvm.org/t/rfc-semantics-of-vector-gather-indices-with-strided-memrefs
[Bazel] Port 55b271d (#187007)
[lldb][PlatformDarwin][NFC] Move logic to emit warning on invalid/conflicting Python script names into helper function (#185669)
Depends on:
* https://github.com/llvm/llvm-project/pull/185666
* https://github.com/llvm/llvm-project/pull/185627
I'm planning on re-using this logic for a different API. Hence move it
into a common helper.
[LAA] Add tests with missed aliasing invariant load/store. (NFC)
Add a set of tests showing incorrect LAA results based on
https://github.com/llvm/llvm-project/issues/186922.
[Bazel] Port 9e43b35 (#187011)
[X86][APX] Enable NDD tunings (#186049)
For the latest Intel processors with APX, the mem form of all NDD instructions
(except for RIP-based addressing) and the imm form of NDD add/sub (including
inc/dec) instructions need to be turned off by default for optimal
hardware performance.
Two new tunings, enable-ndd-mem and enable-ndd-imm, were added and are
disabled by default. New ISA attributes are adopted for the different
alternatives of NDD-related patterns to control the generation of the
mem/imm forms.
[X86] Improve handling of i512 SHL(-1,Amt) + SRL(-1,Amt) "mask shifts" (#186806)
An extension of the existing one-bit shift patterns - perform an initial
select to handle 'allones/allzeros' elements and then insert the element
that has a partial mask on top of it.
Often turns up in bit manipulation patterns
[AMDGPU] Remove AMDGPUCallLowering dependency on AMDGPUTargetLowering. NFC. (#187008)
[lldb][windows] fix TestReplaceDLL.py reruns (#187002)
[mlir][tosa] Remove 'Pure' trait from operations that are not speculatable (#185700)
This commit removes the 'Pure' trait from a number of TOSA operations.
Instead of marking most ops as pure by default, the trait is now opt-in
for operations that are provably side-effect free and speculatable.
Several operations were previously marked as pure unintentionally.
The following operations have had 'Pure' removed (reason in brackets):
- ARGMAX (out-of-range index)
- AVG_POOL2D (accumulator overflow/underflow)
- AVG_POOL2D_ADAPTIVE (same as above)
- CONV2D (accumulator overflow/underflow)
- CONV2D_BLOCK_SCALED (accumulator overflow/underflow)
- CONV3D (accumulator overflow/underflow)
- DEPTHWISE_CONV2D (accumulator overflow/underflow)
- MATMUL (accumulator overflow/underflow)
- MATMUL_T_BLOCK_SCALED (accumulator overflow/underflow)
- TRANSPOSE_CONV2D (accumulator overflow/underflow)
- ADD (overflow)
- SUB (underflow)
- MUL (invalid shift, overflow)
- ARITHMETIC_RIGHT_SHIFT (invalid shift value)
- LOGICAL_LEFT_SHIFT (invalid shift value)
- LOGICAL_RIGHT_SHIFT (invalid shift value)
- INTDIV (division by zero)
- POW (negative exponent restrictions)
- TABLE (invalid slope computation)
- ABS (underflow)
- NEGATE (overflow/underflow)
- REDUCE_PRODUCT (overflow)
- REDUCE_SUM (overflow)
- GATHER (out-of-range indices)
- SCATTER (out-of-range or duplicate indices)
- RESCALE (overflow/underflow)
Many of these operations can exhibit undefined behaviour when a
`REQUIRE` condition in the TOSA specification pseudocode fails. Whether
such failures result in a runtime error is implementation-defined. As a
result, speculating or reordering these operations can change program
behaviour.
For this reason, the `AlwaysSpeculatable` property implied by `Pure` is
not valid for these ops. The `NoMemoryEffect` trait is retained, as
these operations do not have direct memory side effects.
[LifetimeSafety] Exclude basic_string::insert from capturing methods (#186989)
Fixes https://github.com/llvm/llvm-project/issues/186817
[SPIR-V] Handle undef aggregate initializers for global variables (#186785)
Expand undef aggregate global initializers into per-element spv_undef
intrinsics
[Flang][OpenMP] Add semantic support for OpenMP Loop Interchange and permutation clause in Flang (#183435)
This patch adds semantics for the `omp interchange` directive in flang
and the permutation clause, as specified in OpenMP 6.0.
Relevant tests have been added in every step.
[SPIR-V] Handle spirv.MemoryModel metadata (#186138)
[NFC][analyzer] Improve computeObjectUnderConstruction (#186186)
Previously the method `ExprEngine::computeObjectUnderConstruction` took
a `NodeBuilderContext` parameter which was only used to call its
`blockCount()` method; this commit replaces this with directly taking
`NumVisitedCaller` (= number of times the caller was visited, the
`blockCount`) as an unsigned value.
In `CallEvent::getReturnValueUnderConstruction` this method is invoked
with `getNumVisitedCurrent()`, the visitation count of the _current_
`LocationContext` and `Block`, instead of calling `getNumVisited()` on
the `LocationContext` and `Block` corresponding to the `CallEvent`
instance (available through its data members). This is logically
incorrect, but (at least within the lit testsuite) there is no situation
where it leads to actually incorrect behavior. This is currently marked
with a FIXME comment; it will be fixed in a follow-up commit.
[libc++] Build the library with C++26 (#181021)
All supported compilers support C++26. This allows simplifying some of
the upcoming <text_encoding> implementation.
[sancov] add -fsanitize-coverage=trace-pc-entry-exit (#185972)
Add a SanCov flag for calling dedicated hook functions on function entry
and exit. This flag can be used either in combination with
-fsanitize-coverage=trace-pc (in which case this patch changes which
hook is called for the entry BB, and generates an additional hook call
before return), or it can be used by itself (in which case only the
dedicated entry/exit callbacks are invoked).
This can be used to track the call stack throughout a sancov trace.
cc @vitalybuka @dvyukov
[lldb] Add pointer arithmetics for addition and subtraction to DIL (#184652)
[MLIR][Interfaces] Make `getMutableSuccessorOperands` overridable on `ReturnLike` ops (#186832)
Move the `getMutableSuccessorOperands` implementation from `ReturnLike`
trait to the `RegionBranchTerminatorOpInterface` to allow overriding of
the implementation. This allows the trait to be used on operations that do
not return all of their operands. This can be used, for example, to
implement a custom `ReturnLike` terminator that consumes non-returned
operands, in combination with `func.func`.
The `RegionBranchTerminatorOpInterface` now provides a default
implementation for the `getMutableSuccessorOperands` method that returns
all of the operands.
[Bazel] Port 429e9717 (#187019)
[ValueTracking] fadd never produces subnormal with no underflow (#186985)
In the cases where the fadd does not underflow, the result is both
non-zero and non-subnormal.
alive2 results for the added positive testcases:
testcase 1: https://alive2.llvm.org/ce/z/Mxjott
testcase 2: https://alive2.llvm.org/ce/z/Q-_A-v
testcase 3: https://alive2.llvm.org/ce/z/Y3XpSk
testcase 4: https://alive2.llvm.org/ce/z/34n8MZ
Fix #186975.
[NFC][analyzer] Update some incorrect doc-comments (#186852)
These were incorrectly copy-pasted more than ten years ago.
[AMDGPU] Remove R600TargetTransformInfo dependency on AMDGPUTargetLowering. NFC. (#187014)
[CIR][AArch64] Upstream NEON shift left builtins (#186406)
This PR adds CIR generation for the following AArch64 NEON builtins:
__builtin_neon_vshld_n_s64 and __builtin_neon_vshld_n_u64 (constant
shifts)
For these, the constant value is extracted and used directly for the shift left.
__builtin_neon_vshld_s64 and __builtin_neon_vshld_u64 (variable shifts)
There is an existing function that handles SISD (Single Instruction Single
Data) builtins; this is reused to create the right CIR instructions:
__builtin_neon_vshld_s64 -- call i64 @llvm.aarch64.neon.sshl.i64(i64
[[A]], i64 [[B]])
__builtin_neon_vshld_u64 -- call i64 @llvm.aarch64.neon.ushl.i64(i64
[[A]], i64 [[B]])
Added test cases in intrinsics.c based on the test cases present in
https://github.com/llvm/llvm-project/blob/main/clang/test/CodeGen/AArch64/neon-shifts.c
Before these changes the builtins produced a "not implemented" error; with the
changes the error is gone and the code succeeds.
Ran the test cases using
```
bin/llvm-lit -v \
/Users/albertbolt/projects/llvm-project/clang/test/CodeGen/AArch64/neon/intrinsics.c
```
---------
Co-authored-by: Andrzej Warzyński <andrzej.warzynski at gmail.com>
[AMDGPU][GlobalISel] Add RegBankLegalize rules for atomic fmin/fmax (#182824)
Add register bank legalization rules for
G_ATOMICRMW_FMIN/G_ATOMICRMW_FMAX (flat, global, LDS) and
G_AMDGPU_BUFFER_ATOMIC_FMIN/G_AMDGPU_BUFFER_ATOMIC_FMAX (S32 and S64)
under -new-reg-bank-select. Update existing GlobalISel tests to use the
new pass and add a new MIR test for register bank assignment.
Reland "[DomTree] Assert non-null block for pre-dom tree" (#187005)
Reland #186790 with fix for SCEV. A loop can have more than one latch,
in which case getLoopLatch returns null.
[libc++][NFC] Rename the template parameter of __make_transparent (#186435)
Renaming from _Tp to _ArgumentType makes it clearer that we're passing
the argument type intended for the comparator, which allows checking
whether *that specific use* of the comparator would be transparent.
Fixes #186396
[NFC][AArch64] ConditionOptimizer: refine cmp/cond instruction update code (#186724)
Split modifyCmp() into updateCmpInstr() and updateCondInstr() to
separate the concerns of updating the compare and its controlling
conditional. Rename parseCond() to parseCondCode() and return the
CondCode directly rather than via an out-parameter.
Also add applyCmpAdjustment() to pair the two update calls at a higher
level of abstraction, reducing call site verbosity.
[SPIR-V] Add support for arbitrary precision integer constants in instruction printer (#185306)
This PR improves the SPIR-V instruction printer output for integer
constants using `SPV_ALTERA_arbitrary_precision_integers` extension.
Previously, when `OpConstantI` was encoded with multiple 32-bit words
(for integer widths > 64), the inst printer printed the raw per-word
immediates. This was hard to read and did not reflect the actual value
at the declared integer bitwidth.
Now, with the change in this patch, the instruction printer reconstructs
the multi-word literal into a single `APInt`, truncates it to the
bitwidth declared by the corresponding `OpTypeInt`, and prints the
resulting value as one readable integer (including correct negative
values where applicable).
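A minimal sketch of the reassembly step under the usual SPIR-V literal encoding (low-order 32-bit word first); illustrative code, not the printer's implementation:
```cpp
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include <cstddef>
#include <cstdint>
#include <vector>

// Pack the per-word immediates into one APInt and truncate it to the width
// declared by OpTypeInt, so it can be printed as a single (signed) value.
llvm::APInt reassembleLiteral(llvm::ArrayRef<uint32_t> words,
                              unsigned declaredBits) {
  std::vector<uint64_t> packed((words.size() + 1) / 2, 0);
  for (std::size_t i = 0, e = words.size(); i != e; ++i)
    packed[i / 2] |= uint64_t(words[i]) << (32 * (i % 2));
  llvm::APInt value(words.size() * 32, packed);
  return value.trunc(declaredBits);
}
```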
---------
Co-authored-by: yixing.zhang <yixingzh at smtp.igk.intel.com>
[lldb] Fix permission issue in API test on lldb-x86_64-win (#187021)
Deleting the executable at the end of this API test-case fails with a
permission error, likely because lldb still holds a reference to the
EXE. Exit explicitly to avoid that.
[flang][OpenMP] Use the LoopSequence-based checks (#185300)
Turn on the new loop-construct checks. Remove some checks from
resolve-directives.cpp which are now done in the semantic analysis.
Update tests.
Issue: https://github.com/llvm/llvm-project/issues/185287
[RISCV] Select (sext_inreg (sra X, C), i8/i16) as slli+srai. (#186956)
Without Zcb, the slli+srai may be more compressible
[Instrumentation][nsan] Intrinsic tests + bugfixes (#186803)
Add comprehensive lit-test coverage to the NumericalStabilitySanitizer's
intrinsic and libfunc handling.
Also, several minor bugfixes:
- Fix typo in llvm.nearbyint.f80's name.
- Remove lround and llround intrinsics, since they return ints (and are
not instrumented by nsan as a result).
- Fix fmuladd.f64 intrinsic to map to fmuladd instead of fma.
- Add missing fmuladd.f80 variant.
[libc] Add Jeff Bailey to Maintainers.rst (#186662)
Add Jeff Bailey as a maintainer for General Maintenance and
Documentation. Jeff has been contributing to LLVM-libc since January
2022. This addition was discussed at the most recent LLVM-libc meeting
with no objections raised.
[analyzer] Fix [[clang::suppress]] for nested templates (#183727)
For nested templates, we might need to walk the member template chain to
get to the primary template. This can be an arbitrarily long chain of
partial specializations.
Previously, we hit the assertion `This class template must have a redecl
that is a definition` because we only walked the redecls of the given
template. However, that redecl-chain might not have a redecl that is the
definition.
Sometimes (in case of member templates) you also need to walk the member
specialization and continue walking the redecls of that one.
(Don't ask me more, because I have no clue how these nested templates
and their redecls are implemented. It just works™️)
Fixes #182659
Assisted-by: claude
[lldb][PlatformDarwin][test] Move Platform test utilities into common header for re-use (#187036)
In https://github.com/llvm/llvm-project/pull/187031 I'm planning on
re-using the `MockScriptInterpreterPython` and `CreateFile` API from a
different unit-test in `Platform/`.
AMDGPU: llvm.amdgcn.ds.add/sub.gs.reg.rtn are sources of divergence (#186883)
Per the ISA documentation, these are atomic operations on dedicated GS
streamout registers. As GDS instructions, the first active lane (based
on EXEC) is used and others are ignored.
[AMDGPU][GlobalISel][NFC] Group RegBankLegalize intrinsic rules (#186912)
[CodGen] Port UnpackMachineBundles to new pass manager (#184918)
Add Zstandard to Windows release build (#186772)
This PR adds Zstandard to the Windows release build script to enable
Zstandard support in LLVM.
Part 2 of https://github.com/llvm/llvm-project/issues/184177.
This PR supersedes #186631, which got closed because of a force push
mishap.
[libc] Avoid host header collisions in full builds (-nostdinc) (#187025)
When building the full library with -nostdinc, directly including
<stdint.h> may pull in host or compiler-provided headers that collide
with LLVM-libc's local macro definitions. Switch to using our internal
stdint-macros.h when LIBC_FULL_BUILD is enabled.
Additionally, declare aligned_alloc with noexcept in C++ to match common
C library declarations and avoid fatal type specification mismatches
during sysroot builds.
[X86] Fold compress(splat(x),splat(x),mask) -> splat(x) (#187042)
Noticed while working on i512 shift expansion - if we end up with
repeated splat args, the compress node is unnecessary as we're just
shuffling the same element values
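A scalar model of the compress semantics (not the DAG combine itself) showing why the node folds away when both the source and the pass-through are the same splat:
```cpp
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

// Model: selected lanes are packed to the front, remaining lanes are taken
// from the pass-through operand.
template <std::size_t N>
std::array<int, N> compress(const std::array<int, N> &src, uint32_t mask,
                            const std::array<int, N> &passthru) {
  std::array<int, N> out = passthru;
  std::size_t outIdx = 0;
  for (std::size_t i = 0; i < N; ++i)
    if (mask & (1u << i))
      out[outIdx++] = src[i];
  return out;
}

int main() {
  std::array<int, 8> splat;
  splat.fill(42);
  // Whatever the mask selects, every lane still holds the splatted value.
  for (uint32_t mask = 0; mask < 256; ++mask)
    assert(compress(splat, mask, splat) == splat);
  return 0;
}
```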
[libc++] Add scripts defining two LNT runners for libc++ (#187050)
refactor code
---
.ci/compute_projects.py | 12 +-
.ci/compute_projects_test.py | 18 +-
.ci/monolithic-linux.sh | 1 +
.ci/monolithic-windows.sh | 4 +-
.github/actions/build-container/action.yml | 2 +-
.github/actions/push-container/action.yml | 2 +-
.../workflows/build-ci-container-windows.yml | 4 +-
.github/workflows/ci-post-commit-analyzer.yml | 2 +-
.github/workflows/commit-access-review.yml | 2 +-
.github/workflows/containers/libc/Dockerfile | 3 +
.github/workflows/docs.yml | 2 +-
.github/workflows/email-check.yaml | 2 +-
.github/workflows/ids-check.yml | 2 +-
.github/workflows/libc-fullbuild-tests.yml | 77 +-
.github/workflows/libclang-abi-tests.yml | 8 +-
.github/workflows/libcxx-build-and-test.yaml | 8 +-
.github/workflows/libcxx-build-containers.yml | 4 +-
.github/workflows/llvm-abi-tests.yml | 12 +-
.github/workflows/pr-code-format.yml | 2 +-
.github/workflows/pr-code-lint.yml | 2 +-
.github/workflows/premerge.yaml | 8 +-
.github/workflows/prune-branches.yml | 4 +-
.github/workflows/release-asset-audit.yml | 4 +-
.github/workflows/release-binaries.yml | 2 +-
.github/workflows/release-documentation.yml | 2 +-
.github/workflows/release-sources.yml | 2 +-
.github/workflows/release-tasks.yml | 2 +-
.github/workflows/scorecard.yml | 2 +-
.../test-unprivileged-download-artifact.yml | 4 +-
.../upload-release-artifact/action.yml | 4 +-
.../misc/MultipleInheritanceCheck.cpp | 35 +-
.../InefficientStringConcatenationCheck.cpp | 10 +-
.../performance/UseStdMoveCheck.cpp | 7 +-
.../clang-tidy/readability/CMakeLists.txt | 1 +
.../readability/ElseAfterReturnCheck.cpp | 25 +-
.../ImplicitBoolConversionCheck.cpp | 2 +-
.../readability/ReadabilityTidyModule.cpp | 3 +
.../RedundantQualifiedAliasCheck.cpp | 220 ++
.../RedundantQualifiedAliasCheck.h | 40 +
clang-tools-extra/clangd/FindTarget.cpp | 4 +-
.../unittests/SemanticHighlightingTests.cpp | 8 +-
clang-tools-extra/docs/ReleaseNotes.rst | 25 +
.../docs/clang-tidy/checks/list.rst | 1 +
.../inefficient-string-concatenation.rst | 4 +-
.../readability/redundant-qualified-alias.rst | 30 +
.../checkers/Inputs/Headers/std/cstddef | 2 +
.../checkers/Inputs/Headers/std/memory | 170 ++
.../Inputs => Inputs/Headers/std}/type_traits | 26 +-
.../checkers/Inputs/Headers/std/utility | 42 +-
.../checkers/abseil/Inputs/cstddef.h | 10 -
.../checkers/abseil/Inputs/initializer_list | 11 -
.../bugprone/shared-ptr-array-mismatch.cpp | 12 +-
.../bugprone/unhandled-self-assignment.cpp | 19 +-
.../bugprone/unique-ptr-array-mismatch.cpp | 23 +-
.../checkers/bugprone/unused-return-value.cpp | 14 +-
.../checkers/bugprone/use-after-move.cpp | 30 +-
.../avoid-const-or-ref-data-members.cpp | 7 +-
.../checkers/misc/multiple-inheritance.cpp | 19 +
.../checkers/misc/uniqueptr-reset-release.cpp | 16 +-
.../Inputs/replace-auto-ptr/memory.h | 45 -
.../modernize/Inputs/smart-ptr/shared_ptr.h | 34 +-
.../modernize/Inputs/smart-ptr/unique_ptr.h | 29 +-
.../checkers/modernize/make-shared-header.cpp | 5 +-
.../checkers/modernize/make-unique-cxx11.cpp | 6 +-
.../modernize/make-unique-default-init.cpp | 6 +-
.../checkers/modernize/make-unique-header.cpp | 5 +-
.../checkers/modernize/make-unique-macros.cpp | 2 +-
.../checkers/modernize/replace-auto-ptr.cpp | 4 +-
.../checkers/modernize/use-emplace.cpp | 7 +-
.../checkers/modernize/use-ranges.cpp | 2 +-
.../inefficient-string-concatenation.cpp | 6 +
.../performance/move-constructor-init.cpp | 10 +-
.../checkers/performance/use-std-move.cpp | 15 +
...us-smartptr-reset-call-custom-pointers.cpp | 5 +-
.../ambiguous-smartptr-reset-call.cpp | 5 +-
.../readability/const-return-type.cpp | 9 +-
.../readability/container-data-pointer.cpp | 22 +-
.../readability/container-size-empty.cpp | 9 +-
.../readability/else-after-return-cxx20.cpp | 22 +
.../readability/else-after-return.cpp | 47 +-
...it-bool-conversion-allow-in-conditions.cpp | 8 +
.../readability/redundant-qualified-alias.cpp | 203 ++
.../redundant-smartptr-get-macros.cpp | 12 +-
.../readability/redundant-smartptr-get.cpp | 35 +-
.../readability/uniqueptr-delete-release.cpp | 22 +-
clang/CMakeLists.txt | 38 +
clang/docs/ClangFormatStyleOptions.rst | 112 +-
clang/docs/LibTooling.rst | 4 +-
clang/docs/LifetimeSafety.rst | 609 ++++++
clang/docs/ReleaseNotes.rst | 7 +
clang/docs/SanitizerCoverage.rst | 16 +
.../user-docs/SummaryExtraction.rst | 4 +
clang/docs/index.rst | 1 +
clang/include/clang/AST/ASTContext.h | 9 +-
clang/include/clang/Basic/Attr.td | 20 +-
clang/include/clang/Basic/AttrDocs.td | 25 +-
clang/include/clang/Basic/CodeGenOptions.def | 2 +
.../clang/Basic/DiagnosticFrontendKinds.td | 23 +
clang/include/clang/Basic/DiagnosticGroups.td | 3 +
.../clang/Basic/DiagnosticParseKinds.td | 15 +-
.../clang/Basic/DiagnosticSemaKinds.td | 22 +-
.../CIR/Dialect/Builder/CIRBaseBuilder.h | 23 +-
clang/include/clang/CIR/Dialect/IR/CIROps.td | 185 +-
.../clang/CIR/Interfaces/CIROpInterfaces.td | 21 +
clang/include/clang/Config/config.h.cmake | 6 +
clang/include/clang/Format/Format.h | 155 +-
.../include/clang/Frontend/FrontendOptions.h | 18 +-
clang/include/clang/Options/Options.td | 47 +-
.../Core/Serialization/JSONFormat.h | 4 -
.../SerializationFormatRegistry.h | 18 +-
.../Core/SummaryData/SummaryDataStore.h | 2 +-
.../Core/TUSummary/ExtractorRegistry.h | 13 +
.../TUSummaryExtractorFrontendAction.h | 33 +
.../SSAFBuiltinForceLinker.h | 28 +
.../SSAFForceLinker.h | 25 +
clang/include/clang/Sema/Template.h | 5 +-
.../StaticAnalyzer/Core/CheckerManager.h | 9 +-
.../Core/PathSensitive/CheckerContext.h | 8 +-
.../Core/PathSensitive/CoreEngine.h | 1 -
.../Core/PathSensitive/ExprEngine.h | 17 +-
clang/lib/AST/ASTContext.cpp | 16 +-
clang/lib/AST/ByteCode/Compiler.cpp | 52 +-
clang/lib/AST/ByteCode/Disasm.cpp | 1 -
clang/lib/AST/ByteCode/EvalEmitter.h | 4 -
clang/lib/AST/ByteCode/Function.h | 3 +-
clang/lib/AST/ByteCode/FunctionPointer.cpp | 36 -
clang/lib/AST/ByteCode/FunctionPointer.h | 55 -
clang/lib/AST/ByteCode/Interp.cpp | 3 +-
clang/lib/AST/ByteCode/Interp.h | 2 +-
clang/lib/AST/ByteCode/MemberPointer.cpp | 6 -
clang/lib/AST/ByteCode/MemberPointer.h | 2 -
clang/lib/AST/ByteCode/Pointer.cpp | 13 +-
clang/lib/AST/ByteCode/Pointer.h | 28 +-
clang/lib/AST/ByteCode/PrimType.h | 1 -
clang/lib/AST/CMakeLists.txt | 1 -
clang/lib/Basic/Targets/X86.cpp | 8 +-
clang/lib/Basic/Targets/X86.h | 1 +
clang/lib/CIR/CodeGen/CIRGenBuilder.h | 6 +-
clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 6 +-
.../lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp | 30 +-
clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 13 +-
clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp | 58 +-
clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 95 +-
clang/lib/CIR/CodeGen/CIRGenFunction.h | 6 +-
clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp | 11 +-
clang/lib/CIR/Dialect/IR/CIRDialect.cpp | 155 +-
.../Dialect/Transforms/CIRCanonicalize.cpp | 3 +-
.../CIR/Dialect/Transforms/CIRSimplify.cpp | 7 +-
.../Dialect/Transforms/LoweringPrepare.cpp | 60 +-
.../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 237 +--
clang/lib/CodeGen/BackendUtil.cpp | 1 +
clang/lib/CodeGen/CGExpr.cpp | 11 +-
clang/lib/CodeGen/CGExprCXX.cpp | 6 +
clang/lib/CodeGen/CGExprComplex.cpp | 135 +-
clang/lib/CodeGen/CGHLSLRuntime.cpp | 4 +-
clang/lib/CodeGen/CGOpenMPRuntime.cpp | 375 +---
clang/lib/CodeGen/CodeGenModule.cpp | 58 +-
clang/lib/CodeGen/CodeGenModule.h | 12 +
clang/lib/CodeGen/ItaniumCXXABI.cpp | 8 +-
clang/lib/CodeGen/MicrosoftCXXABI.cpp | 2 +-
clang/lib/Driver/CMakeLists.txt | 6 +
clang/lib/Driver/Driver.cpp | 19 +
clang/lib/Driver/SanitizerArgs.cpp | 18 +-
clang/lib/Driver/ToolChains/Arch/X86.cpp | 6 +-
clang/lib/Driver/ToolChains/Clang.cpp | 3 +
clang/lib/Driver/ToolChains/Darwin.cpp | 32 +-
clang/lib/Format/BreakableToken.cpp | 68 +-
clang/lib/Format/ContinuationIndenter.cpp | 3 +-
clang/lib/Format/Format.cpp | 127 +-
clang/lib/Format/TokenAnnotator.cpp | 31 +-
clang/lib/Format/UnwrappedLineFormatter.cpp | 143 +-
clang/lib/Format/UnwrappedLineParser.cpp | 42 +-
clang/lib/Format/WhitespaceManager.cpp | 130 +-
clang/lib/Format/WhitespaceManager.h | 2 +
clang/lib/FrontendTool/CMakeLists.txt | 2 +
.../ExecuteCompilerInvocation.cpp | 6 +
clang/lib/Parse/ParseOpenMP.cpp | 60 +-
.../CMakeLists.txt | 1 +
.../JSONFormat/JSONFormatImpl.cpp | 7 +-
.../SerializationFormatRegistry.cpp | 6 +
.../Core/SummaryData/LUSummaryConsumer.cpp | 2 +-
.../Core/TUSummary/ExtractorRegistry.cpp | 6 +
.../Frontend/CMakeLists.txt | 14 +
.../TUSummaryExtractorFrontendAction.cpp | 181 ++
clang/lib/Sema/AnalysisBasedWarnings.cpp | 216 +-
clang/lib/Sema/SemaAttr.cpp | 2 +
clang/lib/Sema/SemaConcept.cpp | 19 +-
clang/lib/Sema/SemaDecl.cpp | 20 +-
clang/lib/Sema/SemaDeclAttr.cpp | 34 +-
clang/lib/Sema/SemaDeclCXX.cpp | 99 +-
clang/lib/Sema/SemaExprCXX.cpp | 36 +-
clang/lib/Sema/SemaLifetimeSafety.h | 238 +++
clang/lib/Sema/SemaOpenMP.cpp | 29 +-
clang/lib/Sema/SemaOverload.cpp | 4 +-
clang/lib/Sema/SemaTemplate.cpp | 5 +-
clang/lib/Sema/SemaTemplateDeduction.cpp | 7 +-
.../StaticAnalyzer/Core/BugSuppression.cpp | 78 +-
clang/lib/StaticAnalyzer/Core/CallEvent.cpp | 6 +-
clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 11 +-
.../lib/StaticAnalyzer/Core/ExprEngineCXX.cpp | 14 +-
.../Core/ExprEngineCallAndReturn.cpp | 137 +-
clang/test/AST/ByteCode/cxx20.cpp | 27 +
.../end-primitive-array-root-lifetime.cpp | 2 +-
clang/test/AST/ByteCode/loops.cpp | 16 +-
clang/test/AST/dump.cpp | 27 +-
.../Scalable/command-line-interface.cpp | 22 +
.../Analysis/Scalable/downgradable-errors.cpp | 15 +
clang/test/Analysis/Scalable/help.cpp | 21 +
.../Analysis/Scalable/ssaf-format/list.test | 2 +-
.../class-template-specializations.cpp | 348 ++++
.../test/Analysis/clang-suppress/classes.cpp | 75 +
.../clang-suppress/diagnostic-identifiers.cpp | 115 ++
.../test/Analysis/clang-suppress/friends.cpp | 366 ++++
.../clang-suppress/function-templates.cpp | 93 +
.../test/Analysis/clang-suppress/lambdas.cpp | 238 +++
clang/test/Analysis/clang-suppress/macros.cpp | 186 ++
.../Analysis/clang-suppress/namespaces.cpp | 35 +
.../clang-suppress/nested-templates.cpp | 340 ++++
.../Analysis/clang-suppress/statements.cpp | 158 ++
.../clang-suppress/template-methods.cpp | 138 ++
clang/test/Analysis/suppression-attr.cpp | 91 -
clang/test/C/C2y/n3517.c | 46 +
clang/test/C/C2y/n3652.c | 19 +
clang/test/C/C2y/n3715.c | 17 +
clang/test/CIR/CodeGen/atomic.c | 28 +-
clang/test/CIR/CodeGen/binop.cpp | 46 +-
clang/test/CIR/CodeGen/bitfields.c | 2 +-
clang/test/CIR/CodeGen/complex-builtins.cpp | 2 +-
clang/test/CIR/CodeGen/complex-unary.cpp | 76 +-
clang/test/CIR/CodeGen/complex.cpp | 20 +-
clang/test/CIR/CodeGen/coro-task.cpp | 3 +-
.../CodeGen/cxx-rewritten-binary-operator.cpp | 2 +-
clang/test/CIR/CodeGen/dynamic-cast-exact.cpp | 2 +-
clang/test/CIR/CodeGen/fold-during-cg.c | 70 +-
clang/test/CIR/CodeGen/integer-overflow.c | 28 +-
clang/test/CIR/CodeGen/label.c | 2 +-
.../CIR/CodeGen/lambda-static-invoker.cpp | 4 +-
clang/test/CIR/CodeGen/new.cpp | 9 +-
clang/test/CIR/CodeGen/nrvo.cpp | 2 +-
clang/test/CIR/CodeGen/pointers.cpp | 2 +-
clang/test/CIR/CodeGen/size-of-vla.cpp | 4 +-
clang/test/CIR/CodeGen/static-vars.c | 2 +-
clang/test/CIR/CodeGen/static-vars.cpp | 2 +-
clang/test/CIR/CodeGen/throws.cpp | 4 +-
clang/test/CIR/CodeGen/try-catch.cpp | 4 +-
clang/test/CIR/CodeGen/unary.cpp | 62 +-
clang/test/CIR/CodeGen/vector-ext.cpp | 7 +-
clang/test/CIR/CodeGen/vector.cpp | 7 +-
clang/test/CIR/CodeGen/vla.c | 6 +-
.../CodeGenBuiltins/X86/avx512bw-builtins.c | 12 +-
.../CodeGenBuiltins/X86/avx512dq-builtins.c | 6 +-
.../CodeGenBuiltins/X86/avx512f-builtins.c | 6 +-
.../CIR/CodeGenBuiltins/builtin-fcmp-sse.c | 8 +-
.../builtin-types-compatible.c | 2 +-
.../CIR/CodeGenOpenACC/atomic-capture.cpp | 24 +-
.../test/CIR/CodeGenOpenACC/atomic-update.cpp | 6 +-
.../combined-firstprivate-clause.cpp | 14 +-
.../combined-private-clause.cpp | 4 +-
.../combined-reduction-clause-default-ops.cpp | 54 +-
.../combined-reduction-clause-float.cpp | 36 +-
.../combined-reduction-clause-inline-ops.cpp | 72 +-
.../combined-reduction-clause-int.cpp | 54 +-
.../combined-reduction-clause-outline-ops.cpp | 72 +-
.../compute-firstprivate-clause.c | 6 +-
.../compute-firstprivate-clause.cpp | 14 +-
.../CodeGenOpenACC/compute-private-clause.cpp | 4 +-
.../compute-reduction-clause-default-ops.c | 54 +-
.../compute-reduction-clause-default-ops.cpp | 54 +-
.../compute-reduction-clause-float.c | 36 +-
.../compute-reduction-clause-float.cpp | 36 +-
.../compute-reduction-clause-inline-ops.cpp | 72 +-
.../compute-reduction-clause-int.c | 54 +-
.../compute-reduction-clause-int.cpp | 54 +-
.../compute-reduction-clause-outline-ops.cpp | 72 +-
.../compute-reduction-clause-unsigned-int.c | 54 +-
clang/test/CIR/CodeGenOpenACC/data.c | 4 +-
.../firstprivate-clause-recipes.cpp | 46 +-
.../CodeGenOpenACC/loop-private-clause.cpp | 4 +-
.../loop-reduction-clause-default-ops.cpp | 54 +-
.../loop-reduction-clause-float.cpp | 36 +-
.../loop-reduction-clause-inline-ops.cpp | 72 +-
.../loop-reduction-clause-int.cpp | 54 +-
.../loop-reduction-clause-outline-ops.cpp | 72 +-
.../private-clause-array-recipes-CtorDtor.cpp | 32 +-
.../private-clause-array-recipes-NoOps.cpp | 16 +-
...-clause-pointer-array-recipes-CtorDtor.cpp | 124 +-
...ate-clause-pointer-array-recipes-NoOps.cpp | 82 +-
...ivate-clause-pointer-array-recipes-int.cpp | 40 +-
...rivate-clause-pointer-recipes-CtorDtor.cpp | 44 +-
.../private-clause-pointer-recipes-NoOps.cpp | 32 +-
.../private-clause-pointer-recipes-int.cpp | 20 +-
.../reduction-clause-recipes.cpp | 64 +-
clang/test/CIR/IR/unary.cir | 36 +-
clang/test/CIR/Lowering/binop-int-vector.cir | 24 +
clang/test/CIR/Lowering/binop-signed-int.cir | 2 +
.../test/CIR/Lowering/binop-unsigned-int.cir | 3 +
clang/test/CIR/Transforms/binop-traits.cir | 49 +
clang/test/CIR/Transforms/canonicalize.cir | 102 +-
.../Transforms/flatten-cleanup-scope-eh.cir | 6 +-
.../flatten-cleanup-scope-multi-exit.cir | 4 +-
.../CIR/Transforms/max-min-idempotent.cir | 77 +
clang/test/CIR/Transforms/select.cir | 2 +-
clang/test/CMakeLists.txt | 1 +
clang/test/CXX/drs/cwg8xx.cpp | 4 +-
.../expr.prim.req/nested-requirement.cpp | 2 +-
.../CodeGen/AArch64/bf16-getset-intrinsics.c | 18 -
clang/test/CodeGen/AArch64/neon-intrinsics.c | 40 -
clang/test/CodeGen/AArch64/neon/bf16-getset.c | 36 +
clang/test/CodeGen/AArch64/neon/fullfp16.c | 6 +-
clang/test/CodeGen/AArch64/neon/intrinsics.c | 54 +-
...t_instantiation.exclude_from_dllexport.cpp | 12 +-
...t_instantiation.exclude_from_dllimport.cpp | 14 +-
.../CodeGenCXX/dllexport-inherited-ctor.cpp | 82 +-
...rosoft-vector-deleting-dtors-new-array.cpp | 122 ++
.../resources/res-array-global-unbounded.hlsl | 12 +-
clang/test/CodeGenSPIRV/global-dtor.cpp | 7 +
clang/test/Driver/autocomplete.c | 1 +
clang/test/Driver/cl-x86-flags.c | 8 +-
.../darwin-ld-platform-version-macos-nosdk.c | 17 +
.../Driver/darwin-ld-platform-version-macos.c | 13 -
.../riscv-sifive-x160.c | 59 +
.../riscv-sifive-x180.c | 71 +
.../riscv-spacemit-x100.c | 29 +-
.../riscv-spacemit-x60.c | 70 +
clang/test/Driver/riscv-cpus.c | 69 +-
clang/test/Driver/x86-target-features.c | 6 +-
clang/test/Driver/xcselect.c | 5 +
.../test/Misc/target-invalid-cpu-note/riscv.c | 4 +
clang/test/Modules/pr170099.cppm | 20 +
clang/test/Modules/pr186603.cppm | 22 +
.../test/OpenMP/declare_target_ast_print.cpp | 50 +-
clang/test/OpenMP/declare_target_messages.cpp | 139 +-
.../Preprocessor/predefined-arch-macros.c | 4 +
clang/test/Preprocessor/x86_target_features.c | 4 +-
clang/test/Sema/Inputs/lifetime-analysis.h | 3 +
.../Sema/warn-lifetime-analysis-nocfg.cpp | 5 +
..._explicit_instantiation.ignore-dllattr.cpp | 88 +-
clang/test/SemaCXX/cxx2b-deducing-this.cpp | 8 +-
.../SemaCXX/deleted-template-spec-diag.cpp | 12 +
.../dllexport-constrained-inherited-ctor.cpp | 40 +
clang/test/SemaCXX/gh134265.cpp | 31 +-
clang/test/SemaTemplate/GH176152.cpp | 12 +
.../SemaTemplate/concepts-recursive-inst.cpp | 4 +-
clang/test/SemaTemplate/concepts.cpp | 46 +-
clang/test/SemaTemplate/deduction-crash.cpp | 5 +
clang/test/lit.cfg.py | 2 +
clang/test/lit.site.cfg.py.in | 1 +
clang/tools/CMakeLists.txt | 4 +-
.../CMakeLists.txt | 0
.../SSAFFormat.cpp | 10 +-
.../CMakeLists.txt | 0
.../SSAFLinker.cpp | 13 +-
clang/unittests/AST/ByteCode/toAPValue.cpp | 3 +-
clang/unittests/Format/ConfigParseTest.cpp | 44 +-
.../Format/DefinitionBlockSeparatorTest.cpp | 4 +-
clang/unittests/Format/FormatTest.cpp | 358 +++-
clang/unittests/Format/FormatTestCSharp.cpp | 3 +-
clang/unittests/Format/FormatTestJS.cpp | 17 +-
clang/unittests/Format/FormatTestJava.cpp | 3 +-
.../Format/FormatTestMacroExpansion.cpp | 16 +-
.../unittests/Format/SortImportsTestJava.cpp | 31 +
clang/unittests/Format/TokenAnnotatorTest.cpp | 6 +
.../CMakeLists.txt | 2 +
.../TUSummaryExtractorFrontendActionTest.cpp | 366 ++++
.../Registries/FancyAnalysisData.cpp | 2 +
.../Registries/MockSerializationFormat.cpp | 2 +
.../Registries/MockSummaryExtractor1.cpp | 6 +-
.../Registries/MockSummaryExtractor2.cpp | 6 +-
.../SummaryExtractorRegistryTest.cpp | 1 +
.../SSAFBuiltinTestForceLinker.h | 51 +
.../SSAFTestForceLinker.h | 23 +
.../TestFixture.cpp | 1 +
clang/www/c_status.html | 57 +
.../sanitizer_internal_defs.h | 13 +-
flang/include/flang/Lower/CUDA.h | 8 -
.../flang/Semantics/openmp-directive-sets.h | 2 +
flang/include/flang/Semantics/openmp-utils.h | 93 +-
.../include/flang/Support/Fortran-features.h | 3 +-
flang/lib/Lower/CUDA.cpp | 18 -
flang/lib/Lower/ConvertVariable.cpp | 21 +-
flang/lib/Lower/OpenMP/OpenMP.cpp | 14 +
.../Lower/Support/PrivateReductionUtils.cpp | 80 +-
.../Transforms/ACCUseDeviceCanonicalizer.cpp | 13 +-
flang/lib/Parser/openmp-parsers.cpp | 3 +-
flang/lib/Parser/program-parsers.cpp | 15 +-
flang/lib/Parser/stmt-parser.h | 4 +-
flang/lib/Semantics/check-omp-loop.cpp | 134 +-
flang/lib/Semantics/check-omp-structure.cpp | 1 -
flang/lib/Semantics/check-omp-structure.h | 1 -
flang/lib/Semantics/openmp-utils.cpp | 369 +++-
flang/lib/Semantics/resolve-directives.cpp | 74 +-
.../Fir/OpenACC/use-device-canonicalizer.mlir | 33 +
flang/test/Lower/Intrinsics/iall.f90 | 227 ++-
flang/test/Lower/Intrinsics/iand.f90 | 90 +-
flang/test/Lower/Intrinsics/iany.f90 | 145 +-
flang/test/Lower/Intrinsics/ibclr.f90 | 32 +-
flang/test/Lower/Intrinsics/ibits.f90 | 42 +-
flang/test/Lower/Intrinsics/ibset.f90 | 28 +-
flang/test/Lower/Intrinsics/ichar.f90 | 49 +-
flang/test/Lower/Intrinsics/ieee_class.f90 | 106 +-
.../test/Lower/Intrinsics/ieee_copy_sign.f90 | 47 +-
.../test/Lower/Intrinsics/ieee_is_finite.f90 | 72 +-
.../Lower/Intrinsics/ieee_operator_eq.f90 | 43 +-
flang/test/Lower/Intrinsics/ieee_signbit.f90 | 33 +-
flang/test/Lower/Intrinsics/index.f90 | 75 +-
flang/test/Lower/Intrinsics/iparity.f90 | 58 +-
flang/test/Lower/Intrinsics/is_contiguous.f90 | 21 +-
.../target-private-adjustable-array.f90 | 81 +
flang/test/Lower/OpenMP/Todo/interchange.f90 | 15 +
...elayed-privatization-cuda-device-array.cuf | 31 -
.../PowerPC/ppc-vec-store-elem-order.f90 | 20 +-
flang/test/Lower/PowerPC/ppc-vec-store.f90 | 120 +-
flang/test/Parser/OpenMP/do-interchange.f90 | 34 +
flang/test/Parser/OpenMP/interchange-fail.f90 | 31 +
.../Parser/OpenMP/interchange-permutation.f90 | 35 +
flang/test/Parser/OpenMP/interchange.f90 | 30 +
.../test/Parser/shared-line-program-units.f90 | 51 +
.../shared-line-program-units.reject.0.f90 | 4 +
.../shared-line-program-units.reject.1.f90 | 4 +
flang/test/Semantics/OpenMP/do-collapse.f90 | 11 +-
.../OpenMP/do-concurrent-collapse.f90 | 4 +
flang/test/Semantics/OpenMP/do-ordered.f90 | 15 +-
flang/test/Semantics/OpenMP/do08.f90 | 18 +-
flang/test/Semantics/OpenMP/do10.f90 | 3 +-
flang/test/Semantics/OpenMP/do13.f90 | 18 +-
flang/test/Semantics/OpenMP/do15.f90 | 18 +-
flang/test/Semantics/OpenMP/do16.f90 | 6 +-
flang/test/Semantics/OpenMP/do22.f90 | 6 +-
flang/test/Semantics/OpenMP/fuse1.f90 | 3 +-
.../OpenMP/interchange-permutation.f90 | 109 +
flang/test/Semantics/OpenMP/interchange01.f90 | 44 +
.../OpenMP/loop-transformation-clauses01.f90 | 3 +-
flang/test/Semantics/OpenMP/tile05.f90 | 3 +-
flang/test/Semantics/OpenMP/tile07.f90 | 6 +-
.../Semantics/negate-literal-typedexpr.f90 | 20 +
flang/test/Semantics/test_errors.py | 52 +-
libc/Maintainers.rst | 11 +
libc/docs/contributing.rst | 9 +-
libc/docs/dev/clang_tidy_checks.rst | 47 +-
libc/docs/dev/code_style.rst | 4 -
libc/fuzzing/__support/freelist_heap_fuzz.cpp | 1 +
libc/fuzzing/string/strlen_fuzz.cpp | 4 +-
libc/hdr/func/aligned_alloc.h | 4 +
libc/hdr/stdint_proxy.h | 4 +
libc/shared/math.h | 3 +
libc/shared/math/atanpif16.h | 29 +
libc/shared/math/bf16fma.h | 23 +
libc/shared/math/log_bf16.h | 23 +
libc/src/__support/FPUtil/dyadic_float.h | 1 +
libc/src/__support/math/CMakeLists.txt | 42 +
libc/src/__support/math/atanpif16.h | 182 ++
libc/src/__support/math/bf16fma.h | 26 +
libc/src/__support/math/log_bf16.h | 145 ++
libc/src/math/generic/CMakeLists.txt | 25 +-
libc/src/math/generic/atanpif16.cpp | 157 +-
libc/src/math/generic/bf16fma.cpp | 7 +-
libc/src/math/generic/log_bf16.cpp | 123 +-
libc/test/shared/CMakeLists.txt | 3 +
libc/test/shared/shared_math_test.cpp | 6 +
libc/test/src/math/smoke/CMakeLists.txt | 1 +
libc/test/src/math/smoke/atanpif16_test.cpp | 7 +
libclc/CMakeLists.txt | 3 +-
libclc/clc/include/clc/math/clc_ep_decl.inc | 11 +-
libclc/clc/include/clc/math/clc_exp2_fast.h | 19 +
libclc/clc/include/clc/math/clc_log2_fast.h | 19 +
libclc/clc/include/clc/math/clc_pow.h | 7 +-
libclc/clc/include/clc/math/clc_pown.h | 9 +-
libclc/clc/include/clc/math/clc_powr.h | 9 +-
libclc/clc/include/clc/math/clc_rootn.h | 9 +-
.../clc/shared/binary_def_scalarize_loop.inc | 59 +
libclc/clc/lib/amdgpu/CMakeLists.txt | 2 +
libclc/clc/lib/amdgpu/math/clc_exp2_fast.cl | 33 +
libclc/clc/lib/amdgpu/math/clc_log2_fast.cl | 25 +
libclc/clc/lib/generic/CMakeLists.txt | 3 +
libclc/clc/lib/generic/math/clc_cos.inc | 13 +-
libclc/clc/lib/generic/math/clc_ep.cl | 20 +-
libclc/clc/lib/generic/math/clc_ep.inc | 137 +-
libclc/clc/lib/generic/math/clc_exp2_fast.cl | 15 +
libclc/clc/lib/generic/math/clc_log2_fast.cl | 15 +
libclc/clc/lib/generic/math/clc_log_base.h | 5 +-
libclc/clc/lib/generic/math/clc_pow.cl | 43 +-
libclc/clc/lib/generic/math/clc_pow.inc | 438 ----
libclc/clc/lib/generic/math/clc_pow_base.inc | 542 +++++
libclc/clc/lib/generic/math/clc_pown.cl | 42 +-
libclc/clc/lib/generic/math/clc_pown.inc | 402 ----
libclc/clc/lib/generic/math/clc_powr.cl | 44 +-
libclc/clc/lib/generic/math/clc_powr.inc | 414 ----
libclc/clc/lib/generic/math/clc_rootn.cl | 41 +-
libclc/clc/lib/generic/math/clc_rootn.inc | 405 ----
libclc/clc/lib/generic/math/clc_sin.inc | 18 +-
.../lib/generic/math/clc_sincos_helpers.cl | 13 +-
.../lib/generic/math/clc_sincos_helpers.inc | 101 +-
.../generic/math/clc_sincos_helpers_fp64.inc | 2 +
libclc/clc/lib/generic/math/clc_tan.inc | 14 +-
.../mem_fence/clc_mem_fence.cl | 4 +-
libclc/clc/lib/ptx-nvidiacl/CMakeLists.txt | 1 -
.../cmake/modules/CMakeCLCInformation.cmake | 24 +-
.../modules/CMakeDetermineCLCCompiler.cmake | 17 +-
.../cmake/modules/CMakeTestCLCCompiler.cmake | 4 +-
libclc/opencl/lib/generic/math/pow.cl | 14 +-
libclc/opencl/lib/generic/math/pown.cl | 12 +-
libclc/opencl/lib/generic/math/powr.cl | 12 +-
libclc/opencl/lib/generic/math/rootn.cl | 12 +-
libclc/test/CMakeLists.txt | 2 +-
libcxx/CMakeLists.txt | 18 +-
.../__format/formatter_floating_point.h | 8 +-
libcxx/include/__functional/operations.h | 8 +-
.../include/__type_traits/make_transparent.h | 21 +-
libcxx/include/format | 5 +-
.../test/libcxx/transitive_includes/cxx26.csv | 5 -
.../compare_exchange_weak.pass.cpp | 20 +-
.../tools/clang_tidy_checks/CMakeLists.txt | 2 -
libcxx/utils/ci/docker/docker-compose.yml | 4 +-
libcxx/utils/ci/lnt/runners/README.md | 7 +
libcxx/utils/ci/lnt/runners/apple-m5-clang21 | 31 +
libcxx/utils/ci/lnt/runners/apple-m5-xcode26 | 29 +
lld/ELF/Arch/LoongArch.cpp | 5 +-
lld/ELF/Arch/RISCV.cpp | 9 +-
.../ELF/loongarch-relax-synthetic-in-text.s | 31 +
lld/test/ELF/riscv-relax-synthetic-in-text.s | 33 +
lldb/cmake/modules/LLDBConfig.cmake | 24 +-
lldb/examples/darwin/heap_find/heap.py | 22 +-
lldb/include/lldb/Core/Module.h | 3 +-
lldb/include/lldb/Core/ModuleList.h | 1 -
lldb/include/lldb/Host/Config.h.cmake | 2 +
lldb/include/lldb/Target/Target.h | 5 +-
lldb/include/lldb/ValueObject/DILEval.h | 7 +
.../Python/lldbsuite/test/configuration.py | 3 +
.../Python/lldbsuite/test/decorators.py | 41 +
lldb/packages/Python/lldbsuite/test/dotest.py | 3 +
.../Python/lldbsuite/test/dotest_args.py | 6 +
.../Python/lldbsuite/test/lldbplatformutil.py | 16 +
lldb/source/Commands/CommandObjectTarget.cpp | 6 +-
lldb/source/Core/Debugger.cpp | 5 +-
lldb/source/Core/Module.cpp | 19 +-
lldb/source/Core/ModuleList.cpp | 4 +-
.../ObjectFile/PECOFF/ObjectFilePECOFF.cpp | 21 +
.../ObjectFile/PECOFF/ObjectFilePECOFF.h | 2 +
.../Platform/MacOSX/PlatformDarwin.cpp | 78 +-
.../Python/ScriptInterpreterPython.cpp | 9 +
.../NativePDB/SymbolFileNativePDB.cpp | 7 +-
.../Plugins/SymbolFile/PDB/SymbolFilePDB.cpp | 3 +-
.../Plugins/SymbolLocator/CMakeLists.txt | 1 +
.../SymbolLocator/SymStore/CMakeLists.txt | 20 +
.../SymStore/SymbolLocatorSymStore.cpp | 147 ++
.../SymStore/SymbolLocatorSymStore.h | 50 +
.../SymbolLocatorSymStoreProperties.td | 7 +
.../PECOFF/SymbolVendorPECOFF.cpp | 58 +-
lldb/source/Target/Target.cpp | 7 +-
lldb/source/ValueObject/DILEval.cpp | 122 +-
.../Arithmetic/TestFrameVarDILArithmetic.py | 12 -
.../TestFrameVarDILExprPointerArithmetic.py | 131 ++
.../var-dil/expr/PointerArithmetic/main.cpp | 20 +
.../data-formatter/synthetic_subscript/main.c | 4 +-
.../functionalities/ptr_refs/TestPtrRefs.py | 1 +
.../TestCppGlobalOperators.py | 1 +
.../API/lang/objc/ptr_refs/TestPtrRefsObjC.py | 1 +
lldb/test/API/lit.cfg.py | 3 +
lldb/test/API/lit.site.cfg.py.in | 1 +
lldb/test/API/lua_api/TestLuaAPI.py | 1 +
lldb/test/API/macosx/mte/TestDarwinMTE.py | 11 +-
lldb/test/API/symstore/Makefile | 2 +
lldb/test/API/symstore/TestSymStoreLocal.py | 123 ++
lldb/test/API/symstore/main.c | 5 +
.../lldb-dap/variables/TestDAP_variables.py | 2 +-
.../lldb-server/TestAppleSimulatorOSType.py | 1 +
.../tools/lldb-server/TestGdbRemoteAttach.py | 1 +
.../lldb-server/TestGdbRemoteProcessInfo.py | 1 +
.../lldb-server/TestGdbRemoteRegisterState.py | 1 +
.../TestGdbRemoteThreadsInStopReply.py | 1 +
.../TestGdbRemote_qMemoryRegion.py | 1 +
.../tools/lldb-server/TestLldbGdbServer.py | 1 +
.../attach-wait/TestGdbRemoteAttachWait.py | 1 +
.../launch/replace-dll/TestReplaceDLL.py | 2 +
lldb/test/CMakeLists.txt | 1 +
lldb/test/Shell/Diagnostics/TestDump.test | 6 +-
lldb/test/Shell/Heap/heap-cstr.test | 2 +
.../dsym-python-script-name-warnings.test | 39 +
.../NativePDB/structured-bindings-msvc.test | 2 +-
.../Shell/SymbolFile/NativePDB/vbases.test | 8 +-
.../dependent-modules-nodupe-windows.test | 5 +-
lldb/test/Shell/lit.cfg.py | 3 +
lldb/test/Shell/lit.site.cfg.py.in | 1 +
lldb/tools/driver/CMakeLists.txt | 18 +-
lldb/tools/driver/lldb-mte-entitlements.plist | 10 +
lldb/tools/lldb-dap/ProtocolUtils.cpp | 3 +-
lldb/tools/lldb-server/lldb-gdbserver.cpp | 36 +-
lldb/tools/lldb-server/lldb-platform.cpp | 115 +-
lldb/unittests/Platform/CMakeLists.txt | 1 +
.../unittests/Platform/PlatformDarwinTest.cpp | 142 +-
lldb/unittests/Platform/TestUtils.cpp | 42 +
lldb/unittests/Platform/TestUtils.h | 59 +
llvm/CMakeLists.txt | 5 +
llvm/cmake/config-ix.cmake | 21 +-
llvm/cmake/modules/AddLLVM.cmake | 22 +-
llvm/docs/CommandGuide/llvm-link.rst | 5 +-
llvm/docs/DirectX/DXILResources.rst | 2 +-
llvm/docs/Frontend/PerformanceTips.rst | 13 +
llvm/docs/LangRef.rst | 151 +-
llvm/docs/ReleaseNotes.md | 9 +
llvm/docs/SPIRVUsage.rst | 9 +
llvm/docs/TestSuiteGuide.md | 2 +-
llvm/include/llvm-c/Core.h | 68 +
llvm/include/llvm/ADT/Repeated.h | 120 ++
.../llvm/Analysis/BranchProbabilityInfo.h | 241 +--
llvm/include/llvm/Analysis/CFGPrinter.h | 6 +-
llvm/include/llvm/Analysis/DXILResource.h | 11 +-
.../llvm/Analysis/DependenceAnalysis.h | 4 +-
llvm/include/llvm/Analysis/IR2Vec.h | 8 +-
llvm/include/llvm/Bitcode/LLVMBitCodes.h | 4 +
llvm/include/llvm/CodeGen/ISDOpcodes.h | 7 +
.../include/llvm/CodeGen/MachineInstrBundle.h | 14 +
llvm/include/llvm/CodeGen/Passes.h | 4 +-
llvm/include/llvm/Config/config.h.cmake | 3 +
llvm/include/llvm/Frontend/OpenMP/OMP.td | 3 +-
.../llvm/Frontend/OpenMP/OMPIRBuilder.h | 132 +-
llvm/include/llvm/IR/CFG.h | 138 +-
llvm/include/llvm/IR/Constants.h | 193 +-
llvm/include/llvm/IR/DataLayout.h | 11 +
llvm/include/llvm/IR/DerivedTypes.h | 44 +
llvm/include/llvm/IR/Dominators.h | 7 +-
llvm/include/llvm/IR/IRBuilder.h | 37 +-
llvm/include/llvm/IR/InlineAsm.h | 8 +-
llvm/include/llvm/IR/InstrTypes.h | 8 +-
llvm/include/llvm/IR/Instruction.h | 38 +
llvm/include/llvm/IR/Instructions.h | 187 +-
llvm/include/llvm/IR/IntrinsicsSPIRV.td | 2 +-
llvm/include/llvm/IR/PatternMatch.h | 103 +-
llvm/include/llvm/IR/Type.h | 41 +-
llvm/include/llvm/IR/Value.def | 1 +
llvm/include/llvm/InitializePasses.h | 2 +-
llvm/include/llvm/MC/MCPseudoProbe.h | 8 +
.../llvm/Passes/MachinePassRegistry.def | 2 +-
llvm/include/llvm/SandboxIR/Instruction.h | 9 +-
llvm/include/llvm/Support/GenericDomTree.h | 8 +-
.../llvm/TargetParser/X86TargetParser.def | 1 +
.../Transforms/IPO/FunctionSpecialization.h | 2 +-
llvm/include/llvm/Transforms/IPO/IROutliner.h | 3 +-
llvm/include/llvm/Transforms/Scalar/GVN.h | 4 +-
.../llvm/Transforms/Scalar/JumpThreading.h | 4 +-
.../llvm/Transforms/Utils/BasicBlockUtils.h | 6 +-
llvm/include/llvm/Transforms/Utils/Cloning.h | 9 +-
.../llvm/Transforms/Utils/GuardUtils.h | 6 +-
.../llvm/Transforms/Utils/Instrumentation.h | 1 +
llvm/include/llvm/Transforms/Utils/Local.h | 4 +-
.../llvm/Transforms/Utils/LoopConstrainer.h | 6 +-
.../include/llvm/Transforms/Utils/LoopUtils.h | 9 +-
llvm/lib/Analysis/BranchProbabilityInfo.cpp | 495 +++--
llvm/lib/Analysis/CFG.cpp | 59 +-
llvm/lib/Analysis/DXILResource.cpp | 12 +-
llvm/lib/Analysis/DependenceAnalysis.cpp | 15 +-
llvm/lib/Analysis/ScalarEvolution.cpp | 36 +-
llvm/lib/Analysis/ValueTracking.cpp | 7 +-
llvm/lib/AsmParser/LLLexer.cpp | 35 +-
llvm/lib/AsmParser/LLParser.cpp | 21 +-
llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 54 +-
llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 29 +-
llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 64 +-
llvm/lib/CodeGen/CodeGen.cpp | 2 +-
llvm/lib/CodeGen/CodeGenPrepare.cpp | 24 +-
llvm/lib/CodeGen/CommandFlags.cpp | 6 +-
llvm/lib/CodeGen/MIRParser/MIParser.cpp | 54 +-
llvm/lib/CodeGen/MIRPrinter.cpp | 6 +
llvm/lib/CodeGen/MachineInstr.cpp | 2 +-
llvm/lib/CodeGen/MachineInstrBundle.cpp | 66 +-
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 10 -
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 2 +
.../SelectionDAG/LegalizeIntegerTypes.cpp | 21 +
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 +
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 8 +-
.../SelectionDAG/SelectionDAGBuilder.cpp | 14 +-
.../SelectionDAG/SelectionDAGDumper.cpp | 5 +
.../CodeGen/SelectionDAG/TargetLowering.cpp | 18 +-
llvm/lib/CodeGen/TargetLoweringBase.cpp | 4 +
llvm/lib/CodeGen/ValueTypes.cpp | 4 +
llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp | 2 +-
llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp | 324 ++-
llvm/lib/FuzzMutate/RandomIRBuilder.cpp | 16 +-
llvm/lib/IR/AsmWriter.cpp | 26 +-
llvm/lib/IR/Attributes.cpp | 4 +
llvm/lib/IR/CMakeLists.txt | 1 +
llvm/lib/IR/Constants.cpp | 309 ++-
llvm/lib/IR/Core.cpp | 46 +-
llvm/lib/IR/DataLayout.cpp | 18 +
llvm/lib/IR/Dominators.cpp | 12 -
llvm/lib/IR/Instruction.cpp | 21 +-
llvm/lib/IR/Instructions.cpp | 36 +-
llvm/lib/IR/Intrinsics.cpp | 3 +
llvm/lib/IR/LLVMContextImpl.cpp | 7 +-
llvm/lib/IR/LLVMContextImpl.h | 11 +
llvm/lib/IR/PatternMatch.cpp | 53 +
llvm/lib/IR/Type.cpp | 73 +-
llvm/lib/IR/Verifier.cpp | 19 +-
llvm/lib/Passes/PassBuilder.cpp | 1 +
llvm/lib/Support/CMakeLists.txt | 5 +-
llvm/lib/Support/KnownFPClass.cpp | 4 +-
llvm/lib/Support/TextEncoding.cpp | 4 +
llvm/lib/Support/YAMLParser.cpp | 4 +-
.../AArch64/AArch64ConditionOptimizer.cpp | 98 +-
.../AArch64/AArch64ConditionalCompares.cpp | 13 +-
.../AArch64/AArch64ExpandPseudoInsts.cpp | 488 ++---
.../Target/AArch64/AArch64ISelLowering.cpp | 57 +-
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 5 +
.../Target/AArch64/AArch64TargetMachine.cpp | 2 +-
.../AArch64/AArch64TargetTransformInfo.cpp | 4 +-
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 5 +-
llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h | 3 +-
.../AMDGPUGlobalISelDivergenceLowering.cpp | 8 +-
.../AMDGPU/AMDGPULowerKernelAttributes.cpp | 48 +
.../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 86 +-
.../AMDGPU/AMDGPURegBankLegalizeHelper.h | 1 +
.../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 57 +-
.../AMDGPU/AMDGPURegBankLegalizeRules.h | 5 +-
.../Target/AMDGPU/AMDGPUSearchableTables.td | 2 +
.../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 4 +-
.../Target/AMDGPU/R600TargetTransformInfo.h | 5 +-
llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 323 ++-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 +-
llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 47 +-
llvm/lib/Target/AMDGPU/SILowerI1Copies.h | 12 +-
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 16 +-
llvm/lib/Target/ARM/ARMISelLowering.cpp | 6 +
llvm/lib/Target/ARM/ARMTargetMachine.cpp | 2 +-
llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp | 53 +-
.../Target/BPF/MCTargetDesc/BPFMCAsmInfo.h | 2 +-
.../lib/Target/DirectX/DXContainerGlobals.cpp | 4 +-
llvm/lib/Target/DirectX/DXILOpLowering.cpp | 5 +-
llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp | 2 +-
.../Hexagon/HexagonTargetObjectFile.cpp | 1 +
.../MCTargetDesc/LoongArchAsmBackend.cpp | 2 -
llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 2 +-
.../Target/PowerPC/AsmParser/PPCAsmParser.cpp | 8 +-
llvm/lib/Target/PowerPC/PPC.td | 10 +-
llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 21 +-
llvm/lib/Target/PowerPC/PPCInstrFuture.td | 19 +-
llvm/lib/Target/PowerPC/PPCInstrInfo.td | 1 +
llvm/lib/Target/PowerPC/PPCRegisterClasses.td | 101 +
llvm/lib/Target/PowerPC/PPCRegisterInfo.td | 425 ++--
llvm/lib/Target/PowerPC/PPCRegisterInfoDMR.td | 49 +-
llvm/lib/Target/PowerPC/PPCRegisterInfoMMA.td | 85 +-
llvm/lib/Target/PowerPC/PPCScheduleP10.td | 2 +-
llvm/lib/Target/PowerPC/PPCScheduleP9.td | 3 +-
llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 47 +-
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 96 +-
llvm/lib/Target/RISCV/RISCVInstrInfoP.td | 3 +
llvm/lib/Target/RISCV/RISCVProcessors.td | 102 +-
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 2 +-
.../SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp | 25 +-
.../SPIRV/MCTargetDesc/SPIRVInstPrinter.h | 2 +
llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 46 +-
llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 9 +-
.../Target/SPIRV/SPIRVInstructionSelector.cpp | 33 +-
.../Target/SPIRV/SPIRVLegalizePointerCast.cpp | 74 +-
.../SPIRV/SPIRVMergeRegionExitTargets.cpp | 77 +-
llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 6 +-
llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp | 36 +-
llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp | 31 +
llvm/lib/Target/SPIRV/SPIRVUtils.h | 6 +-
.../WebAssembly/WebAssemblyFastISel.cpp | 6 +-
.../WebAssembly/WebAssemblyISelDAGToDAG.cpp | 105 +-
.../WebAssembly/WebAssemblyISelLowering.cpp | 98 +-
.../WebAssembly/WebAssemblyInstrAtomics.td | 47 +-
.../X86/GISel/X86InstructionSelector.cpp | 5 +-
llvm/lib/Target/X86/X86.td | 12 +
llvm/lib/Target/X86/X86AsmPrinter.cpp | 5 +
llvm/lib/Target/X86/X86FastISel.cpp | 26 +-
llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 105 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 159 +-
llvm/lib/Target/X86/X86InstrAVX512.td | 6 +
llvm/lib/Target/X86/X86InstrArithmetic.td | 94 +-
llvm/lib/Target/X86/X86InstrCMovSetCC.td | 14 +-
llvm/lib/Target/X86/X86InstrCompiler.td | 192 +-
llvm/lib/Target/X86/X86InstrFoldTables.cpp | 8 +
llvm/lib/Target/X86/X86InstrFoldTables.h | 4 +
llvm/lib/Target/X86/X86InstrFragments.td | 6 +-
llvm/lib/Target/X86/X86InstrInfo.cpp | 21 +-
llvm/lib/Target/X86/X86InstrPredicates.td | 3 +
llvm/lib/Target/X86/X86InstrShiftRotate.td | 42 +-
llvm/lib/Target/X86/X86InstrUtils.td | 20 +-
llvm/lib/Target/X86/X86MCInstLower.cpp | 13 +-
llvm/lib/Target/X86/X86Subtarget.cpp | 5 +-
llvm/lib/Target/X86/X86TargetMachine.cpp | 2 +-
llvm/lib/TargetParser/Host.cpp | 1 +
llvm/lib/TargetParser/X86TargetParser.cpp | 9 +-
.../AggressiveInstCombine.cpp | 188 +-
llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 20 +-
llvm/lib/Transforms/IPO/Attributor.cpp | 2 +-
.../Transforms/IPO/AttributorAttributes.cpp | 56 +-
llvm/lib/Transforms/IPO/CrossDSOCFI.cpp | 2 +-
.../lib/Transforms/IPO/ForceFunctionAttrs.cpp | 47 +-
.../Transforms/IPO/FunctionSpecialization.cpp | 6 +-
llvm/lib/Transforms/IPO/IROutliner.cpp | 54 +-
llvm/lib/Transforms/IPO/LoopExtractor.cpp | 6 +-
llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 4 +-
llvm/lib/Transforms/IPO/OpenMPOpt.cpp | 41 +-
llvm/lib/Transforms/IPO/PartialInlining.cpp | 10 +-
llvm/lib/Transforms/IPO/SampleProfile.cpp | 4 +-
.../lib/Transforms/IPO/WholeProgramDevirt.cpp | 2 +-
.../InstCombine/InstCombineAddSub.cpp | 19 +-
.../InstCombine/InstCombineCompares.cpp | 8 +-
.../InstCombineLoadStoreAlloca.cpp | 12 +-
.../Transforms/InstCombine/InstCombinePHI.cpp | 5 +-
.../InstCombine/InstCombineVectorOps.cpp | 4 +-
.../InstCombine/InstructionCombining.cpp | 23 +-
.../Instrumentation/AddressSanitizer.cpp | 5 +-
.../Instrumentation/BoundsChecking.cpp | 4 +-
.../ControlHeightReduction.cpp | 64 +-
.../Instrumentation/DataFlowSanitizer.cpp | 9 +-
.../Instrumentation/HWAddressSanitizer.cpp | 4 +-
.../Instrumentation/MemorySanitizer.cpp | 30 +-
.../NumericalStabilitySanitizer.cpp | 23 +-
.../Instrumentation/PGOInstrumentation.cpp | 6 +-
.../Instrumentation/SanitizerCoverage.cpp | 38 +-
llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp | 8 +-
llvm/lib/Transforms/Scalar/ADCE.cpp | 13 +-
.../Transforms/Scalar/CallSiteSplitting.cpp | 4 +-
.../Scalar/ConstraintElimination.cpp | 8 +-
.../Transforms/Scalar/DFAJumpThreading.cpp | 30 +-
.../Scalar/DeadStoreElimination.cpp | 4 +-
llvm/lib/Transforms/Scalar/EarlyCSE.cpp | 8 +-
llvm/lib/Transforms/Scalar/GVN.cpp | 10 +-
llvm/lib/Transforms/Scalar/GVNSink.cpp | 2 +-
llvm/lib/Transforms/Scalar/GuardWidening.cpp | 8 +-
llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 43 +-
.../Scalar/InductiveRangeCheckElimination.cpp | 10 +-
.../Transforms/Scalar/JumpTableToSwitch.cpp | 2 +-
llvm/lib/Transforms/Scalar/JumpThreading.cpp | 104 +-
llvm/lib/Transforms/Scalar/LICM.cpp | 21 +-
llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp | 16 +-
llvm/lib/Transforms/Scalar/LoopDistribute.cpp | 2 +-
llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 25 +-
llvm/lib/Transforms/Scalar/LoopFuse.cpp | 15 +-
.../Transforms/Scalar/LoopIdiomRecognize.cpp | 77 +-
.../lib/Transforms/Scalar/LoopInterchange.cpp | 52 +-
.../lib/Transforms/Scalar/LoopPredication.cpp | 21 +-
.../lib/Transforms/Scalar/LoopSimplifyCFG.cpp | 4 +-
.../Transforms/Scalar/LoopStrengthReduce.cpp | 35 +-
llvm/lib/Transforms/Scalar/LoopTermFold.cpp | 6 +-
llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 18 +-
.../Scalar/LowerConstantIntrinsics.cpp | 6 +-
.../Scalar/LowerExpectIntrinsic.cpp | 25 +-
.../Scalar/LowerMatrixIntrinsics.cpp | 4 +-
llvm/lib/Transforms/Scalar/MergeICmps.cpp | 14 +-
.../Scalar/MergedLoadStoreMotion.cpp | 4 +-
.../Scalar/PartiallyInlineLibCalls.cpp | 4 +-
llvm/lib/Transforms/Scalar/Reassociate.cpp | 2 +-
.../Scalar/RewriteStatepointsForGC.cpp | 5 +-
llvm/lib/Transforms/Scalar/SROA.cpp | 4 +-
.../Scalar/SeparateConstOffsetFromGEP.cpp | 2 +-
.../Transforms/Scalar/SimpleLoopUnswitch.cpp | 75 +-
.../lib/Transforms/Scalar/SimplifyCFGPass.cpp | 8 +-
.../Scalar/SpeculativeExecution.cpp | 6 +-
llvm/lib/Transforms/Scalar/StructurizeCFG.cpp | 100 +-
.../Scalar/TailRecursionElimination.cpp | 11 +-
.../lib/Transforms/Utils/AMDGPUEmitPrintf.cpp | 8 +-
llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 67 +-
.../Transforms/Utils/BreakCriticalEdges.cpp | 2 +-
llvm/lib/Transforms/Utils/CloneFunction.cpp | 42 +-
llvm/lib/Transforms/Utils/CodeExtractor.cpp | 10 +-
.../lib/Transforms/Utils/ControlFlowUtils.cpp | 36 +-
llvm/lib/Transforms/Utils/Evaluator.cpp | 17 +-
llvm/lib/Transforms/Utils/FixIrreducible.cpp | 22 +-
llvm/lib/Transforms/Utils/FlattenCFG.cpp | 46 +-
llvm/lib/Transforms/Utils/GuardUtils.cpp | 7 +-
llvm/lib/Transforms/Utils/InlineFunction.cpp | 29 +-
llvm/lib/Transforms/Utils/Local.cpp | 42 +-
llvm/lib/Transforms/Utils/LoopConstrainer.cpp | 12 +-
llvm/lib/Transforms/Utils/LoopPeel.cpp | 12 +-
.../Transforms/Utils/LoopRotationUtils.cpp | 21 +-
llvm/lib/Transforms/Utils/LoopSimplify.cpp | 26 +-
llvm/lib/Transforms/Utils/LoopUnroll.cpp | 18 +-
.../lib/Transforms/Utils/LoopUnrollAndJam.cpp | 37 +-
.../Transforms/Utils/LoopUnrollRuntime.cpp | 18 +-
llvm/lib/Transforms/Utils/LoopUtils.cpp | 46 +-
.../lib/Transforms/Utils/LowerGlobalDtors.cpp | 2 +-
llvm/lib/Transforms/Utils/LowerInvoke.cpp | 2 +-
.../Transforms/Utils/LowerMemIntrinsics.cpp | 18 +-
llvm/lib/Transforms/Utils/LowerSwitch.cpp | 10 +-
llvm/lib/Transforms/Utils/MatrixUtils.cpp | 10 +-
llvm/lib/Transforms/Utils/MisExpect.cpp | 2 +-
llvm/lib/Transforms/Utils/PredicateInfo.cpp | 8 +-
llvm/lib/Transforms/Utils/ProfileVerify.cpp | 2 +-
llvm/lib/Transforms/Utils/SCCPSolver.cpp | 15 +-
llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 277 ++-
llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 13 +-
.../Utils/UnifyFunctionExitNodes.cpp | 4 +-
llvm/lib/Transforms/Utils/UnifyLoopExits.cpp | 12 +-
.../Vectorize/LoopIdiomVectorize.cpp | 52 +-
.../Vectorize/LoopVectorizationLegality.cpp | 26 +-
.../Transforms/Vectorize/LoopVectorize.cpp | 341 +---
.../Transforms/Vectorize/SLPVectorizer.cpp | 42 +-
llvm/lib/Transforms/Vectorize/VPlan.cpp | 26 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 29 +-
.../Transforms/Vectorize/VPlanAnalysis.cpp | 30 +-
.../Vectorize/VPlanConstruction.cpp | 47 +-
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 9 +-
.../Transforms/Vectorize/VPlanTransforms.cpp | 93 +-
.../Transforms/Vectorize/VPlanTransforms.h | 4 +-
llvm/lib/Transforms/Vectorize/VPlanUtils.cpp | 5 +-
llvm/lib/Transforms/Vectorize/VPlanUtils.h | 11 +-
llvm/runtimes/CMakeLists.txt | 2 +-
.../Analysis/CostModel/AArch64/arith-bf16.ll | 52 +-
.../weak-crossing-siv-delta-signed-min.ll | 64 +
.../Inputs/reference_default_vocab_print.txt | 1 +
.../Inputs/reference_wtd1_vocab_print.txt | 1 +
.../Inputs/reference_wtd2_vocab_print.txt | 1 +
.../invariant-dep-same-ptr.ll | 343 ++++
.../ScalarEvolution/two-loop-latches.ll | 58 +
.../UniformityAnalysis/AMDGPU/intrinsics.ll | 28 +
.../Assembler/2008-02-18-IntPointerCrash.ll | 2 +-
llvm/test/Assembler/byte-invalid.ll | 73 +
llvm/test/Assembler/byte.ll | 101 +
llvm/test/Assembler/invalid-inttype.ll | 2 +-
llvm/test/Bindings/llvm-c/byte.ll | 11 +
llvm/test/Bindings/llvm-c/echo.ll | 1 +
llvm/test/Bitcode/compatibility.ll | 2 +
.../AArch64/GlobalISel/legalize-shuffle-1x.ll | 73 +-
llvm/test/CodeGen/AArch64/O3-pipeline.ll | 3 +
.../AArch64/ccmp-look-through-copy.mir | 45 +
llvm/test/CodeGen/AArch64/imm-splat-ops.ll | 4 +-
llvm/test/CodeGen/AArch64/sve-asrd.ll | 15 +-
.../sve-fixed-length-no-vscale-range.ll | 11 +
.../AArch64/sve-partial-reduce-dot-product.ll | 4 +-
.../AArch64/sve2p3-dots-partial-reduction.ll | 52 +
llvm/test/CodeGen/AArch64/vecreduce-add.ll | 2 +-
.../AMDGPU/GlobalISel/atomicrmw-fmin-fmax.ll | 596 ++++++
.../AMDGPU/GlobalISel/atomicrmw_fmax.ll | 146 +-
.../AMDGPU/GlobalISel/atomicrmw_fmin.ll | 146 +-
.../AMDGPU/GlobalISel/llvm.amdgcn.softwqm.ll | 70 +-
.../GlobalISel/llvm.amdgcn.strictwqm.ll | 145 ++
.../AMDGPU/GlobalISel/llvm.amdgcn.wqm.ll | 71 +-
.../GlobalISel/regbankselect-amdgcn.wqm.mir | 70 +-
.../regbankselect-amdgpu-ffbh-u32.mir | 8 +-
.../AMDGPU/barrier-elimination-gfx12.ll | 2 +-
.../CodeGen/AMDGPU/barrier-elimination.ll | 2 +-
llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 130 +-
llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll | 223 +-
llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir | 24 +-
.../AMDGPU/fp-min-max-buffer-atomics.ll | 12 +-
.../AMDGPU/fp-min-max-buffer-ptr-atomics.ll | 10 +-
.../AMDGPU/fp64-min-max-buffer-atomics.ll | 8 +-
.../AMDGPU/fp64-min-max-buffer-ptr-atomics.ll | 8 +-
.../CodeGen/AMDGPU/implicit-arg-v5-opt.ll | 183 ++
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 12 +
.../AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll | 157 +-
.../CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll | 25 +-
.../CodeGen/AMDGPU/llvm.amdgcn.s.sethalt.ll | 21 +
.../AMDGPU/llvm.amdgcn.s.wait.event.ll | 4 +-
.../AMDGPU/llvm.amdgcn.s.wait.gfx12.ll | 2 +-
.../AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll | 2 +-
.../CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll | 6 +-
.../CodeGen/AMDGPU/memintrinsic-unroll.ll | 6 +-
...ubreg-undef-def-with-other-subreg-defs.mir | 24 +-
.../AMDGPU/wmma-coexecution-valu-hazards.mir | 26 +
llvm/test/CodeGen/AMDGPU/write_register.ll | 2 +-
llvm/test/CodeGen/ARM/O3-pipeline.ll | 3 +
...st-and-by-const-from-lshr-in-eqcmp-zero.ll | 49 +-
...ist-and-by-const-from-shl-in-eqcmp-zero.ll | 27 +-
llvm/test/CodeGen/AVR/calling-conv/c/tiny.ll | 14 +-
llvm/test/CodeGen/AVR/pseudo/SPREAD.mir | 28 +
llvm/test/CodeGen/AVR/pseudo/SPWRITE.mir | 7 +-
llvm/test/CodeGen/AVR/return.ll | 8 +-
llvm/test/CodeGen/BPF/cttz-ctlz.ll | 16 +-
llvm/test/CodeGen/BPF/gotol.ll | 12 +-
llvm/test/CodeGen/BPF/jump_table_blockaddr.ll | 16 +-
.../test/CodeGen/BPF/jump_table_global_var.ll | 12 +-
.../CodeGen/BPF/jump_table_switch_stmt.ll | 72 +-
llvm/test/CodeGen/BPF/remove_truncate_9.ll | 40 +-
llvm/test/CodeGen/BPF/sanity.ll | 12 +-
.../DirectX/Binding/binding-overlap-6.ll | 2 +-
.../DirectX/Binding/binding-overlap-7.ll | 2 +-
.../DirectX/ContainerData/PSVResources.ll | 2 +-
llvm/test/CodeGen/DirectX/CreateHandle.ll | 2 +-
.../DirectX/CreateHandleFromBinding.ll | 2 +-
.../CodeGen/DirectX/Metadata/srv_metadata.ll | 4 +-
.../CodeGen/DirectX/Metadata/uav_metadata.ll | 4 +-
llvm/test/CodeGen/LoongArch/opt-pipeline.ll | 3 +
.../MIR/Generic/inline-asm-extra-info.mir | 82 +
.../MIR/Generic/inline-asm-unknown-kind.mir | 11 +
llvm/test/CodeGen/NVPTX/tcgen05-mma-i8.ll | 8 +-
.../test/CodeGen/NVPTX/tcgen05-mma-scale-d.ll | 32 +-
llvm/test/CodeGen/NVPTX/tcgen05-mma.ll | 24 +-
llvm/test/CodeGen/PowerPC/O3-pipeline.ll | 3 +
.../test/CodeGen/PowerPC/dmf-outer-product.ll | 156 +-
llvm/test/CodeGen/PowerPC/dmr-copy.ll | 18 +-
llvm/test/CodeGen/PowerPC/dmr-enable.ll | 72 +-
llvm/test/CodeGen/PowerPC/dmr-spill.ll | 9 +-
.../CodeGen/PowerPC/mma-acc-copy-hints.ll | 10 +-
llvm/test/CodeGen/PowerPC/mma-acc-memops.ll | 44 +-
.../test/CodeGen/PowerPC/mma-outer-product.ll | 60 +-
.../CodeGen/PowerPC/vsx-ldst-with-length.ll | 24 +-
llvm/test/CodeGen/RISCV/O3-pipeline.ll | 3 +
llvm/test/CodeGen/RISCV/rv32p.ll | 42 +-
llvm/test/CodeGen/RISCV/rv32zbb.ll | 192 +-
llvm/test/CodeGen/RISCV/rv64zbb.ll | 100 +-
llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll | 126 +-
llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll | 128 +-
llvm/test/CodeGen/RISCV/rvv/cttz-elts.ll | 33 +-
...SPV_ALTERA_arbitrary_precision_integers.ll | 22 +
.../apint-constant.ll | 16 +-
.../i128-addsub.ll | 2 +-
.../masked-gather-no-extension.ll | 4 +
.../masked-gather-scatter.ll | 4 +
.../masked-scatter-no-extension.ll | 4 +
.../vector-of-pointers-no-extension.ll | 3 +
.../vector-of-pointers-ptrtoint.ll | 4 +
.../SPIRV/hlsl-resources/unbounded-arr.ll | 29 +
llvm/test/CodeGen/SPIRV/instructions/icmp.ll | 23 +-
llvm/test/CodeGen/SPIRV/llc-pipeline.ll | 3 +
.../CodeGen/SPIRV/memory-model-md-glsl450.ll | 16 +
.../CodeGen/SPIRV/memory-model-md-opencl.ll | 14 +
.../CodeGen/SPIRV/memory-model-md-shader.ll | 16 +
.../CodeGen/SPIRV/memory-model-md-unknown.ll | 12 +
.../CodeGen/SPIRV/memory-model-md-vulkan.ll | 14 +
.../pointers/getelementptr-vector-index.ll | 23 +
.../load-vector-from-array-of-vectors.ll | 47 +
.../single-element-vector-nested-aggregate.ll | 129 ++
.../undef-global-aggregate-initializer.ll | 73 +
llvm/test/CodeGen/Thumb/smul_fix_sat.ll | 85 +-
.../CodeGen/Thumb2/mve-saturating-arith.ll | 60 +-
llvm/test/CodeGen/WebAssembly/atomic-fence.ll | 144 +-
.../WebAssembly/atomic-mem-consistency.ll | 945 ++++++++-
llvm/test/CodeGen/WebAssembly/fast-isel.ll | 19 +-
.../CodeGen/WebAssembly/offset-atomics.ll | 31 +
llvm/test/CodeGen/WebAssembly/simd-bitmask.ll | 120 ++
.../test/CodeGen/WebAssembly/wide-simd-mul.ll | 210 ++
llvm/test/CodeGen/X86/GlobalISel/constant.ll | 11 +-
.../X86/GlobalISel/select-constant.mir | 26 +-
llvm/test/CodeGen/X86/apx/adc.ll | 271 ++-
llvm/test/CodeGen/X86/apx/add.ll | 616 ++++--
llvm/test/CodeGen/X86/apx/and.ll | 532 +++--
llvm/test/CodeGen/X86/apx/cmov.ll | 153 +-
llvm/test/CodeGen/X86/apx/dec.ll | 123 +-
llvm/test/CodeGen/X86/apx/imul.ll | 143 +-
llvm/test/CodeGen/X86/apx/inc.ll | 155 +-
.../X86/apx/long-instruction-fixup-x32.ll | 2 +-
.../CodeGen/X86/apx/long-instruction-fixup.ll | 2 +-
llvm/test/CodeGen/X86/apx/memfold-nd2rmw.mir | 4 +-
llvm/test/CodeGen/X86/apx/mul-i1024.ll | 2 +-
llvm/test/CodeGen/X86/apx/neg.ll | 145 +-
llvm/test/CodeGen/X86/apx/not.ll | 61 +-
llvm/test/CodeGen/X86/apx/or.ll | 582 ++++--
llvm/test/CodeGen/X86/apx/rol.ll | 253 ++-
llvm/test/CodeGen/X86/apx/ror.ll | 277 ++-
llvm/test/CodeGen/X86/apx/sar.ll | 277 ++-
llvm/test/CodeGen/X86/apx/sbb.ll | 480 ++++-
llvm/test/CodeGen/X86/apx/shl.ll | 257 ++-
llvm/test/CodeGen/X86/apx/shld.ll | 104 +-
llvm/test/CodeGen/X86/apx/shr.ll | 325 ++-
llvm/test/CodeGen/X86/apx/shrd.ll | 47 +-
llvm/test/CodeGen/X86/apx/sub.ll | 631 ++++--
llvm/test/CodeGen/X86/apx/xor.ll | 578 ++++--
llvm/test/CodeGen/X86/asm-modifier-error.ll | 6 +
llvm/test/CodeGen/X86/avx2-masked-gather.ll | 12 +-
llvm/test/CodeGen/X86/byte-constants.ll | 63 +
llvm/test/CodeGen/X86/cmp.ll | 2 +-
llvm/test/CodeGen/X86/divide-by-constant.ll | 6 +-
llvm/test/CodeGen/X86/divmod128.ll | 8 +-
llvm/test/CodeGen/X86/i1-fast-isel.ll | 106 +
.../CodeGen/X86/inline-asm-p-constraint.ll | 5 +-
llvm/test/CodeGen/X86/interleave-load-fold.ll | 17 +-
llvm/test/CodeGen/X86/known-never-zero.ll | 49 +
llvm/test/CodeGen/X86/known-pow2.ll | 157 +-
.../X86/masked_gather_scatter_widen.ll | 3 +-
llvm/test/CodeGen/X86/masked_load.ll | 3 +-
llvm/test/CodeGen/X86/masked_packss.ll | 18 +-
llvm/test/CodeGen/X86/masked_packus.ll | 24 +-
llvm/test/CodeGen/X86/masked_store.ll | 3 +-
llvm/test/CodeGen/X86/masked_store_trunc.ll | 12 +-
.../CodeGen/X86/masked_store_trunc_ssat.ll | 12 +-
.../CodeGen/X86/masked_store_trunc_usat.ll | 12 +-
llvm/test/CodeGen/X86/movbe.ll | 33 +-
.../CodeGen/X86/mul-lohi-no-implicit-copy.ll | 39 +
.../X86/non-foldable-with-the-same-mask.mir | 147 +-
llvm/test/CodeGen/X86/opt-pipeline.ll | 3 +
llvm/test/CodeGen/X86/packss.ll | 16 +-
llvm/test/CodeGen/X86/packus.ll | 22 +-
llvm/test/CodeGen/X86/shift-i512.ll | 221 +-
llvm/test/CodeGen/X86/tailcc-largecode.ll | 23 +-
.../X86/vector-shuffle-combining-avx512f.ll | 11 +
.../CodeGen/X86/vector-shuffle-combining.ll | 59 +-
.../NumericalStabilitySanitizer/basic.ll | 74 -
.../NumericalStabilitySanitizer/intrinsics.ll | 1809 +++++++++++++++++
.../NumericalStabilitySanitizer/libfuncs.ll | 1432 +++++++++++++
.../SanitizerCoverage/trace-pc-entry-exit.ll | 52 +
llvm/test/MC/AArch64/cyclone-movi-bug.s | 4 +-
llvm/test/MC/AsmParser/directive_seh.s | 2 +-
llvm/test/MC/AsmParser/invalid-asm-variant.s | 2 +-
.../MC/Disassembler/AArch64/arm64-crypto.txt | 2 +-
llvm/test/MC/SystemZ/insn-good-zos-pcrel.s | 2 +-
llvm/test/MC/X86/intel-syntax-32.s | 2 +-
llvm/test/MC/X86/intel-syntax-branch.s | 6 +-
.../MC/X86/intel-syntax-directional-label.s | 2 +-
llvm/test/MC/X86/intel-syntax-hex.s | 2 +-
llvm/test/MC/X86/intel-syntax-movabs-large.s | 2 +-
.../test/MC/X86/intel-syntax-unsized-memory.s | 2 +-
llvm/test/MC/X86/intel-syntax.s | 2 +-
llvm/test/MC/X86/pr32530.s | 2 +-
llvm/test/MC/X86/x86-32-ms-inline-asm.s | 2 +-
llvm/test/TableGen/x86-fold-tables.inc | 447 ++++
.../AArch64/lower-table-based-log2-basics.ll | 206 ++
.../X86/lower-table-based-log2-basics.ll | 220 ++
.../X86/lower-table-based-log2-negative.ll | 264 +++
.../Transforms/Attributor/nofpclass-frem.ll | 158 +-
llvm/test/Transforms/Attributor/nofpclass.ll | 24 +-
.../Transforms/ForcedFunctionAttrs/forced.ll | 115 +-
.../Inline/ret_attr_align_and_noundef.ll | 37 +
.../Transforms/Inline/ret_attr_nofpclass.ll | 39 +
llvm/test/Transforms/InstCombine/fadd.ll | 71 +
llvm/test/Transforms/InstCombine/frem-inf.ll | 27 +
llvm/test/Transforms/InstCombine/sub.ll | 52 +
.../Transforms/JumpThreading/thread-prob-8.ll | 1 -
.../Transforms/LoopSimplifyCFG/mssa_term.ll | 46 +
...conditional-scalar-assignment-fold-tail.ll | 7 +-
.../AArch64/epilog-iv-select-cmp.ll | 3 +-
.../AArch64/epilog-vectorization-factors.ll | 4 +-
.../epilog-vectorization-widen-inductions.ll | 106 +-
.../AArch64/f128-fmuladd-reduction.ll | 8 +-
.../AArch64/force-target-instruction-cost.ll | 8 +-
.../LoopVectorize/AArch64/induction-costs.ll | 12 +-
.../AArch64/interleave-with-runtime-checks.ll | 8 +-
.../LoopVectorize/AArch64/intrinsiccost.ll | 28 +-
.../AArch64/low_trip_count_predicates.ll | 4 +-
.../AArch64/neon-inloop-reductions.ll | 2 +-
.../partial-reduce-dot-product-epilogue.ll | 2 +-
.../partial-reduce-sub-epilogue-vec.ll | 2 +-
.../LoopVectorize/AArch64/sve-epilog-vect.ll | 8 +-
...-narrow-interleave-to-widen-memory-cost.ll | 14 +-
...row-interleave-to-widen-memory-scalable.ll | 46 +-
...to-widen-memory-with-wide-ops-and-casts.ll | 40 +-
.../LoopVectorize/AArch64/vector-reverse.ll | 2 +-
.../LoopVectorize/PowerPC/exit-branch-cost.ll | 2 +-
.../PowerPC/optimal-epilog-vectorization.ll | 4 +-
.../conditional-scalar-assignment-vplan.ll | 15 +-
.../LoopVectorize/VPlan/predicator.ll | 254 +++
.../LoopVectorize/X86/conversion-cost.ll | 3 +-
.../LoopVectorize/X86/cost-model.ll | 2 +-
.../X86/epilog-vectorization-inductions.ll | 7 +-
.../LoopVectorize/X86/float-induction-x86.ll | 10 +-
.../LoopVectorize/X86/gather_scatter.ll | 6 +-
.../LoopVectorize/X86/intrinsiccost.ll | 28 +-
.../LoopVectorize/X86/scatter_crash.ll | 14 +-
...interleave-to-widen-memory-epilogue-vec.ll | 3 +-
...sform-narrow-interleave-to-widen-memory.ll | 3 +-
.../LoopVectorize/epilog-iv-select-cmp.ll | 5 +-
.../epilog-vectorization-any-of-reductions.ll | 7 +-
.../epilog-vectorization-reductions.ll | 99 +-
.../optimal-epilog-vectorization.ll | 8 +-
...28062-interleaved-accesses-narrow-group.ll | 29 +-
.../Transforms/LoopVectorize/predicator.ll | 236 +++
llvm/test/Transforms/PhaseOrdering/X86/avg.ll | 1355 ++++++++++++
.../X86/pr48844-br-to-switch-vectorization.ll | 4 +-
.../PhaseOrdering/X86/vector-reductions.ll | 9 +-
.../SLPVectorizer/X86/non-power-of-2-bswap.ll | 26 +
.../reduced-bswap-with-larger-reduced-type.ll | 30 +
.../SLPVectorizer/non-power-of-2-bswap.ll | 24 +-
.../Transforms/SimplifyCFG/extract-cost.ll | 6 +-
.../switch-transformations-no-lut.ll | 6 +-
.../StructurizeCFG/hoist-zerocost-nested.ll | 151 ++
llvm/test/Verifier/atomics.ll | 4 +-
llvm/test/lit.cfg.py | 2 +
.../X86/callsite-in-lexical-block.s | 104 +
.../llvm-ir2vec/bindings/ir2vec-bindings.py | 126 --
.../llvm-ir2vec/bindings/ir2vec-exceptions.py | 38 -
.../bindings/ir2vec-getBBEmbMap.py | 25 +
.../llvm-ir2vec/bindings/ir2vec-getFuncEmb.py | 21 +
.../bindings/ir2vec-getFuncEmbMap.py | 18 +
.../bindings/ir2vec-getFuncNames.py | 19 +
.../bindings/ir2vec-getInstEmbMap.py | 27 +
.../bindings/ir2vec-initEmbedding.py | 62 +
llvm/test/tools/llvm-ir2vec/entities.ll | 73 +-
llvm/test/tools/llvm-ir2vec/triplets.ll | 38 +-
llvm/test/tools/llvm-mc/x86-asm-syntax.test | 10 +
llvm/tools/llvm-c-test/echo.cpp | 8 +-
.../tools/llvm-gpu-loader/llvm-gpu-loader.cpp | 3 +
llvm/tools/llvm-link/llvm-link.cpp | 9 +-
llvm/tools/llvm-mc/llvm-mc.cpp | 11 +-
llvm/unittests/ADT/CMakeLists.txt | 1 +
llvm/unittests/ADT/RepeatedTest.cpp | 100 +
.../Analysis/BasicAliasAnalysisTest.cpp | 2 +
llvm/unittests/Analysis/CFGTest.cpp | 9 +-
.../unittests/Analysis/DomTreeUpdaterTest.cpp | 1 +
llvm/unittests/Analysis/IR2VecTest.cpp | 29 +-
llvm/unittests/Analysis/MemorySSATest.cpp | 33 +
.../Frontend/OpenMPIRBuilderTest.cpp | 180 ++
llvm/unittests/IR/IRBuilderTest.cpp | 21 +
llvm/unittests/IR/InstructionsTest.cpp | 14 +-
llvm/unittests/SandboxIR/SandboxIRTest.cpp | 6 +-
llvm/unittests/SandboxIR/TrackerTest.cpp | 2 +-
llvm/unittests/Support/YAMLIOTest.cpp | 2 +-
.../Target/AMDGPU/AMDGPUUnitTests.cpp | 6 +
.../DirectX/ResourceBindingAnalysisTests.cpp | 2 +-
.../IPO/FunctionSpecializationTest.cpp | 2 +-
.../Transforms/Scalar/LoopPassManagerTest.cpp | 26 +-
llvm/unittests/Transforms/Utils/LocalTest.cpp | 11 +-
.../Transforms/Utils/LoopUtilsTest.cpp | 3 +-
.../Transforms/Utils/ProfDataUtilTest.cpp | 4 +-
.../Transforms/Utils/SSAUpdaterBulkTest.cpp | 2 +
.../Utils/ScalarEvolutionExpanderTest.cpp | 6 +-
.../Transforms/Vectorize/VPlanTestBase.h | 4 +-
.../TableGen/Common/CodeGenDAGPatterns.cpp | 5 +-
llvm/utils/TableGen/X86FoldTablesEmitter.cpp | 61 +
llvm/utils/TableGen/X86ManualFoldTables.def | 105 +
llvm/utils/git-llvm-push | 56 +-
.../clang/include/clang/Config/BUILD.gn | 2 +
.../gn/secondary/clang/lib/Driver/BUILD.gn | 2 +
.../secondary/clang/lib/FrontendTool/BUILD.gn | 2 +
.../Frontend/BUILD.gn | 15 +
llvm/utils/gn/secondary/clang/test/BUILD.gn | 5 +-
.../BUILD.gn | 0
.../BUILD.gn | 0
.../ScalableStaticAnalysisFramework/BUILD.gn | 2 +
.../secondary/lldb/include/lldb/Host/BUILD.gn | 1 +
llvm/utils/gn/secondary/lldb/test/BUILD.gn | 2 +
llvm/utils/lit/lit/TestRunner.py | 8 +-
llvm/utils/release/build_llvm_release.bat | 60 +-
mlir/docs/Dialects/TOSA.md | 20 +
mlir/include/mlir-c/Dialect/LLVM.h | 78 +
.../include/mlir/Analysis/Presburger/Matrix.h | 12 +
mlir/include/mlir/Bindings/Python/Globals.h | 4 +-
mlir/include/mlir/Dialect/Arith/Utils/Utils.h | 5 +-
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 1 +
.../mlir/Dialect/LLVMIR/LLVMAttrDefs.td | 71 +
mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 49 +
mlir/include/mlir/Dialect/Linalg/Passes.td | 11 +-
.../Linalg/TransformOps/LinalgTransformOps.td | 10 +-
.../Dialect/Linalg/Transforms/Transforms.h | 61 +-
.../mlir/Dialect/OpenACC/OpenACCOps.td | 3 +-
mlir/include/mlir/Dialect/SCF/IR/SCFOps.td | 5 +-
.../mlir/Dialect/Tosa/IR/TosaOpBase.td | 8 +-
mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 203 +-
.../include/mlir/IR/BuiltinDialectBytecode.td | 7 +-
mlir/include/mlir/IR/ExtensibleDialect.h | 2 +-
.../mlir/Interfaces/ControlFlowInterfaces.td | 19 +-
mlir/lib/Analysis/Presburger/Matrix.cpp | 163 ++
mlir/lib/Bindings/Python/DialectLLVM.cpp | 166 ++
mlir/lib/Bindings/Python/Globals.cpp | 4 +-
mlir/lib/Bindings/Python/IRCore.cpp | 3 +-
mlir/lib/CAPI/Dialect/LLVM.cpp | 91 +
.../Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp | 3 +
.../VectorToLLVM/ConvertVectorToLLVM.cpp | 2 +
.../Transforms/EmulateUnsupportedFloats.cpp | 14 +-
mlir/lib/Dialect/Arith/Utils/CMakeLists.txt | 1 +
mlir/lib/Dialect/Arith/Utils/Utils.cpp | 32 +-
.../FuncBufferizableOpInterfaceImpl.cpp | 14 +-
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 162 +-
.../Dialect/Linalg/IR/LinalgInterfaces.cpp | 6 +-
.../TransformOps/LinalgTransformOps.cpp | 32 +-
.../Linalg/Transforms/FoldIntoElementwise.cpp | 44 +-
.../Dialect/Linalg/Transforms/Specialize.cpp | 39 +-
.../Dialect/Linalg/Transforms/Transforms.cpp | 448 ++--
.../Transforms/ExtendToSupportedTypes.cpp | 13 +-
mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 25 +
mlir/lib/IR/BuiltinDialectBytecode.cpp | 22 +-
mlir/lib/IR/Diagnostics.cpp | 14 +-
mlir/lib/Interfaces/ControlFlowInterfaces.cpp | 100 +-
.../LLVMIR/LLVMToLLVMIRTranslation.cpp | 49 +
.../OpenMP/OpenMPToLLVMIRTranslation.cpp | 329 ++-
mlir/python/mlir/dialects/ext.py | 39 +-
mlir/python/mlir/dialects/llvm.py | 15 +-
.../invalid-dense-elem-type-interface.mlir | 15 -
.../invalid/invalid-type-remapping.mlir | 55 +
.../vector-to-llvm-interface.mlir | 27 +
...dule-bufferize-call-copy-before-write.mlir | 16 +
mlir/test/Dialect/GPU/invalid.mlir | 18 +-
mlir/test/Dialect/LLVMIR/roundtrip.mlir | 36 +
.../test/Dialect/Linalg/elementwise/fold.mlir | 218 +-
.../Linalg/linalg-morph-multi-step.mlir | 42 +-
.../roundtrip-morphism-linalg-named-ops.mlir | 52 +-
.../Linalg/specialize-generic-ops.mlir | 133 +-
.../Linalg/transform-op-decompose.mlir | 166 +-
...ransform-op-specialize-elemwise-unary.mlir | 105 +-
mlir/test/Dialect/OpenMP/invalid.mlir | 42 +
mlir/test/Dialect/OpenMP/ops.mlir | 18 +
mlir/test/Dialect/Vector/invalid.mlir | 16 +
.../Target/LLVMIR/llvmir-named-metadata.mlir | 45 +
mlir/test/Target/LLVMIR/openmp-iterator.mlir | 295 +++
mlir/test/Target/LLVMIR/openmp-llvm.mlir | 34 +
mlir/test/Target/LLVMIR/openmp-todo.mlir | 12 -
mlir/test/lib/Dialect/Test/TestOpDefs.cpp | 9 +
mlir/test/lib/Dialect/Test/TestOps.td | 21 +-
.../expected-unknown-loc-unmatched.mlir | 9 +
mlir/test/python/dialects/llvm.py | 137 ++
.../python/dialects/transform_op_interface.py | 8 +-
...ansform_pattern_descriptor_op_interface.py | 4 +-
.../Analysis/Presburger/MatrixTest.cpp | 51 +
offload/cmake/caches/AMDGPUBot.cmake | 2 +
openmp/runtime/src/include/omp-tools.h.var | 1 +
openmp/runtime/src/kmp_tasking.cpp | 12 +
orc-rt/docs/Design.md | 23 +-
orc-rt/include/CMakeLists.txt | 6 +-
orc-rt/include/orc-rt/ControllerInterface.h | 81 +
orc-rt/include/orc-rt/LockedAccess.h | 86 +
orc-rt/include/orc-rt/ResourceManager.h | 44 -
orc-rt/include/orc-rt/SPSWrapperFunction.h | 7 +-
orc-rt/include/orc-rt/Service.h | 47 +
orc-rt/include/orc-rt/Session.h | 40 +-
orc-rt/include/orc-rt/SimpleNativeMemoryMap.h | 27 +-
orc-rt/include/orc-rt/iterator_range.h | 47 +
orc-rt/include/orc-rt/sps-ci/AllSPSCI.h | 26 +
.../sps-ci/SimpleNativeMemoryMapSPSCI.h | 25 +
orc-rt/lib/executor/CMakeLists.txt | 5 +-
orc-rt/lib/executor/ControllerInterface.cpp | 34 +
.../{ResourceManager.cpp => Service.cpp} | 8 +-
orc-rt/lib/executor/Session.cpp | 34 +-
orc-rt/lib/executor/SimpleNativeMemoryMap.cpp | 96 +-
orc-rt/lib/executor/sps-ci/AllSPSCI.cpp | 28 +
.../sps-ci/SimpleNativeMemoryMapSPSCI.cpp | 104 +
orc-rt/unittests/CMakeLists.txt | 5 +
orc-rt/unittests/CommonTestUtils.h | 16 +
orc-rt/unittests/ControllerInterfaceTest.cpp | 146 ++
orc-rt/unittests/LockedAccessTest.cpp | 151 ++
orc-rt/unittests/SessionTest.cpp | 91 +-
.../SimpleNativeMemoryMapSPSCITest.cpp | 323 +++
.../unittests/SimpleNativeMemoryMapTest.cpp | 262 +--
orc-rt/unittests/iterator_range-test.cpp | 79 +
polly/lib/Analysis/ScopBuilder.cpp | 2 +-
polly/lib/CodeGen/LoopGeneratorsGOMP.cpp | 3 +
polly/lib/CodeGen/LoopGeneratorsKMP.cpp | 3 +
.../llvm-project-overlay/clang/BUILD.bazel | 61 +
.../clang/unittests/BUILD.bazel | 27 +
.../llvm-project-overlay/libc/BUILD.bazel | 59 +
.../llvm-project-overlay/lldb/BUILD.bazel | 3 +
.../llvm-project-overlay/mlir/BUILD.bazel | 1 +
utils/bazel/llvm_configs/config.h.cmake | 3 +
1325 files changed, 45267 insertions(+), 16389 deletions(-)
create mode 100644 clang-tools-extra/clang-tidy/readability/RedundantQualifiedAliasCheck.cpp
create mode 100644 clang-tools-extra/clang-tidy/readability/RedundantQualifiedAliasCheck.h
create mode 100644 clang-tools-extra/docs/clang-tidy/checks/readability/redundant-qualified-alias.rst
rename clang-tools-extra/test/clang-tidy/checkers/{abseil/Inputs => Inputs/Headers/std}/type_traits (94%)
delete mode 100644 clang-tools-extra/test/clang-tidy/checkers/abseil/Inputs/cstddef.h
delete mode 100644 clang-tools-extra/test/clang-tidy/checkers/abseil/Inputs/initializer_list
delete mode 100644 clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/replace-auto-ptr/memory.h
create mode 100644 clang-tools-extra/test/clang-tidy/checkers/readability/redundant-qualified-alias.cpp
create mode 100644 clang/docs/LifetimeSafety.rst
create mode 100644 clang/include/clang/ScalableStaticAnalysisFramework/Frontend/TUSummaryExtractorFrontendAction.h
create mode 100644 clang/include/clang/ScalableStaticAnalysisFramework/SSAFBuiltinForceLinker.h
create mode 100644 clang/include/clang/ScalableStaticAnalysisFramework/SSAFForceLinker.h
delete mode 100644 clang/lib/AST/ByteCode/FunctionPointer.cpp
delete mode 100644 clang/lib/AST/ByteCode/FunctionPointer.h
create mode 100644 clang/lib/ScalableStaticAnalysisFramework/Frontend/CMakeLists.txt
create mode 100644 clang/lib/ScalableStaticAnalysisFramework/Frontend/TUSummaryExtractorFrontendAction.cpp
create mode 100644 clang/lib/Sema/SemaLifetimeSafety.h
create mode 100644 clang/test/Analysis/Scalable/command-line-interface.cpp
create mode 100644 clang/test/Analysis/Scalable/downgradable-errors.cpp
create mode 100644 clang/test/Analysis/Scalable/help.cpp
create mode 100644 clang/test/Analysis/clang-suppress/class-template-specializations.cpp
create mode 100644 clang/test/Analysis/clang-suppress/classes.cpp
create mode 100644 clang/test/Analysis/clang-suppress/diagnostic-identifiers.cpp
create mode 100644 clang/test/Analysis/clang-suppress/friends.cpp
create mode 100644 clang/test/Analysis/clang-suppress/function-templates.cpp
create mode 100644 clang/test/Analysis/clang-suppress/lambdas.cpp
create mode 100644 clang/test/Analysis/clang-suppress/macros.cpp
create mode 100644 clang/test/Analysis/clang-suppress/namespaces.cpp
create mode 100644 clang/test/Analysis/clang-suppress/nested-templates.cpp
create mode 100644 clang/test/Analysis/clang-suppress/statements.cpp
create mode 100644 clang/test/Analysis/clang-suppress/template-methods.cpp
delete mode 100644 clang/test/Analysis/suppression-attr.cpp
create mode 100644 clang/test/C/C2y/n3517.c
create mode 100644 clang/test/C/C2y/n3652.c
create mode 100644 clang/test/C/C2y/n3715.c
create mode 100644 clang/test/CIR/Lowering/binop-int-vector.cir
create mode 100644 clang/test/CIR/Transforms/binop-traits.cir
create mode 100644 clang/test/CIR/Transforms/max-min-idempotent.cir
create mode 100644 clang/test/CodeGen/AArch64/neon/bf16-getset.c
create mode 100644 clang/test/CodeGenCXX/microsoft-vector-deleting-dtors-new-array.cpp
create mode 100644 clang/test/CodeGenSPIRV/global-dtor.cpp
create mode 100644 clang/test/Driver/darwin-ld-platform-version-macos-nosdk.c
create mode 100644 clang/test/Driver/print-enabled-extensions/riscv-sifive-x160.c
create mode 100644 clang/test/Driver/print-enabled-extensions/riscv-sifive-x180.c
create mode 100644 clang/test/Driver/print-enabled-extensions/riscv-spacemit-x60.c
create mode 100644 clang/test/Driver/xcselect.c
create mode 100644 clang/test/Modules/pr170099.cppm
create mode 100644 clang/test/Modules/pr186603.cppm
create mode 100644 clang/test/SemaCXX/deleted-template-spec-diag.cpp
create mode 100644 clang/test/SemaCXX/dllexport-constrained-inherited-ctor.cpp
create mode 100644 clang/test/SemaTemplate/GH176152.cpp
rename clang/tools/{ssaf-format => clang-ssaf-format}/CMakeLists.txt (100%)
rename clang/tools/{ssaf-format => clang-ssaf-format}/SSAFFormat.cpp (98%)
rename clang/tools/{ssaf-linker => clang-ssaf-linker}/CMakeLists.txt (100%)
rename clang/tools/{ssaf-linker => clang-ssaf-linker}/SSAFLinker.cpp (96%)
create mode 100644 clang/unittests/ScalableStaticAnalysisFramework/Frontend/TUSummaryExtractorFrontendActionTest.cpp
create mode 100644 clang/unittests/ScalableStaticAnalysisFramework/SSAFBuiltinTestForceLinker.h
create mode 100644 clang/unittests/ScalableStaticAnalysisFramework/SSAFTestForceLinker.h
create mode 100644 flang/test/Lower/OpenMP/DelayedPrivatization/target-private-adjustable-array.f90
create mode 100644 flang/test/Lower/OpenMP/Todo/interchange.f90
delete mode 100644 flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf
create mode 100644 flang/test/Parser/OpenMP/do-interchange.f90
create mode 100644 flang/test/Parser/OpenMP/interchange-fail.f90
create mode 100644 flang/test/Parser/OpenMP/interchange-permutation.f90
create mode 100644 flang/test/Parser/OpenMP/interchange.f90
create mode 100644 flang/test/Parser/shared-line-program-units.f90
create mode 100644 flang/test/Parser/shared-line-program-units.reject.0.f90
create mode 100644 flang/test/Parser/shared-line-program-units.reject.1.f90
create mode 100644 flang/test/Semantics/OpenMP/interchange-permutation.f90
create mode 100644 flang/test/Semantics/OpenMP/interchange01.f90
create mode 100644 flang/test/Semantics/negate-literal-typedexpr.f90
create mode 100644 libc/shared/math/atanpif16.h
create mode 100644 libc/shared/math/bf16fma.h
create mode 100644 libc/shared/math/log_bf16.h
create mode 100644 libc/src/__support/math/atanpif16.h
create mode 100644 libc/src/__support/math/bf16fma.h
create mode 100644 libc/src/__support/math/log_bf16.h
create mode 100644 libclc/clc/include/clc/math/clc_exp2_fast.h
create mode 100644 libclc/clc/include/clc/math/clc_log2_fast.h
create mode 100644 libclc/clc/include/clc/shared/binary_def_scalarize_loop.inc
create mode 100644 libclc/clc/lib/amdgpu/math/clc_exp2_fast.cl
create mode 100644 libclc/clc/lib/amdgpu/math/clc_log2_fast.cl
create mode 100644 libclc/clc/lib/generic/math/clc_exp2_fast.cl
create mode 100644 libclc/clc/lib/generic/math/clc_log2_fast.cl
delete mode 100644 libclc/clc/lib/generic/math/clc_pow.inc
create mode 100644 libclc/clc/lib/generic/math/clc_pow_base.inc
delete mode 100644 libclc/clc/lib/generic/math/clc_pown.inc
delete mode 100644 libclc/clc/lib/generic/math/clc_powr.inc
delete mode 100644 libclc/clc/lib/generic/math/clc_rootn.inc
rename libclc/clc/lib/{ptx-nvidiacl => generic}/mem_fence/clc_mem_fence.cl (83%)
create mode 100644 libcxx/utils/ci/lnt/runners/README.md
create mode 100755 libcxx/utils/ci/lnt/runners/apple-m5-clang21
create mode 100755 libcxx/utils/ci/lnt/runners/apple-m5-xcode26
create mode 100644 lld/test/ELF/loongarch-relax-synthetic-in-text.s
create mode 100644 lld/test/ELF/riscv-relax-synthetic-in-text.s
create mode 100644 lldb/source/Plugins/SymbolLocator/SymStore/CMakeLists.txt
create mode 100644 lldb/source/Plugins/SymbolLocator/SymStore/SymbolLocatorSymStore.cpp
create mode 100644 lldb/source/Plugins/SymbolLocator/SymStore/SymbolLocatorSymStore.h
create mode 100644 lldb/source/Plugins/SymbolLocator/SymStore/SymbolLocatorSymStoreProperties.td
create mode 100644 lldb/test/API/symstore/Makefile
create mode 100644 lldb/test/API/symstore/TestSymStoreLocal.py
create mode 100644 lldb/test/API/symstore/main.c
create mode 100644 lldb/test/Shell/Platform/AutoLoad/Darwin/dsym-python-script-name-warnings.test
create mode 100644 lldb/tools/driver/lldb-mte-entitlements.plist
create mode 100644 lldb/unittests/Platform/TestUtils.cpp
create mode 100644 lldb/unittests/Platform/TestUtils.h
create mode 100644 llvm/include/llvm/ADT/Repeated.h
create mode 100644 llvm/lib/IR/PatternMatch.cpp
create mode 100644 llvm/lib/Target/PowerPC/PPCRegisterClasses.td
create mode 100644 llvm/test/Analysis/DependenceAnalysis/weak-crossing-siv-delta-signed-min.ll
create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/invariant-dep-same-ptr.ll
create mode 100644 llvm/test/Analysis/ScalarEvolution/two-loop-latches.ll
create mode 100644 llvm/test/Assembler/byte-invalid.ll
create mode 100644 llvm/test/Assembler/byte.ll
create mode 100644 llvm/test/Bindings/llvm-c/byte.ll
create mode 100644 llvm/test/CodeGen/AArch64/ccmp-look-through-copy.mir
create mode 100644 llvm/test/CodeGen/AArch64/sve2p3-dots-partial-reduction.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw-fmin-fmax.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.strictwqm.ll
create mode 100644 llvm/test/CodeGen/AVR/pseudo/SPREAD.mir
create mode 100644 llvm/test/CodeGen/MIR/Generic/inline-asm-extra-info.mir
create mode 100644 llvm/test/CodeGen/MIR/Generic/inline-asm-unknown-kind.mir
create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-resources/unbounded-arr.ll
create mode 100644 llvm/test/CodeGen/SPIRV/memory-model-md-glsl450.ll
create mode 100644 llvm/test/CodeGen/SPIRV/memory-model-md-opencl.ll
create mode 100644 llvm/test/CodeGen/SPIRV/memory-model-md-shader.ll
create mode 100644 llvm/test/CodeGen/SPIRV/memory-model-md-unknown.ll
create mode 100644 llvm/test/CodeGen/SPIRV/memory-model-md-vulkan.ll
create mode 100644 llvm/test/CodeGen/SPIRV/pointers/getelementptr-vector-index.ll
create mode 100644 llvm/test/CodeGen/SPIRV/pointers/load-vector-from-array-of-vectors.ll
create mode 100644 llvm/test/CodeGen/SPIRV/single-element-vector-nested-aggregate.ll
create mode 100644 llvm/test/CodeGen/SPIRV/undef-global-aggregate-initializer.ll
create mode 100644 llvm/test/CodeGen/X86/byte-constants.ll
create mode 100644 llvm/test/CodeGen/X86/i1-fast-isel.ll
create mode 100644 llvm/test/CodeGen/X86/mul-lohi-no-implicit-copy.ll
create mode 100644 llvm/test/Instrumentation/NumericalStabilitySanitizer/intrinsics.ll
create mode 100644 llvm/test/Instrumentation/NumericalStabilitySanitizer/libfuncs.ll
create mode 100644 llvm/test/Instrumentation/SanitizerCoverage/trace-pc-entry-exit.ll
create mode 100644 llvm/test/Transforms/AggressiveInstCombine/AArch64/lower-table-based-log2-basics.ll
create mode 100644 llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
create mode 100644 llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-negative.ll
create mode 100644 llvm/test/Transforms/InstCombine/frem-inf.ll
create mode 100644 llvm/test/Transforms/LoopSimplifyCFG/mssa_term.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/predicator.ll
create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/avg.ll
create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-bswap.ll
create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/reduced-bswap-with-larger-reduced-type.ll
create mode 100644 llvm/test/Transforms/StructurizeCFG/hoist-zerocost-nested.ll
create mode 100644 llvm/test/tools/llvm-dwarfdump/X86/callsite-in-lexical-block.s
delete mode 100644 llvm/test/tools/llvm-ir2vec/bindings/ir2vec-bindings.py
delete mode 100644 llvm/test/tools/llvm-ir2vec/bindings/ir2vec-exceptions.py
create mode 100644 llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getBBEmbMap.py
create mode 100644 llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getFuncEmb.py
create mode 100644 llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getFuncEmbMap.py
create mode 100644 llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getFuncNames.py
create mode 100644 llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getInstEmbMap.py
create mode 100644 llvm/test/tools/llvm-ir2vec/bindings/ir2vec-initEmbedding.py
create mode 100644 llvm/test/tools/llvm-mc/x86-asm-syntax.test
create mode 100644 llvm/unittests/ADT/RepeatedTest.cpp
create mode 100644 llvm/utils/gn/secondary/clang/lib/ScalableStaticAnalysisFramework/Frontend/BUILD.gn
rename llvm/utils/gn/secondary/clang/tools/{ssaf-format => clang-ssaf-format}/BUILD.gn (100%)
rename llvm/utils/gn/secondary/clang/tools/{ssaf-linker => clang-ssaf-linker}/BUILD.gn (100%)
delete mode 100644 mlir/test/Bytecode/invalid/invalid-dense-elem-type-interface.mlir
create mode 100644 mlir/test/Bytecode/invalid/invalid-type-remapping.mlir
create mode 100644 mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-call-copy-before-write.mlir
create mode 100644 mlir/test/Target/LLVMIR/llvmir-named-metadata.mlir
create mode 100644 mlir/test/Target/LLVMIR/openmp-iterator.mlir
create mode 100644 mlir/test/mlir-opt/expected-unknown-loc-unmatched.mlir
create mode 100644 orc-rt/include/orc-rt/ControllerInterface.h
create mode 100644 orc-rt/include/orc-rt/LockedAccess.h
delete mode 100644 orc-rt/include/orc-rt/ResourceManager.h
create mode 100644 orc-rt/include/orc-rt/Service.h
create mode 100644 orc-rt/include/orc-rt/iterator_range.h
create mode 100644 orc-rt/include/orc-rt/sps-ci/AllSPSCI.h
create mode 100644 orc-rt/include/orc-rt/sps-ci/SimpleNativeMemoryMapSPSCI.h
create mode 100644 orc-rt/lib/executor/ControllerInterface.cpp
rename orc-rt/lib/executor/{ResourceManager.cpp => Service.cpp} (65%)
create mode 100644 orc-rt/lib/executor/sps-ci/AllSPSCI.cpp
create mode 100644 orc-rt/lib/executor/sps-ci/SimpleNativeMemoryMapSPSCI.cpp
create mode 100644 orc-rt/unittests/ControllerInterfaceTest.cpp
create mode 100644 orc-rt/unittests/LockedAccessTest.cpp
create mode 100644 orc-rt/unittests/SimpleNativeMemoryMapSPSCITest.cpp
create mode 100644 orc-rt/unittests/iterator_range-test.cpp
diff --git a/.ci/compute_projects.py b/.ci/compute_projects.py
index 541b33f5034b1..9c7ad73710ae2 100644
--- a/.ci/compute_projects.py
+++ b/.ci/compute_projects.py
@@ -81,6 +81,7 @@
"clang": {"compiler-rt"},
"clang-tools-extra": {"libc"},
"libc": {"libc"},
+ "libclc": {"libclc"},
"compiler-rt": {"compiler-rt"},
"flang": {"flang-rt"},
"flang-rt": {"flang-rt"},
@@ -146,6 +147,7 @@
"flang": "check-flang",
"flang-rt": "check-flang-rt",
"libc": "check-libc",
+ "libclc": "check-libclc",
"lld": "check-lld",
"lldb": "check-lldb",
"mlir": "check-mlir",
@@ -154,7 +156,15 @@
"lit": "check-lit",
}
-RUNTIMES = {"libcxx", "libcxxabi", "libunwind", "compiler-rt", "libc", "flang-rt"}
+RUNTIMES = {
+ "libcxx",
+ "libcxxabi",
+ "libunwind",
+ "compiler-rt",
+ "libc",
+ "flang-rt",
+ "libclc",
+}
# Meta projects are projects that need explicit handling but do not reside
# in their own top level folder. To add a meta project, the start of the path
diff --git a/.ci/compute_projects_test.py b/.ci/compute_projects_test.py
index f0abdd708ca42..3069c66940c3a 100644
--- a/.ci/compute_projects_test.py
+++ b/.ci/compute_projects_test.py
@@ -259,6 +259,16 @@ def test_include_libc_in_runtimes(self):
self.assertEqual(env_variables["runtimes_check_targets"], "check-libc")
self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
+ def test_include_libclc_in_runtimes(self):
+ env_variables = compute_projects.get_env_variables(
+ ["libclc/CMakeLists.txt"], "Linux"
+ )
+ self.assertEqual(env_variables["projects_to_build"], "clang;llvm")
+ self.assertEqual(env_variables["project_check_targets"], "")
+ self.assertEqual(env_variables["runtimes_to_build"], "libclc")
+ self.assertEqual(env_variables["runtimes_check_targets"], "check-libclc")
+ self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
+
def test_exclude_docs(self):
env_variables = compute_projects.get_env_variables(
["llvm/docs/CIBestPractices.rst"], "Linux"
@@ -297,7 +307,7 @@ def test_ci(self):
)
self.assertEqual(
env_variables["runtimes_check_targets"],
- "check-compiler-rt check-flang-rt check-libc",
+ "check-compiler-rt check-flang-rt check-libc check-libclc",
)
self.assertEqual(
env_variables["runtimes_check_targets_needs_reconfig"],
@@ -322,7 +332,7 @@ def test_windows_ci(self):
)
self.assertEqual(
env_variables["runtimes_check_targets"],
- "check-compiler-rt",
+ "check-compiler-rt check-libclc",
)
self.assertEqual(
env_variables["runtimes_check_targets_needs_reconfig"],
@@ -371,7 +381,7 @@ def test_premerge_workflow(self):
)
self.assertEqual(
env_variables["runtimes_check_targets"],
- "check-compiler-rt check-flang-rt check-libc",
+ "check-compiler-rt check-flang-rt check-libc check-libclc",
)
self.assertEqual(
env_variables["runtimes_check_targets_needs_reconfig"],
@@ -406,7 +416,7 @@ def test_third_party_benchmark(self):
)
self.assertEqual(
env_variables["runtimes_check_targets"],
- "check-compiler-rt check-flang-rt check-libc",
+ "check-compiler-rt check-flang-rt check-libc check-libclc",
)
self.assertEqual(
env_variables["runtimes_check_targets_needs_reconfig"],
diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh
index 217da893e947a..79fc891b729d7 100755
--- a/.ci/monolithic-linux.sh
+++ b/.ci/monolithic-linux.sh
@@ -56,6 +56,7 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
-D CMAKE_CXX_COMPILER_LAUNCHER=sccache \
-D CMAKE_DISABLE_PRECOMPILE_HEADERS=ON \
-D LIBCXX_CXX_ABI=libcxxabi \
+ -D LIBCLC_TARGETS_TO_BUILD="amdgcn-amd-amdhsa-llvm" \
-D MLIR_ENABLE_BINDINGS_PYTHON=ON \
-D LLDB_ENABLE_PYTHON=ON \
-D LLDB_ENFORCE_STRICT_TEST_REQUIREMENTS=ON \
diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh
index a04c1ff6e41e6..f35f17350022f 100755
--- a/.ci/monolithic-windows.sh
+++ b/.ci/monolithic-windows.sh
@@ -45,11 +45,11 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \
-D CMAKE_CXX_COMPILER_LAUNCHER=sccache \
-D CMAKE_DISABLE_PRECOMPILE_HEADERS=ON \
-D MLIR_ENABLE_BINDINGS_PYTHON=ON \
+ -D LIBCLC_TARGETS_TO_BUILD="amdgcn-amd-amdhsa-llvm" \
-D CMAKE_EXE_LINKER_FLAGS="/MANIFEST:NO" \
-D CMAKE_MODULE_LINKER_FLAGS="/MANIFEST:NO" \
-D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO" \
- -D LLVM_ENABLE_RUNTIMES="${runtimes}" \
- -D LLVM_PARALLEL_LINK_JOBS=16
+ -D LLVM_ENABLE_RUNTIMES="${runtimes}"
start-group "ninja"
diff --git a/.github/actions/build-container/action.yml b/.github/actions/build-container/action.yml
index 1a4705348abca..8d02c041061c1 100644
--- a/.github/actions/build-container/action.yml
+++ b/.github/actions/build-container/action.yml
@@ -78,7 +78,7 @@ runs:
echo "container-full-name=$container_name" >> $GITHUB_OUTPUT
- name: Create container artifact
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: ${{ inputs.container-name }}-${{ runner.arch }}
path: "*.tar"
diff --git a/.github/actions/push-container/action.yml b/.github/actions/push-container/action.yml
index 6dc364d83c852..ca017a07f7250 100644
--- a/.github/actions/push-container/action.yml
+++ b/.github/actions/push-container/action.yml
@@ -12,7 +12,7 @@ runs:
using: "composite"
steps:
- name: Download container
- uses: actions/download-artifact at 37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+ uses: actions/download-artifact at 3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
- name: Push Container
env:
diff --git a/.github/workflows/build-ci-container-windows.yml b/.github/workflows/build-ci-container-windows.yml
index d120c812bfc6a..799d58c81ab74 100644
--- a/.github/workflows/build-ci-container-windows.yml
+++ b/.github/workflows/build-ci-container-windows.yml
@@ -44,7 +44,7 @@ jobs:
run: |
docker save ${{ steps.vars.outputs.container-name-tag }} > ${{ steps.vars.outputs.container-filename }}
- name: Upload container image
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: container
path: ${{ steps.vars.outputs.container-filename }}
@@ -61,7 +61,7 @@ jobs:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
steps:
- name: Download container
- uses: actions/download-artifact at 37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+ uses: actions/download-artifact at 3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
with:
name: container
- name: Push Container
diff --git a/.github/workflows/ci-post-commit-analyzer.yml b/.github/workflows/ci-post-commit-analyzer.yml
index fa41a39a4f016..c22fb83fea2ab 100644
--- a/.github/workflows/ci-post-commit-analyzer.yml
+++ b/.github/workflows/ci-post-commit-analyzer.yml
@@ -87,7 +87,7 @@ jobs:
scan-build --generate-index-only build/analyzer-results
- name: Upload Results
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
if: always()
with:
name: analyzer-results
diff --git a/.github/workflows/commit-access-review.yml b/.github/workflows/commit-access-review.yml
index d860b0dce0c30..cbf20bbecd36d 100644
--- a/.github/workflows/commit-access-review.yml
+++ b/.github/workflows/commit-access-review.yml
@@ -39,7 +39,7 @@ jobs:
python3 .github/workflows/commit-access-review.py $GITHUB_TOKEN
- name: Upload Triage List
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: triagers
path: triagers.log
diff --git a/.github/workflows/containers/libc/Dockerfile b/.github/workflows/containers/libc/Dockerfile
index e5a628f7ea8ca..232d129946daa 100644
--- a/.github/workflows/containers/libc/Dockerfile
+++ b/.github/workflows/containers/libc/Dockerfile
@@ -19,8 +19,11 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
+# TODO(boomanaiden154): Remove the LLVM 21 installation once we are no longer
+# using it in the libc fullbuild tests workflow.
RUN wget https://apt.llvm.org/llvm.sh && \
chmod +x llvm.sh && \
+ sudo ./llvm.sh 23 && \
sudo ./llvm.sh 21 && \
rm llvm.sh
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 11a453b84418e..a94f9b5a8daac 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -209,7 +209,7 @@ jobs:
mkdir built-docs/flang
cp -r flang-build/docs/* built-docs/flang/
- name: Upload docs
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: docs-output
path: built-docs/
diff --git a/.github/workflows/email-check.yaml b/.github/workflows/email-check.yaml
index 00e528e7e1354..5d8ca8da77a41 100644
--- a/.github/workflows/email-check.yaml
+++ b/.github/workflows/email-check.yaml
@@ -39,7 +39,7 @@ jobs:
[{"body" : "$COMMENT"}]
EOF
- - uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ - uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
if: always()
with:
name: workflow-args
diff --git a/.github/workflows/ids-check.yml b/.github/workflows/ids-check.yml
index 0fdf33a983a1b..f84752a41ab31 100644
--- a/.github/workflows/ids-check.yml
+++ b/.github/workflows/ids-check.yml
@@ -97,7 +97,7 @@ jobs:
--changed-files "$CHANGED_FILES"
- name: Upload results
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
if: always()
with:
name: workflow-args
diff --git a/.github/workflows/libc-fullbuild-tests.yml b/.github/workflows/libc-fullbuild-tests.yml
index c796196b2180f..ec03080730dad 100644
--- a/.github/workflows/libc-fullbuild-tests.yml
+++ b/.github/workflows/libc-fullbuild-tests.yml
@@ -27,70 +27,81 @@ jobs:
include:
- os: ubuntu-24.04
build_type: Debug
- c_compiler: clang-21
- cpp_compiler: clang++-21
+ c_compiler: clang-23
+ cpp_compiler: clang++-23
target: x86_64-unknown-linux-llvm
include_scudo: ON
+ build_fuzzing_tests: OFF
- os: ubuntu-24.04
build_type: Release
- c_compiler: clang-21
- cpp_compiler: clang++-21
+ c_compiler: clang-23
+ cpp_compiler: clang++-23
target: x86_64-unknown-linux-llvm
include_scudo: ON
+ build_fuzzing_tests: ON
- os: ubuntu-24.04
build_type: MinSizeRel
- c_compiler: clang-21
- cpp_compiler: clang++-21
+ c_compiler: clang-23
+ cpp_compiler: clang++-23
target: x86_64-unknown-linux-llvm
include_scudo: ON
+ build_fuzzing_tests: OFF
- os: ubuntu-24.04-arm
build_type: Debug
- c_compiler: clang-21
- cpp_compiler: clang++-21
+ c_compiler: clang-23
+ cpp_compiler: clang++-23
target: aarch64-unknown-linux-llvm
include_scudo: ON
+ build_fuzzing_tests: ON
- os: ubuntu-24.04
build_type: Debug
- c_compiler: clang-21
- cpp_compiler: clang++-21
+ c_compiler: clang-23
+ cpp_compiler: clang++-23
target: x86_64-unknown-uefi-llvm
include_scudo: OFF
+ build_fuzzing_tests: OFF
- os: ubuntu-24.04
build_type: MinSizeRel
- c_compiler: clang-21
- cpp_compiler: clang++-21
+ c_compiler: clang-23
+ cpp_compiler: clang++-23
target: armv6m-none-eabi
include_scudo: OFF
+ build_fuzzing_tests: OFF
- os: ubuntu-24.04
build_type: MinSizeRel
- c_compiler: clang-21
- cpp_compiler: clang++-21
+ c_compiler: clang-23
+ cpp_compiler: clang++-23
target: armv7m-none-eabi
include_scudo: OFF
+ build_fuzzing_tests: OFF
- os: ubuntu-24.04
build_type: MinSizeRel
- c_compiler: clang-21
- cpp_compiler: clang++-21
+ c_compiler: clang-23
+ cpp_compiler: clang++-23
target: armv7em-none-eabi
include_scudo: OFF
+ build_fuzzing_tests: OFF
- os: ubuntu-24.04
build_type: MinSizeRel
- c_compiler: clang-21
- cpp_compiler: clang++-21
+ c_compiler: clang-23
+ cpp_compiler: clang++-23
target: armv8m.main-none-eabi
include_scudo: OFF
+ build_fuzzing_tests: OFF
- os: ubuntu-24.04
build_type: MinSizeRel
- c_compiler: clang-21
- cpp_compiler: clang++-21
+ c_compiler: clang-23
+ cpp_compiler: clang++-23
target: armv8.1m.main-none-eabi
include_scudo: OFF
+ build_fuzzing_tests: OFF
- os: ubuntu-24.04
build_type: MinSizeRel
- c_compiler: clang-21
- cpp_compiler: clang++-21
+ c_compiler: clang-23
+ cpp_compiler: clang++-23
target: riscv32-unknown-elf
include_scudo: OFF
+ build_fuzzing_tests: OFF
# TODO: add back gcc build when it is fixed
# - c_compiler: gcc
# cpp_compiler: g++
@@ -118,7 +129,6 @@ jobs:
echo "build-install-dir=/__w/llvm-project/llvm-project/install" >> "$GITHUB_OUTPUT"
# Configure libc fullbuild with scudo.
- # Use MinSizeRel to reduce the size of the build.
- name: Configure CMake
run: |
export RUNTIMES="libc"
@@ -135,8 +145,11 @@ jobs:
-DCMAKE_CXX_COMPILER_LAUNCHER=sccache
-DCMAKE_INSTALL_PREFIX=${{ steps.strings.outputs.build-install-dir }}"
- if [[ ${{ matrix.include_scudo}} == "ON" ]]; then
+ if [[ "${{ matrix.include_scudo }}" == "ON" || "${{ matrix.build_fuzzing_tests }}" == "ON" ]]; then
export RUNTIMES="$RUNTIMES;compiler-rt"
+ fi
+
+ if [[ "${{ matrix.include_scudo }}" == "ON" ]]; then
export CMAKE_FLAGS="$CMAKE_FLAGS
-DLLVM_LIBC_INCLUDE_SCUDO=ON
-DCOMPILER_RT_BUILD_SCUDO_STANDALONE_WITH_LLVM_LIBC=ON
@@ -158,11 +171,17 @@ jobs:
esac
- name: Build
- run: >
- cmake
- --build ${{ steps.strings.outputs.build-output-dir }}
- --parallel
- --target install
+ shell: bash
+ run: |
+ TARGETS="install"
+ if [[ "${{ matrix.build_fuzzing_tests }}" == "ON" ]]; then
+ TARGETS="$TARGETS libc-fuzzer"
+ fi
+
+ cmake \
+ --build ${{ steps.strings.outputs.build-output-dir }} \
+ --parallel \
+ --target $TARGETS
- name: Test
# Skip UEFI and baremetal tests until we have testing set up.
diff --git a/.github/workflows/libclang-abi-tests.yml b/.github/workflows/libclang-abi-tests.yml
index 7a6e17a5b40b3..2c44e239f604f 100644
--- a/.github/workflows/libclang-abi-tests.yml
+++ b/.github/workflows/libclang-abi-tests.yml
@@ -120,7 +120,7 @@ jobs:
sed -i 's/LLVM_[0-9]\+/LLVM_NOVERSION/' $lib-${{ matrix.ref }}.abi
done
- name: Upload ABI file
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # 6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
with:
name: ${{ matrix.name }}
path: '*${{ matrix.ref }}.abi'
@@ -135,12 +135,12 @@ jobs:
- abi-dump
steps:
- name: Download baseline
- uses: actions/download-artifact at 37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+ uses: actions/download-artifact at 3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
with:
name: build-baseline
path: build-baseline
- name: Download latest
- uses: actions/download-artifact at 37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+ uses: actions/download-artifact at 3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
with:
name: build-latest
path: build-latest
@@ -152,7 +152,7 @@ jobs:
done
- name: Upload ABI Comparison
if: always()
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # 6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
with:
name: compat-report-${{ github.sha }}
path: compat_reports/
diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml
index aad977ce4ea96..233eb027a8b88 100644
--- a/.github/workflows/libcxx-build-and-test.yaml
+++ b/.github/workflows/libcxx-build-and-test.yaml
@@ -60,7 +60,7 @@ jobs:
env:
CC: ${{ matrix.cc }}
CXX: ${{ matrix.cxx }}
- - uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ - uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
if: always()
with:
name: ${{ matrix.config }}-${{ matrix.cxx }}-results
@@ -105,7 +105,7 @@ jobs:
env:
CC: ${{ matrix.cc }}
CXX: ${{ matrix.cxx }}
- - uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ - uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
if: always() # Upload artifacts even if the build or test suite fails
with:
name: ${{ matrix.config }}-${{ matrix.cxx }}-results
@@ -160,7 +160,7 @@ jobs:
env:
CC: clang-22
CXX: clang++-22
- - uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ - uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
if: always()
with:
name: ${{ matrix.config }}-results
@@ -217,7 +217,7 @@ jobs:
env:
CC: clang
CXX: clang++
- - uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ - uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
if: always() # Upload artifacts even if the build or test suite fails
with:
name: macos-${{ matrix.config }}-results
diff --git a/.github/workflows/libcxx-build-containers.yml b/.github/workflows/libcxx-build-containers.yml
index e73f15e87a7a7..53edf390f729e 100644
--- a/.github/workflows/libcxx-build-containers.yml
+++ b/.github/workflows/libcxx-build-containers.yml
@@ -56,7 +56,7 @@ jobs:
TAG: ${{ github.sha }}
- name: Log in to GitHub Container Registry
- uses: docker/login-action at 5e57cd118135c172c3672efd75eb46360885c0ef # v3.6.0
+ uses: docker/login-action at b45d80f862d83dbcd57f89517bcf500b2ab88fb2 # v4.0.0
with:
registry: ghcr.io
username: ${{ github.actor }}
@@ -75,7 +75,7 @@ jobs:
docker image save ghcr.io/llvm/libcxx-linux-builder-base:${{ github.sha }} | gzip > libcxx-linux-builder-base.tar.gz
docker image save ghcr.io/llvm/libcxx-linux-builder:${{ github.sha }} | gzip > libcxx-linux-builder.tar.gz
docker image save ghcr.io/llvm/libcxx-android-builder:${{ github.sha }} | gzip > libcxx-android-builder.tar.gz
- - uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ - uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: libcxx-docker-images
path: |
diff --git a/.github/workflows/llvm-abi-tests.yml b/.github/workflows/llvm-abi-tests.yml
index 935da79a7612b..2f2513b5a8fe5 100644
--- a/.github/workflows/llvm-abi-tests.yml
+++ b/.github/workflows/llvm-abi-tests.yml
@@ -117,14 +117,14 @@ jobs:
# Remove symbol versioning from dumps, so we can compare across major versions.
sed -i 's/LLVM_${{ matrix.llvm_version_major }}/LLVM_NOVERSION/' ${{ matrix.ref }}.abi
- name: Upload ABI file
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # 6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
with:
name: ${{ matrix.name }}
path: ${{ matrix.ref }}.abi
- name: Upload symbol list file
if: matrix.name == 'build-baseline'
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # 6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
with:
name: symbol-list
path: llvm.symbols
@@ -139,17 +139,17 @@ jobs:
- abi-dump
steps:
- name: Download baseline
- uses: actions/download-artifact at 37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+ uses: actions/download-artifact at 3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
with:
name: build-baseline
path: build-baseline
- name: Download latest
- uses: actions/download-artifact at 37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+ uses: actions/download-artifact at 3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
with:
name: build-latest
path: build-latest
- name: Download symbol list
- uses: actions/download-artifact at 37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+ uses: actions/download-artifact at 3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
with:
name: symbol-list
path: symbol-list
@@ -166,7 +166,7 @@ jobs:
abi-compliance-checker $EXTRA_ARGS -l libLLVM.so -old build-baseline/*.abi -new build-latest/*.abi || test "${{ needs.abi-dump-setup.outputs.ABI_HEADERS }}" = "llvm-c"
- name: Upload ABI Comparison
if: always()
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # 6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
with:
name: compat-report-${{ github.sha }}
path: compat_reports/
diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml
index eb3d405810d65..349092d2e7149 100644
--- a/.github/workflows/pr-code-format.yml
+++ b/.github/workflows/pr-code-format.yml
@@ -56,7 +56,7 @@ jobs:
--end-rev HEAD \
--changed-files "$CHANGED_FILES"
- - uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ - uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
if: always()
with:
name: workflow-args
diff --git a/.github/workflows/pr-code-lint.yml b/.github/workflows/pr-code-lint.yml
index ddb9a253444fd..3a1547ae75337 100644
--- a/.github/workflows/pr-code-lint.yml
+++ b/.github/workflows/pr-code-lint.yml
@@ -92,7 +92,7 @@ jobs:
--changed-files "$CHANGED_FILES"
- name: Upload results
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
if: always()
with:
name: workflow-args
diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index 1795d5256750f..b238b63b0a7c5 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -112,14 +112,14 @@ jobs:
# https://github.com/actions/upload-artifact/issues/569
continue-on-error: true
if: '!cancelled()'
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: Premerge Artifacts (Linux ${{ runner.arch }})
path: artifacts/
retention-days: 5
include-hidden-files: 'true'
- name: Upload Comment
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
if: ${{ always() && !startsWith(matrix.runs-on, 'depot-ubuntu-24.04-arm') }}
continue-on-error: true
with:
@@ -178,14 +178,14 @@ jobs:
# https://github.com/actions/upload-artifact/issues/569
continue-on-error: true
if: '!cancelled()'
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: Premerge Artifacts (Windows)
path: artifacts/
retention-days: 5
include-hidden-files: 'true'
- name: Upload Comment
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
if: always()
continue-on-error: true
with:
diff --git a/.github/workflows/prune-branches.yml b/.github/workflows/prune-branches.yml
index 12d2c39bb4a36..b9a46619f8bd8 100644
--- a/.github/workflows/prune-branches.yml
+++ b/.github/workflows/prune-branches.yml
@@ -33,13 +33,13 @@ jobs:
mkdir patches
python3 .github/workflows/prune-unused-branches.py .
- name: Upload Patches
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: BranchDiffs
retention-days: 90
path: patches/*.patch
- name: Upload Branch List
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: BranchList
retention-days: 90
diff --git a/.github/workflows/release-asset-audit.yml b/.github/workflows/release-asset-audit.yml
index 71cc23a64c3b0..19e64de8fea3f 100644
--- a/.github/workflows/release-asset-audit.yml
+++ b/.github/workflows/release-asset-audit.yml
@@ -37,7 +37,7 @@ jobs:
- name: Upload comment file
if: failure()
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: comment
path: |
@@ -54,7 +54,7 @@ jobs:
- audit
steps:
- name: Download Comment
- uses: actions/download-artifact at 37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+ uses: actions/download-artifact at 3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
with:
name: comment
- name: "File Issue"
diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml
index 4001bdeec87e3..e1705e6b548b0 100644
--- a/.github/workflows/release-binaries.yml
+++ b/.github/workflows/release-binaries.yml
@@ -312,7 +312,7 @@ jobs:
fi
echo "digest=$(cat $WINDOWS_INSTALLER_FILENAME $RELEASE_BINARY_FILENAME | $sha256sum | cut -d ' ' -f 1)" >> $GITHUB_OUTPUT
- - uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ - uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
id: artifact-upload
with:
name: ${{ runner.os }}-${{ runner.arch }}-release-binary
diff --git a/.github/workflows/release-documentation.yml b/.github/workflows/release-documentation.yml
index 3828633ec3921..fd9fd4055f561 100644
--- a/.github/workflows/release-documentation.yml
+++ b/.github/workflows/release-documentation.yml
@@ -67,7 +67,7 @@ jobs:
./llvm/utils/release/build-docs.sh -release "${{ inputs.release-version }}" -no-doxygen
- name: Create Release Notes Artifact
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # 6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # 7.0.0
with:
name: release-notes
path: docs-build/html-export/
diff --git a/.github/workflows/release-sources.yml b/.github/workflows/release-sources.yml
index 48cdd608c6225..2cba546b34a32 100644
--- a/.github/workflows/release-sources.yml
+++ b/.github/workflows/release-sources.yml
@@ -91,7 +91,7 @@ jobs:
- name: Release Sources Artifact
id: artifact-upload
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: ${{ needs.inputs.outputs.ref }}-sources
path: |
diff --git a/.github/workflows/release-tasks.yml b/.github/workflows/release-tasks.yml
index cfa0f9d81dcaa..3bfaf5d82f0aa 100644
--- a/.github/workflows/release-tasks.yml
+++ b/.github/workflows/release-tasks.yml
@@ -112,7 +112,7 @@ jobs:
sed -i 's/ + "dev"//g' lit/__init__.py
python3 -m build
- - uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ - uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: lit-${{ inputs.release-version }}-release-binary
path: |
diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
index aa19cbf8d8f72..120a2b4e8e517 100644
--- a/.github/workflows/scorecard.yml
+++ b/.github/workflows/scorecard.yml
@@ -49,7 +49,7 @@ jobs:
# Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF
# format to the repository Actions tab.
- name: "Upload artifact"
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: SARIF file
path: results.sarif
diff --git a/.github/workflows/test-unprivileged-download-artifact.yml b/.github/workflows/test-unprivileged-download-artifact.yml
index ce807287abd2e..bad0e31195f3e 100644
--- a/.github/workflows/test-unprivileged-download-artifact.yml
+++ b/.github/workflows/test-unprivileged-download-artifact.yml
@@ -26,13 +26,13 @@ jobs:
echo "foo" > comment1
echo "bar" > comment2
- name: Upload Test File 1
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: artifact-name-1
path: |
comment1
- name: Upload Test File 2
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: artifact-name-2
path: |
diff --git a/.github/workflows/upload-release-artifact/action.yml b/.github/workflows/upload-release-artifact/action.yml
index 51ecf0705e4d8..e086d14fb96e2 100644
--- a/.github/workflows/upload-release-artifact/action.yml
+++ b/.github/workflows/upload-release-artifact/action.yml
@@ -40,7 +40,7 @@ runs:
using: "composite"
steps:
- name: Download Artifact
- uses: actions/download-artifact at 37930b1c2abaa49bbe596cd826c3c89aef350131 # v7.0.0
+ uses: actions/download-artifact at 3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
id: download-artifact
with:
artifact-ids: ${{ inputs.artifact-id }}
@@ -78,7 +78,7 @@ runs:
done
- name: Upload Build Provenance
- uses: actions/upload-artifact at b7c566a772e6b6bfb58ed0dc250532a479d7789f # v6.0.0
+ uses: actions/upload-artifact at bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0
with:
name: ${{ inputs.attestation-name }}
path: |
diff --git a/clang-tools-extra/clang-tidy/misc/MultipleInheritanceCheck.cpp b/clang-tools-extra/clang-tidy/misc/MultipleInheritanceCheck.cpp
index 557b4559697b9..72e6aa6ac0b47 100644
--- a/clang-tools-extra/clang-tidy/misc/MultipleInheritanceCheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/MultipleInheritanceCheck.cpp
@@ -58,16 +58,35 @@ void MultipleInheritanceCheck::registerMatchers(MatchFinder *Finder) {
void MultipleInheritanceCheck::check(const MatchFinder::MatchResult &Result) {
const auto &D = *Result.Nodes.getNodeAs<CXXRecordDecl>("decl");
- // Check to see if the class inherits from multiple concrete classes.
- unsigned NumConcrete =
- llvm::count_if(D.bases(), [&](const CXXBaseSpecifier &I) {
- return !I.isVirtual() && !isInterface(I);
- });
+ // Collect the direct and virtual concrete bases of the class.
+ SmallVector<const CXXRecordDecl *> DirectConcreteBases;
+ for (const CXXBaseSpecifier &Base : D.bases())
+ if (!Base.isVirtual() && !isInterface(Base))
+ DirectConcreteBases.push_back(Base.getType()->getAsCXXRecordDecl());
+
+ SmallVector<const CXXRecordDecl *> VirtualConcreteBases;
+ for (const CXXBaseSpecifier &VBase : D.vbases())
+ if (!isInterface(VBase))
+ VirtualConcreteBases.push_back(VBase.getType()->getAsCXXRecordDecl());
- // Check virtual bases to see if there is more than one concrete
- // non-virtual base.
+ unsigned NumConcrete = DirectConcreteBases.size();
+
+ // Count only virtual concrete bases that introduce an additional
+ // implementation base, skipping those already represented by a more derived
+ // concrete base.
NumConcrete += llvm::count_if(
- D.vbases(), [&](const CXXBaseSpecifier &V) { return !isInterface(V); });
+ VirtualConcreteBases, [&](const CXXRecordDecl *VirtualBase) {
+ const bool HiddenByMoreDerivedVirtualBase = llvm::any_of(
+ VirtualConcreteBases, [&](const CXXRecordDecl *OtherVirtualBase) {
+ return VirtualBase != OtherVirtualBase &&
+ OtherVirtualBase->isVirtuallyDerivedFrom(VirtualBase);
+ });
+ const bool HiddenByDirectConcreteBase = llvm::any_of(
+ DirectConcreteBases, [&](const CXXRecordDecl *DirectBase) {
+ return DirectBase->isVirtuallyDerivedFrom(VirtualBase);
+ });
+ return !HiddenByMoreDerivedVirtualBase && !HiddenByDirectConcreteBase;
+ });
if (NumConcrete > 1)
diag(D.getBeginLoc(), "inheriting multiple classes that aren't "
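(A brief illustration of the counting change above, with made-up type names: the updated check skips a virtual concrete base that is already reached through a direct concrete base, so a classic diamond no longer trips the warning.)

    struct Base { int data; };                    // concrete, not an interface
    struct Derived : virtual Base { int more; };  // concrete, virtually derives from Base
    // Before this patch the direct base Derived and the virtual base Base were
    // both counted, so Final was flagged even though it pulls in only one
    // implementation hierarchy; now the virtual Base is skipped because Derived
    // already virtually derives from it.
    struct Final : Derived, virtual Base {};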
diff --git a/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.cpp b/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.cpp
index 92e3220fdb817..1067fca289a2c 100644
--- a/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.cpp
+++ b/clang-tools-extra/clang-tidy/performance/InefficientStringConcatenationCheck.cpp
@@ -53,11 +53,11 @@ void InefficientStringConcatenationCheck::registerMatchers(
Finder->addMatcher(cxxOperatorCallExpr(anyOf(AssignOperator, PlusOperator)),
this);
} else {
- Finder->addMatcher(
- cxxOperatorCallExpr(anyOf(AssignOperator, PlusOperator),
- hasAncestor(stmt(anyOf(cxxForRangeStmt(),
- whileStmt(), forStmt())))),
- this);
+ Finder->addMatcher(cxxOperatorCallExpr(anyOf(AssignOperator, PlusOperator),
+ hasAncestor(stmt(anyOf(
+ cxxForRangeStmt(), whileStmt(),
+ forStmt(), doStmt())))),
+ this);
}
}
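(The extra doStmt() in the matcher above extends the default, non-strict mode to do-while loops; a minimal sketch of code that is now diagnosed, with invented names:)

    #include <string>

    void build(const std::string &chunk, int n) {
      std::string out;
      int i = 0;
      do {
        out = out + chunk;  // temporary per iteration; now flagged inside do-while too
      } while (++i < n);
    }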
diff --git a/clang-tools-extra/clang-tidy/performance/UseStdMoveCheck.cpp b/clang-tools-extra/clang-tidy/performance/UseStdMoveCheck.cpp
index 2a7df4142a6de..7c3bbc3187cd9 100644
--- a/clang-tools-extra/clang-tidy/performance/UseStdMoveCheck.cpp
+++ b/clang-tools-extra/clang-tidy/performance/UseStdMoveCheck.cpp
@@ -24,9 +24,12 @@ namespace clang::tidy::performance {
namespace {
AST_MATCHER(CXXRecordDecl, hasAccessibleNonTrivialMoveAssignment) {
- if (!Node.hasNonTrivialMoveAssignment())
+ const CXXRecordDecl *ND = Node.getDefinition();
+ if (!ND)
return false;
- for (const auto *CM : Node.methods())
+ if (!ND->hasNonTrivialMoveAssignment())
+ return false;
+ for (const CXXMethodDecl *CM : ND->methods())
if (CM->isMoveAssignmentOperator())
return !CM->isDeleted() && CM->getAccess() == AS_public;
llvm_unreachable("Move Assignment Operator Not Found");
diff --git a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt
index f1f3cde32feff..686e7c19d650b 100644
--- a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt
+++ b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt
@@ -47,6 +47,7 @@ add_clang_library(clangTidyReadabilityModule STATIC
RedundantMemberInitCheck.cpp
RedundantParenthesesCheck.cpp
RedundantPreprocessorCheck.cpp
+ RedundantQualifiedAliasCheck.cpp
RedundantSmartptrGetCheck.cpp
RedundantStringCStrCheck.cpp
RedundantStringInitCheck.cpp
diff --git a/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.cpp b/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.cpp
index 342b57840533d..8e1162ff8b073 100644
--- a/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ElseAfterReturnCheck.cpp
@@ -45,6 +45,20 @@ AST_MATCHER_P(Stmt, stripLabelLikeStatements,
return InnerMatcher.matches(*S, Finder, Builder);
}
+AST_MATCHER_P(Stmt, hasFinalStmt, ast_matchers::internal::Matcher<Stmt>,
+ InnerMatcher) {
+ for (const Stmt *S = &Node;;) {
+ S = S->stripLabelLikeStatements();
+ if (const auto *Compound = dyn_cast<CompoundStmt>(S)) {
+ if (Compound->body_empty())
+ return false;
+ S = Compound->body_back();
+ } else {
+ return InnerMatcher.matches(*S, Finder, Builder);
+ }
+ }
+}
+
} // namespace
static constexpr char InterruptingStr[] = "interrupting";
@@ -172,16 +186,13 @@ void ElseAfterReturnCheck::registerPPCallbacks(const SourceManager &SM,
}
void ElseAfterReturnCheck::registerMatchers(MatchFinder *Finder) {
- const auto InterruptsControlFlow = stmt(anyOf(
- returnStmt().bind(InterruptingStr), continueStmt().bind(InterruptingStr),
- breakStmt().bind(InterruptingStr), cxxThrowExpr().bind(InterruptingStr),
- callExpr(callee(functionDecl(isNoReturn()))).bind(InterruptingStr)));
+ const auto InterruptsControlFlow =
+ stmt(anyOf(returnStmt(), continueStmt(), breakStmt(), cxxThrowExpr(),
+ callExpr(callee(functionDecl(isNoReturn())))));
const auto IfWithInterruptingThenElse =
ifStmt(unless(isConstexpr()), unless(isConsteval()),
- hasThen(stripLabelLikeStatements(
- stmt(anyOf(InterruptsControlFlow,
- compoundStmt(has(InterruptsControlFlow)))))),
+ hasThen(hasFinalStmt(InterruptsControlFlow.bind(InterruptingStr))),
hasElse(stmt().bind("else")))
.bind("if");
diff --git a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
index a0b15603b36e8..a138d1900b799 100644
--- a/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ImplicitBoolConversionCheck.cpp
@@ -233,7 +233,7 @@ static bool isCastAllowedInCondition(const ImplicitCastExpr *Cast,
isa<BinaryConditionalOperator>(S))
return true;
if (isa<ParenExpr>(S) || isa<ImplicitCastExpr>(S) ||
- isUnaryLogicalNotOperator(S) ||
+ isa<ExprWithCleanups>(S) || isUnaryLogicalNotOperator(S) ||
(isa<BinaryOperator>(S) && cast<BinaryOperator>(S)->isLogicalOp())) {
Q.push(S);
} else {
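(Accepting ExprWithCleanups in the walk above lets the AllowPointerConditions / AllowIntegerConditions options see through conditions whose evaluation creates temporaries; a hedged sketch of the previously diagnosed shape, with an assumed helper declaration:)

    #include <string>

    const char *lookup(const std::string &key);  // assumed helper, not from the patch

    void use() {
      // The string literal binds to a temporary std::string, so the implicit
      // pointer-to-bool cast sits under an ExprWithCleanups; with
      // AllowPointerConditions enabled this condition is no longer warned on.
      if (lookup("answer")) {
      }
    }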
diff --git a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp
index c582dc98eac6b..8e9e00b23c84a 100644
--- a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp
@@ -49,6 +49,7 @@
#include "RedundantMemberInitCheck.h"
#include "RedundantParenthesesCheck.h"
#include "RedundantPreprocessorCheck.h"
+#include "RedundantQualifiedAliasCheck.h"
#include "RedundantSmartptrGetCheck.h"
#include "RedundantStringCStrCheck.h"
#include "RedundantStringInitCheck.h"
@@ -148,6 +149,8 @@ class ReadabilityModule : public ClangTidyModule {
"readability-redundant-parentheses");
CheckFactories.registerCheck<RedundantPreprocessorCheck>(
"readability-redundant-preprocessor");
+ CheckFactories.registerCheck<RedundantQualifiedAliasCheck>(
+ "readability-redundant-qualified-alias");
CheckFactories.registerCheck<RedundantTypenameCheck>(
"readability-redundant-typename");
CheckFactories.registerCheck<ReferenceToConstructedTemporaryCheck>(
diff --git a/clang-tools-extra/clang-tidy/readability/RedundantQualifiedAliasCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantQualifiedAliasCheck.cpp
new file mode 100644
index 0000000000000..a306f205a8447
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/readability/RedundantQualifiedAliasCheck.cpp
@@ -0,0 +1,220 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "RedundantQualifiedAliasCheck.h"
+#include "../utils/LexerUtils.h"
+#include <cassert>
+#include <optional>
+
+using namespace clang::ast_matchers;
+
+namespace clang::tidy::readability {
+
+namespace {
+
+struct TypeLocInfo {
+ TypeLoc Loc;
+ bool HasQualifier = false;
+};
+
+} // namespace
+
+static bool hasMacroInRange(SourceRange Range, const SourceManager &SM,
+ const LangOptions &LangOpts) {
+ if (Range.isInvalid())
+ return true;
+ return utils::lexer::rangeContainsExpansionsOrDirectives(Range, SM, LangOpts);
+}
+
+static std::optional<TypeLocInfo> getTypeLocInfo(TypeLoc TL) {
+ if (TL.isNull())
+ return std::nullopt;
+
+ const auto MakeTypeLocInfo = [](auto TypeTL) {
+ const bool HasQualifier =
+ static_cast<bool>(TypeTL.getQualifierLoc().getNestedNameSpecifier());
+ return TypeLocInfo{TypeTL, HasQualifier};
+ };
+
+ if (const auto TypedefTL = TL.getAs<TypedefTypeLoc>())
+ return MakeTypeLocInfo(TypedefTL);
+
+ if (const auto TagTL = TL.getAs<TagTypeLoc>())
+ return MakeTypeLocInfo(TagTL);
+
+ return std::nullopt;
+}
+
+static const NamedDecl *getNamedDeclFromTypeLoc(TypeLoc TL) {
+ if (const auto TypedefTL = TL.getAs<TypedefTypeLoc>())
+ return TypedefTL.getDecl();
+ if (const auto TagTL = TL.getAs<TagTypeLoc>())
+ return TagTL.getDecl();
+ return nullptr;
+}
+
+static bool hasSameUnqualifiedName(const NamedDecl *LHS, const NamedDecl *RHS) {
+ return LHS->getName() == RHS->getName();
+}
+
+static bool isNamespaceLikeDeclContext(const DeclContext *DC) {
+ return isa<TranslationUnitDecl, NamespaceDecl>(DC);
+}
+
+static bool canUseUsingDeclarationForTarget(const TypeAliasDecl *Alias,
+ const NamedDecl *Target) {
+ const DeclContext *AliasContext = Alias->getDeclContext()->getRedeclContext();
+ const DeclContext *TargetContext =
+ Target->getDeclContext()->getRedeclContext();
+
+ const auto *AliasRecord = dyn_cast<CXXRecordDecl>(AliasContext);
+ if (!AliasRecord)
+ return isNamespaceLikeDeclContext(TargetContext);
+
+ const auto *TargetRecord = dyn_cast<CXXRecordDecl>(TargetContext);
+ return TargetRecord && AliasRecord->isDerivedFrom(TargetRecord);
+}
+
+static bool hasTrailingSyntaxAfterRhsType(TypeLoc TL, const SourceManager &SM,
+ const LangOptions &LangOpts) {
+ const SourceLocation TypeEndLoc = TL.getEndLoc();
+ if (TypeEndLoc.isInvalid() || TypeEndLoc.isMacroID())
+ return true;
+ const std::optional<Token> NextToken =
+ utils::lexer::findNextTokenSkippingComments(TypeEndLoc, SM, LangOpts);
+ return !NextToken || NextToken->isNot(tok::semi);
+}
+
+namespace {
+
+AST_MATCHER(TypeAliasDecl, isAliasTemplate) {
+ return Node.getDescribedAliasTemplate() != nullptr;
+}
+
+AST_MATCHER(NamedDecl, isInMacro) { return Node.getLocation().isMacroID(); }
+
+AST_MATCHER(TypeAliasDecl, hasAliasAttributes) {
+ if (Node.hasAttrs())
+ return true;
+ const TypeSourceInfo *TSI = Node.getTypeSourceInfo();
+ if (!TSI)
+ return false;
+ for (TypeLoc CurTL = TSI->getTypeLoc(); !CurTL.isNull();
+ CurTL = CurTL.getNextTypeLoc())
+ if (CurTL.getAs<AttributedTypeLoc>())
+ return true;
+ return false;
+}
+
+AST_MATCHER(TypeLoc, isNonDependentTypeLoc) {
+ return !Node.getType().isNull() && !Node.getType()->isDependentType();
+}
+
+AST_MATCHER(TypeLoc, isNonElaboratedTypeLoc) {
+ const auto IsNonElaboratedTypeLoc = [](auto TL) {
+ return !TL.isNull() && !TL.getElaboratedKeywordLoc().isValid();
+ };
+ return IsNonElaboratedTypeLoc(Node.getAs<TypedefTypeLoc>()) ||
+ IsNonElaboratedTypeLoc(Node.getAs<TagTypeLoc>());
+}
+
+AST_MATCHER(TypeLoc, isMacroFreeTypeLoc) {
+ const ASTContext &Context = Finder->getASTContext();
+ return !hasMacroInRange(Node.getSourceRange(), Context.getSourceManager(),
+ Context.getLangOpts());
+}
+
+AST_MATCHER(TypeLoc, hasNoTrailingSyntaxAfterTypeLoc) {
+ const ASTContext &Context = Finder->getASTContext();
+ return !hasTrailingSyntaxAfterRhsType(Node, Context.getSourceManager(),
+ Context.getLangOpts());
+}
+
+AST_MATCHER(TypeAliasDecl, hasUsingDeclarationEquivalentTarget) {
+ const TypeSourceInfo *TSI = Node.getTypeSourceInfo();
+ if (!TSI)
+ return false;
+ const std::optional<TypeLocInfo> TypeInfo = getTypeLocInfo(TSI->getTypeLoc());
+ if (!TypeInfo || !TypeInfo->HasQualifier)
+ return false;
+ const NamedDecl *Target = getNamedDeclFromTypeLoc(TypeInfo->Loc);
+ return Target && hasSameUnqualifiedName(&Node, Target) &&
+ canUseUsingDeclarationForTarget(&Node, Target);
+}
+
+} // namespace
+
+RedundantQualifiedAliasCheck::RedundantQualifiedAliasCheck(
+ StringRef Name, ClangTidyContext *Context)
+ : ClangTidyCheck(Name, Context),
+ OnlyNamespaceScope(Options.get("OnlyNamespaceScope", false)) {}
+
+void RedundantQualifiedAliasCheck::storeOptions(
+ ClangTidyOptions::OptionMap &Opts) {
+ Options.store(Opts, "OnlyNamespaceScope", OnlyNamespaceScope);
+}
+
+void RedundantQualifiedAliasCheck::registerMatchers(MatchFinder *Finder) {
+ const auto ControlFlowInitStatementMatcher = stmt(
+ anyOf(mapAnyOf(ifStmt, switchStmt, cxxForRangeStmt)
+ .with(hasInitStatement(stmt(equalsBoundNode("initDeclStmt")))),
+ forStmt(hasLoopInit(stmt(equalsBoundNode("initDeclStmt"))))));
+
+ const auto AliasPreconditions =
+ allOf(unless(isInMacro()), unless(isAliasTemplate()),
+ unless(hasAliasAttributes()));
+ const auto InControlFlowInit =
+ allOf(hasParent(declStmt().bind("initDeclStmt")),
+ hasAncestor(ControlFlowInitStatementMatcher));
+ const auto RewriteableTypeLoc =
+ typeLoc(allOf(isNonDependentTypeLoc(), isNonElaboratedTypeLoc(),
+ isMacroFreeTypeLoc(), hasNoTrailingSyntaxAfterTypeLoc()))
+ .bind("loc");
+
+ const auto RedundantQualifiedAliasMatcher = typeAliasDecl(
+ AliasPreconditions, unless(InControlFlowInit),
+ hasUsingDeclarationEquivalentTarget(), hasTypeLoc(RewriteableTypeLoc));
+
+ if (OnlyNamespaceScope) {
+ Finder->addMatcher(typeAliasDecl(RedundantQualifiedAliasMatcher,
+ hasDeclContext(anyOf(translationUnitDecl(),
+ namespaceDecl())))
+ .bind("alias"),
+ this);
+ return;
+ }
+ Finder->addMatcher(RedundantQualifiedAliasMatcher.bind("alias"), this);
+}
+
+void RedundantQualifiedAliasCheck::check(
+ const MatchFinder::MatchResult &Result) {
+ const auto *Alias = Result.Nodes.getNodeAs<TypeAliasDecl>("alias");
+ assert(Alias && "matcher must bind alias");
+ const auto *WrittenTLNode = Result.Nodes.getNodeAs<TypeLoc>("loc");
+ assert(WrittenTLNode && "matcher must bind loc");
+ const TypeLoc WrittenTL = *WrittenTLNode;
+
+ const SourceManager &SM = *Result.SourceManager;
+ const LangOptions &LangOpts = getLangOpts();
+
+ const SourceLocation AliasLoc = Alias->getLocation();
+ const SourceLocation RhsBeginLoc = WrittenTL.getBeginLoc();
+ const CharSourceRange EqualRange = utils::lexer::findTokenTextInRange(
+ CharSourceRange::getCharRange(AliasLoc, RhsBeginLoc), SM, LangOpts,
+ [](const Token &Tok) { return Tok.is(tok::equal); });
+ if (EqualRange.isInvalid())
+ return;
+
+ auto Diag = diag(Alias->getLocation(),
+ "type alias is redundant; use a using-declaration instead");
+
+ Diag << FixItHint::CreateRemoval(Alias->getLocation())
+ << FixItHint::CreateRemoval(EqualRange.getBegin());
+}
+
+} // namespace clang::tidy::readability
diff --git a/clang-tools-extra/clang-tidy/readability/RedundantQualifiedAliasCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantQualifiedAliasCheck.h
new file mode 100644
index 0000000000000..4290f36c70952
--- /dev/null
+++ b/clang-tools-extra/clang-tidy/readability/RedundantQualifiedAliasCheck.h
@@ -0,0 +1,40 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTQUALIFIEDALIASCHECK_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTQUALIFIEDALIASCHECK_H
+
+#include "../ClangTidyCheck.h"
+
+namespace clang::tidy::readability {
+
+/// Finds identity type aliases to qualified names that can be expressed as
+/// using-declarations.
+///
+/// For the user-facing documentation see:
+/// https://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-qualified-alias.html
+class RedundantQualifiedAliasCheck : public ClangTidyCheck {
+public:
+ RedundantQualifiedAliasCheck(StringRef Name, ClangTidyContext *Context);
+ bool isLanguageVersionSupported(const LangOptions &LangOpts) const override {
+ return LangOpts.CPlusPlus11;
+ }
+ std::optional<TraversalKind> getCheckTraversalKind() const override {
+ return TK_IgnoreUnlessSpelledInSource;
+ }
+ void storeOptions(ClangTidyOptions::OptionMap &Opts) override;
+ void registerMatchers(ast_matchers::MatchFinder *Finder) override;
+ void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+
+private:
+ const bool OnlyNamespaceScope;
+};
+
+} // namespace clang::tidy::readability
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTQUALIFIEDALIASCHECK_H
diff --git a/clang-tools-extra/clangd/FindTarget.cpp b/clang-tools-extra/clangd/FindTarget.cpp
index f80f7325d17ae..064e0db3b5b14 100644
--- a/clang-tools-extra/clangd/FindTarget.cpp
+++ b/clang-tools-extra/clangd/FindTarget.cpp
@@ -845,14 +845,14 @@ refInTypeLoc(TypeLoc L, const HeuristicResolver *Resolver) {
void VisitUnresolvedUsingTypeLoc(UnresolvedUsingTypeLoc L) {
Refs.push_back(ReferenceLoc{L.getQualifierLoc(),
- L.getLocalSourceRange().getBegin(),
+ L.getNameLoc(),
/*IsDecl=*/false,
{L.getDecl()}});
}
void VisitUsingTypeLoc(UsingTypeLoc L) {
Refs.push_back(ReferenceLoc{L.getQualifierLoc(),
- L.getLocalSourceRange().getBegin(),
+ L.getNameLoc(),
/*IsDecl=*/false,
{L.getDecl()}});
}
diff --git a/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp b/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp
index 94cecce1f038c..782221306bf85 100644
--- a/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp
@@ -1100,7 +1100,13 @@ sizeof...($TemplateParameter[[Elements]]);
struct $Class_def[[Inner]] {};
};
using $Typedef_decl[[Alias]] = void ($Class[[Outer]]::$Class[[Inner]]:: *)();
- )cpp"};
+ )cpp",
+ // Forwarded typedef
+ R"cpp(
+ using $Primitive_decl[[MyInt]] = int;
+ namespace $Namespace_decl[[N]] { using ::MyInt; }
+ using $Primitive_decl[[X]] = $Namespace[[N]]::$Primitive[[MyInt]];
+ )cpp"};
for (const auto &TestCase : TestCases)
// Mask off scope modifiers to keep the tests manageable.
// They're tested separately.
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index f0607996c5ff2..c9a170a9e8660 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -154,6 +154,12 @@ New checks
Suggests insertion of ``std::move(...)`` to turn copy assignment operator
calls into move assignment ones, when deemed valid and profitable.
+- New :doc:`readability-redundant-qualified-alias
+ <clang-tidy/checks/readability/redundant-qualified-alias>` check.
+
+ Finds redundant identity type aliases that re-expose a qualified name and can
+ be replaced with a ``using`` declaration.
+
- New :doc:`readability-trailing-comma
<clang-tidy/checks/readability/trailing-comma>` check.
@@ -258,6 +264,10 @@ Changes in existing checks
- Fixed false positive where an array of pointers to ``const`` was
incorrectly diagnosed as allowing the pointee to be made ``const``.
+- Improved :doc:`misc-multiple-inheritance
+ <clang-tidy/checks/misc/multiple-inheritance>` by avoiding false positives when
+ virtual inheritance causes concrete bases to be counted more than once.
+
- Improved :doc:`misc-throw-by-value-catch-by-reference
<clang-tidy/checks/misc/throw-by-value-catch-by-reference>` check:
@@ -311,6 +321,11 @@ Changes in existing checks
- Fixes false negatives when using ``std::set`` from ``libstdc++``.
+- Improved :doc:`performance-inefficient-string-concatenation
+ <clang-tidy/checks/performance/inefficient-string-concatenation>` check by
+ adding support for detecting inefficient string concatenation in ``do-while``
+ loops.
+
- Improved :doc:`performance-inefficient-vector-operation
<clang-tidy/checks/performance/inefficient-vector-operation>` check by
correctly handling vector-like classes when ``push_back``/``emplace_back`` are
@@ -334,6 +349,9 @@ Changes in existing checks
- Fixed missed diagnostics when ``if`` statements appear in unbraced
``switch`` case labels.
+ - Fixed a false positive involving ``if`` statements which contain
+ a ``return``, ``break``, etc., jumped over by a ``goto``.
+
- Added support for handling attributed ``if`` then-branches such as
``[[likely]]`` and ``[[unlikely]]``.
@@ -345,6 +363,13 @@ Changes in existing checks
now uses separate note diagnostics for each uninitialized enumerator, making
it easier to see which specific enumerators need explicit initialization.
+- Improved :doc:`readability-implicit-bool-conversion
+ <clang-tidy/checks/readability/implicit-bool-conversion>` check by fixing a
+ false positive where `AllowPointerConditions` and `AllowIntegerConditions`
+ options did not suppress warnings when the condition expression involved
+ temporaries (e.g. passing a string literal to a ``const std::string&``
+ parameter).
+
- Improved :doc:`readability-non-const-parameter
<clang-tidy/checks/readability/non-const-parameter>` check by avoiding false
positives on parameters used in dependent expressions (e.g. inside generic
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index fcde0ea474913..ceab1e9414951 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -417,6 +417,7 @@ Clang-Tidy Checks
:doc:`readability-redundant-member-init <readability/redundant-member-init>`, "Yes"
:doc:`readability-redundant-parentheses <readability/redundant-parentheses>`, "Yes"
:doc:`readability-redundant-preprocessor <readability/redundant-preprocessor>`,
+ :doc:`readability-redundant-qualified-alias <readability/redundant-qualified-alias>`, "Yes"
:doc:`readability-redundant-smartptr-get <readability/redundant-smartptr-get>`, "Yes"
:doc:`readability-redundant-string-cstr <readability/redundant-string-cstr>`, "Yes"
:doc:`readability-redundant-string-init <readability/redundant-string-init>`, "Yes"
diff --git a/clang-tools-extra/docs/clang-tidy/checks/performance/inefficient-string-concatenation.rst b/clang-tools-extra/docs/clang-tidy/checks/performance/inefficient-string-concatenation.rst
index 92b6b4e0370d6..56f6b9640080d 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/performance/inefficient-string-concatenation.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/performance/inefficient-string-concatenation.rst
@@ -55,5 +55,5 @@ Options
.. option:: StrictMode
- When `false`, the check will only check the string usage in ``while``, ``for``
- and ``for-range`` statements. Default is `false`.
+ When `false`, the check will only warn on inefficient string usage inside loops.
+ Default is `false`.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-qualified-alias.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-qualified-alias.rst
new file mode 100644
index 0000000000000..b1af171ae5093
--- /dev/null
+++ b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-qualified-alias.rst
@@ -0,0 +1,30 @@
+.. title:: clang-tidy - readability-redundant-qualified-alias
+
+readability-redundant-qualified-alias
+=====================================
+
+Finds redundant identity type aliases that re-expose a qualified name and can
+be replaced with a ``using`` declaration.
+
+.. code-block:: c++
+
+ using seconds = std::chrono::seconds;
+
+ // becomes
+
+ using std::chrono::seconds;
+
+The check is conservative and only warns when the alias name exactly matches
+the unqualified name of a non-dependent, non-specialized named type written
+with a qualifier. It skips alias templates, dependent forms, elaborated
+keywords (``class``, ``struct``, ``enum``, ``typename``), and cases involving
+macros.
+
+Options
+-------
+
+.. option:: OnlyNamespaceScope
+
+ When `true`, only consider aliases declared in a namespace or the
+ translation unit. When `false`, also consider aliases declared inside
+ classes, functions, and lambdas. Default is `false`.
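(To complement the documentation above, a few aliases the check is described as leaving alone, using invented names:)

    #include <chrono>
    #include <cstddef>

    namespace detail { template <class T> struct box {}; }

    template <class T>
    using Box = detail::box<T>;            // alias template: not diagnosed

    using Seconds = std::chrono::seconds;  // alias name differs from 'seconds': not diagnosed

    using size_type = std::size_t;         // alias name differs from 'size_t': not diagnosed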
diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/std/cstddef b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/std/cstddef
index 800285e887cda..c2200b06e6a23 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/std/cstddef
+++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/std/cstddef
@@ -14,6 +14,8 @@
namespace std {
using ::ptrdiff_t;
using ::size_t;
+
+ using nullptr_t = decltype(nullptr);
}
#endif // _CSTDDEF_
diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/std/memory b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/std/memory
index 2ec18dbec18f4..201ebd59555f3 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/std/memory
+++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/std/memory
@@ -1,8 +1,178 @@
#ifndef _MEMORY_
#define _MEMORY_
+#include "stddef.h"
+
namespace std {
+template <typename T>
+struct default_delete {};
+
+template <typename T>
+struct default_delete<T[]> {};
+
+template <typename T, typename Deleter = default_delete<T>>
+class unique_ptr {
+public:
+ unique_ptr() noexcept {}
+ explicit unique_ptr(T *p) noexcept {}
+ unique_ptr(T *p, Deleter d) noexcept {}
+ unique_ptr(const unique_ptr &) = delete;
+ unique_ptr(unique_ptr &&t) noexcept {}
+ template <typename U, typename E>
+ unique_ptr(unique_ptr<U, E> &&t) noexcept {}
+ ~unique_ptr() {}
+
+ T &operator*() const { return *ptr; }
+ T *operator->() const { return ptr; }
+ explicit operator bool() const noexcept { return ptr != nullptr; }
+
+ T *get() const { return ptr; }
+ T *release() { return ptr; }
+ void reset() {}
+ void reset(T *p) {}
+
+ unique_ptr &operator=(unique_ptr &) = delete;
+ template <typename U, typename E>
+ unique_ptr &operator=(unique_ptr<U, E> &) = delete;
+ unique_ptr &operator=(unique_ptr &&) noexcept { return *this; }
+ template <typename U, typename E>
+ unique_ptr &operator=(unique_ptr<U, E> &&) noexcept { return *this; }
+
+ bool operator==(const unique_ptr &) const noexcept { return false; }
+ bool operator!=(const unique_ptr &) const noexcept { return true; }
+
+private:
+ T *ptr = nullptr;
+};
+
+template <typename T, typename Deleter>
+class unique_ptr<T[], Deleter> {
+public:
+ unique_ptr() noexcept {}
+ template <typename U>
+ explicit unique_ptr(U p) noexcept {}
+ template <typename U>
+ unique_ptr(U p, Deleter d) noexcept {}
+ ~unique_ptr() {}
+
+ T &operator[](size_t i) const { return ptr[i]; }
+ T *get() const { return ptr; }
+ explicit operator bool() const noexcept { return ptr != nullptr; }
+
+ void reset() {}
+ void reset(T *p) {}
+
+private:
+ T *ptr = nullptr;
+};
+
+template <typename T, typename... Args>
+unique_ptr<T> make_unique(Args &&...args) {
+ return unique_ptr<T>(new T(static_cast<Args &&>(args)...));
+}
+
+template <typename T>
+class shared_ptr {
+public:
+ shared_ptr() {}
+ explicit shared_ptr(T *p) {}
+ template <typename Y>
+ explicit shared_ptr(Y *p) {}
+ template <typename Y, typename D>
+ shared_ptr(Y *p, D d) {}
+ shared_ptr(const shared_ptr &) {}
+ shared_ptr(shared_ptr &&) {}
+ ~shared_ptr() {}
+
+ T &operator*() const { return *this->get(); }
+ T *operator->() const { return this->get(); }
+ T *get() const { return ptr; }
+ void reset() {}
+ void reset(T *p) {}
+ explicit operator bool() const noexcept { return this->get() != nullptr; }
+
+ shared_ptr &operator=(shared_ptr &&) { return *this; }
+ template <typename U>
+ shared_ptr &operator=(shared_ptr<U> &&) { return *this; }
+
+private:
+ T *ptr = nullptr;
+};
+
+template <typename T>
+class shared_ptr<T[]> {
+public:
+ shared_ptr() {}
+ explicit shared_ptr(T *p) {}
+ template <typename Y>
+ explicit shared_ptr(Y *p) {}
+ template <typename Y, typename D>
+ shared_ptr(Y *p, D d) {}
+ shared_ptr(const shared_ptr &) {}
+ shared_ptr(shared_ptr &&) {}
+ ~shared_ptr() {}
+
+ T &operator[](size_t i) const { return ptr[i]; }
+ T *get() const { return ptr; }
+ void reset() {}
+ void reset(T *p) {}
+ explicit operator bool() const noexcept { return ptr != nullptr; }
+
+private:
+ T *ptr = nullptr;
+};
+
+template <typename T, typename... Args>
+shared_ptr<T> make_shared(Args &&...args) {
+ return shared_ptr<T>(new T(static_cast<Args &&>(args)...));
+}
+
+template <typename T>
+class weak_ptr {
+public:
+ weak_ptr() {}
+ bool expired() const { return true; }
+};
+
+template <typename Y>
+struct auto_ptr_ref {
+ Y *ptr;
+};
+
+template <typename X>
+class auto_ptr {
+public:
+ typedef X element_type;
+ explicit auto_ptr(X *p = 0) throw() {}
+ auto_ptr(auto_ptr &a) throw() {}
+ template <typename Y>
+ auto_ptr(auto_ptr<Y> &a) throw() {}
+ auto_ptr &operator=(auto_ptr &a) throw() { return *this; }
+ template <typename Y>
+ auto_ptr &operator=(auto_ptr<Y> &a) throw() { return *this; }
+ auto_ptr &operator=(auto_ptr_ref<X> r) throw() { return *this; }
+ ~auto_ptr() throw() {}
+ auto_ptr(auto_ptr_ref<X> r) throw() {}
+ template <typename Y>
+ operator auto_ptr_ref<Y>() throw() {
+ auto_ptr_ref<Y> r;
+ r.ptr = ptr;
+ return r;
+ }
+ template <typename Y>
+ operator auto_ptr<Y>() throw() { return auto_ptr<Y>(ptr); }
+
+private:
+ X *ptr = nullptr;
+};
+
+template <>
+class auto_ptr<void> {
+public:
+ typedef void element_type;
+};
+
template <typename T>
class allocator {};
diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/Inputs/type_traits b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/std/type_traits
similarity index 94%
rename from clang-tools-extra/test/clang-tidy/checkers/abseil/Inputs/type_traits
rename to clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/std/type_traits
index c97ae9c2d14bd..e9b6fa76cb6ff 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/abseil/Inputs/type_traits
+++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/std/type_traits
@@ -1,4 +1,4 @@
-#include "cstddef.h"
+#include "cstddef"
namespace std {
@@ -94,6 +94,20 @@ struct remove_cv<const volatile T> {
template <class T>
using remove_cv_t = typename remove_cv<T>::type;
+template <class T>
+struct remove_const { using type = T; };
+template <class T>
+struct remove_const<const T> { using type = T; };
+template <class T>
+using remove_const_t = typename remove_const<T>::type;
+
+template <class T>
+struct add_cv { typedef const volatile T type; };
+template <class T>
+struct add_const { typedef const T type; };
+template <class T>
+struct add_volatile { typedef volatile T type; };
+
template <class T>
struct decay {
private:
@@ -123,6 +137,9 @@ struct is_same : false_type {};
template <class T>
struct is_same<T, T> : true_type {};
+template <class T, class U>
+inline constexpr bool is_same_v = is_same<T, U>::value;
+
template <class T>
struct is_void : is_same<void, typename remove_cv<T>::type> {};
@@ -194,6 +211,13 @@ using is_constructible = is_constructible_<void_t<>, T, Args...>;
template <class T, class... Args>
inline constexpr bool is_constructible_v = is_constructible<T, Args...>::value;
+template <class T>
+struct remove_cvref {
+ using type = remove_cv_t<remove_reference_t<T>>;
+};
+template <class T>
+using remove_cvref_t = typename remove_cvref<T>::type;
+
template <class _Tp>
struct __uncvref {
typedef typename remove_cv<typename remove_reference<_Tp>::type>::type type;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/std/utility b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/std/utility
index e87465118ab2a..deca0c71e2edf 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/std/utility
+++ b/clang-tools-extra/test/clang-tidy/checkers/Inputs/Headers/std/utility
@@ -1,20 +1,9 @@
#ifndef _UTILITY_
#define _UTILITY_
-namespace std {
-
-typedef __SIZE_TYPE__ size_t;
-typedef decltype(nullptr) nullptr_t;
+#include <type_traits>
-template <typename T>
-struct remove_reference { typedef T type; };
-template <typename T>
-struct remove_reference<T &> { typedef T type; };
-template <typename T>
-struct remove_reference<T &&> { typedef T type; };
-
-template <typename T>
-using remove_reference_t = typename remove_reference<T>::type;
+namespace std {
template <typename _Tp>
constexpr typename std::remove_reference<_Tp>::type &&move(_Tp &&__t) {
@@ -38,33 +27,6 @@ void swap(T &a, T &b) {
b = move(tmp);
}
-template <class T, class U>
-struct is_same { static constexpr bool value = false; };
-template <class T>
-struct is_same<T, T> { static constexpr bool value = true; };
-template <class T, class U>
-constexpr bool is_same_v = is_same<T, U>::value;
-
-template <bool B, class T = void>
-struct enable_if {};
-template <class T>
-struct enable_if<true, T> { typedef T type; };
-template <bool B, class T = void>
-using enable_if_t = typename enable_if<B, T>::type;
-
-template <class T> struct remove_const { using type = T; };
-template <class T> struct remove_const<const T> { using type = T; };
-template <class T> using remove_const_t = typename remove_const<T>::type;
-
-template <class T> struct remove_cv { using type = T; };
-template <class T> struct remove_cv<const T> { using type = T; };
-template <class T> struct remove_cv<volatile T> { using type = T; };
-template <class T> struct remove_cv<const volatile T> { using type = T; };
-template <class T> using remove_cv_t = typename remove_cv<T>::type;
-
-template <class T> struct remove_cvref { using type = remove_cv_t<remove_reference_t<T>>; };
-template <class T> using remove_cvref_t = typename remove_cvref<T>::type;
-
} // namespace std
#endif // _UTILITY_
diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/Inputs/cstddef.h b/clang-tools-extra/test/clang-tidy/checkers/abseil/Inputs/cstddef.h
deleted file mode 100644
index 633260f24f99b..0000000000000
--- a/clang-tools-extra/test/clang-tidy/checkers/abseil/Inputs/cstddef.h
+++ /dev/null
@@ -1,10 +0,0 @@
-namespace std {
-
-typedef decltype(sizeof(char)) size_t;
-
-using nullptr_t = decltype(nullptr);
-
-} // namespace std
-
-typedef decltype(sizeof(char)) size_t;
-typedef decltype(sizeof(char*)) ptrdiff_t;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/Inputs/initializer_list b/clang-tools-extra/test/clang-tidy/checkers/abseil/Inputs/initializer_list
deleted file mode 100644
index 886a54fe217f4..0000000000000
--- a/clang-tools-extra/test/clang-tidy/checkers/abseil/Inputs/initializer_list
+++ /dev/null
@@ -1,11 +0,0 @@
-
-namespace std {
-
-template <typename T>
-class initializer_list {
- public:
- const T *a, *b;
- initializer_list() noexcept;
-};
-
-} // namespace std
\ No newline at end of file
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/shared-ptr-array-mismatch.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/shared-ptr-array-mismatch.cpp
index 70449e6bfc24c..dab7ef0d071f5 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/shared-ptr-array-mismatch.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/shared-ptr-array-mismatch.cpp
@@ -1,16 +1,6 @@
// RUN: %check_clang_tidy %s bugprone-shared-ptr-array-mismatch %t
-namespace std {
-
-template <typename T>
-struct shared_ptr {
- template <class Y>
- explicit shared_ptr(Y *) {}
- template <class Y, class Deleter>
- shared_ptr(Y *, Deleter) {}
-};
-
-} // namespace std
+#include <memory>
struct A {};
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-self-assignment.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-self-assignment.cpp
index 0386c9bfda359..c0a65f3cb9bef 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-self-assignment.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unhandled-self-assignment.cpp
@@ -1,27 +1,10 @@
// RUN: %check_clang_tidy %s bugprone-unhandled-self-assignment %t -- -- -fno-delayed-template-parsing
#include <utility>
+#include <memory>
namespace std {
-template <typename T> class default_delete {};
-
-template <class T, typename Deleter = std::default_delete<T>>
-class unique_ptr {
-};
-
-template <class T>
-class shared_ptr {
-};
-
-template <class T>
-class weak_ptr {
-};
-
-template <class T>
-class auto_ptr {
-};
-
namespace pmr {
template <typename TYPE = void>
class allocator {};
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unique-ptr-array-mismatch.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unique-ptr-array-mismatch.cpp
index 494e83dce3720..7076461497fc3 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unique-ptr-array-mismatch.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unique-ptr-array-mismatch.cpp
@@ -1,27 +1,6 @@
// RUN: %check_clang_tidy %s bugprone-unique-ptr-array-mismatch %t
-namespace std {
-
-template<class T> struct default_delete {};
-template<class T> struct default_delete<T[]> {};
-
-template<class T, class Deleter = std::default_delete<T>>
-class unique_ptr {
-public:
- explicit unique_ptr(T* p) noexcept;
- unique_ptr(T* p, Deleter d1 ) noexcept;
-};
-
-template <class T, class Deleter>
-class unique_ptr<T[], Deleter> {
-public:
- template<class U>
- explicit unique_ptr(U p) noexcept;
- template<class U>
- unique_ptr(U p, Deleter d1) noexcept;
-};
-
-} // namespace std
+#include <memory>
struct A {};
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-return-value.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-return-value.cpp
index 7ecacabef1a0b..3fa87b94dc6b4 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-return-value.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/unused-return-value.cpp
@@ -1,6 +1,7 @@
// RUN: %check_clang_tidy %s bugprone-unused-return-value %t -- \
// RUN: --config="{CheckOptions: {bugprone-unused-return-value.AllowCastToVoid: true}}" -- -fexceptions
#include <vector>
+#include <memory>
namespace std {
@@ -26,19 +27,6 @@ ForwardIt remove_if(ForwardIt, ForwardIt, UnaryPredicate);
template <typename ForwardIt>
ForwardIt unique(ForwardIt, ForwardIt);
-template <typename T>
-struct default_delete;
-
-template <typename T, typename Deleter = std::default_delete<T>>
-struct unique_ptr {
- unique_ptr();
- unique_ptr(unique_ptr const&);
- unique_ptr(unique_ptr &&);
- unique_ptr& operator=(unique_ptr const&);
- unique_ptr& operator=(unique_ptr &&);
- T *release() noexcept;
-};
-
template <typename T>
struct char_traits;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/use-after-move.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/use-after-move.cpp
index 983a7ec578c8d..5d95c44fc318f 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/use-after-move.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/use-after-move.cpp
@@ -12,38 +12,12 @@
// RUN: -fno-delayed-template-parsing
#include <utility>
+#include <memory>
typedef decltype(nullptr) nullptr_t;
namespace std {
-template <typename T>
-struct unique_ptr {
- unique_ptr();
- T *get() const;
- explicit operator bool() const;
- void reset(T *ptr);
- T &operator*() const;
- T *operator->() const;
- T& operator[](size_t i) const;
-};
-
-template <typename T>
-struct shared_ptr {
- shared_ptr();
- T *get() const;
- explicit operator bool() const;
- void reset(T *ptr);
- T &operator*() const;
- T *operator->() const;
-};
-
-template <typename T>
-struct weak_ptr {
- weak_ptr();
- bool expired() const;
-};
-
template <typename T>
struct optional {
optional();
@@ -224,7 +198,7 @@ void standardSmartPtr() {
// CHECK-NOTES: [[@LINE-3]]:5: note: move occurred here
}
{
- std::unique_ptr<A> ptr;
+ std::unique_ptr<A[]> ptr;
std::move(ptr);
ptr[0];
// CHECK-NOTES: [[@LINE-1]]:5: warning: 'ptr' used after it was moved
diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-const-or-ref-data-members.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-const-or-ref-data-members.cpp
index 19da88300aec4..bd6e1ce301fd5 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-const-or-ref-data-members.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/avoid-const-or-ref-data-members.cpp
@@ -1,11 +1,6 @@
// RUN: %check_clang_tidy %s cppcoreguidelines-avoid-const-or-ref-data-members %t
-namespace std {
-template <typename T>
-struct unique_ptr {};
-template <typename T>
-struct shared_ptr {};
-} // namespace std
+#include <memory>
namespace gsl {
template <typename T>
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/multiple-inheritance.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/multiple-inheritance.cpp
index 6004ab3d812ea..257e16ab18f2f 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/multiple-inheritance.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/multiple-inheritance.cpp
@@ -163,3 +163,22 @@ struct S2 { int i; };
struct S3 : S1, S2 {};
} // namespace N
+
+namespace M {
+
+class basic_ios { int state; };
+class ostream : virtual public basic_ios { int more_state; };
+class OStringStream final : public ostream {};
+
+struct A { int x; };
+struct B : A {};
+// CHECK-MESSAGES: [[@LINE+1]]:1: warning: inheriting multiple classes that aren't pure virtual is discouraged [misc-multiple-inheritance]
+struct C : A, B {};
+
+struct VA { virtual void f(); };
+struct VB : VA { virtual void g(); };
+struct VI : virtual VA { virtual void h() = 0; };
+// CHECK-MESSAGES: [[@LINE+1]]:1: warning: inheriting multiple classes that aren't pure virtual is discouraged [misc-multiple-inheritance]
+struct VD : VI, VB {};
+
+} // namespace M
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/uniqueptr-reset-release.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/uniqueptr-reset-release.cpp
index 629f55a96f3b8..f14598d2eb4b9 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/uniqueptr-reset-release.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/uniqueptr-reset-release.cpp
@@ -2,21 +2,7 @@
// CHECK-FIXES: #include <utility>
-namespace std {
-
-template <typename T>
-struct default_delete {};
-
-template <typename T, class Deleter = std::default_delete<T>>
-struct unique_ptr {
- unique_ptr();
- explicit unique_ptr(T *);
- template <typename U, typename E>
- unique_ptr(unique_ptr<U, E> &&);
- void reset(T *);
- T *release();
-};
-} // namespace std
+#include <memory>
struct Foo {};
struct Bar : Foo {};
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/replace-auto-ptr/memory.h b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/replace-auto-ptr/memory.h
deleted file mode 100644
index bc476ced927a5..0000000000000
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/replace-auto-ptr/memory.h
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef INPUTS_MEMORY_H
-#define INPUTS_MEMORY_H
-
-namespace std {
-
-inline namespace _1 {
-
-template <class Y> struct auto_ptr_ref {
- Y *y_;
-};
-
-template <class X> class auto_ptr {
-public:
- typedef X element_type;
- explicit auto_ptr(X *p = 0) throw() {}
- auto_ptr(auto_ptr &) throw() {}
- template <class Y> auto_ptr(auto_ptr<Y> &) throw() {}
- auto_ptr &operator=(auto_ptr &) throw() { return *this; }
- template <class Y> auto_ptr &operator=(auto_ptr<Y> &) throw() {
- return *this;
- }
- auto_ptr &operator=(auto_ptr_ref<X> r) throw() { return *this; }
- ~auto_ptr() throw() {}
- auto_ptr(auto_ptr_ref<X> r) throw() : x_(r.y_) {}
- template <class Y> operator auto_ptr_ref<Y>() throw() {
- auto_ptr_ref<Y> r;
- r.y_ = x_;
- return r;
- }
- template <class Y> operator auto_ptr<Y>() throw() { return auto_ptr<Y>(x_); }
-
-private:
- X *x_;
-};
-
-template <> class auto_ptr<void> {
-public:
- typedef void element_type;
-};
-
-} // namespace _1
-
-} // end namespace std
-
-#endif // INPUTS_MEMORY_H
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/smart-ptr/shared_ptr.h b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/smart-ptr/shared_ptr.h
index 337cb28228b09..ef00360c87d72 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/smart-ptr/shared_ptr.h
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/smart-ptr/shared_ptr.h
@@ -1,33 +1 @@
-namespace std {
-
-template <typename type>
-class __shared_ptr {
-protected:
- __shared_ptr();
- __shared_ptr(type *ptr);
- ~__shared_ptr();
-public:
- type &operator*() { return *ptr; }
- type *operator->() { return ptr; }
- type *release();
- void reset();
- void reset(type *pt);
-
-private:
- type *ptr;
-};
-
-template <typename type>
-class shared_ptr : public __shared_ptr<type> {
-public:
- shared_ptr();
- shared_ptr(type *ptr);
- shared_ptr(const shared_ptr<type> &t);
- shared_ptr(shared_ptr<type> &&t);
- ~shared_ptr();
- shared_ptr &operator=(shared_ptr &&);
- template <typename T>
- shared_ptr &operator=(shared_ptr<T> &&);
-};
-
-} // namespace std
+#include <memory>
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/smart-ptr/unique_ptr.h b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/smart-ptr/unique_ptr.h
index 5dc9e02b637a2..ef00360c87d72 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/smart-ptr/unique_ptr.h
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/smart-ptr/unique_ptr.h
@@ -1,28 +1 @@
-namespace std {
-
-template <typename T>
-class default_delete {};
-
-template <typename type, typename Deleter = std::default_delete<type>>
-class unique_ptr {
-public:
- unique_ptr() {}
- unique_ptr(type *ptr) {}
- unique_ptr(const unique_ptr<type> &t) = delete;
- unique_ptr(unique_ptr<type> &&t) {}
- ~unique_ptr() {}
- type &operator*() { return *ptr; }
- type *operator->() { return ptr; }
- type *release() { return ptr; }
- void reset() {}
- void reset(type *pt) {}
- void reset(type pt) {}
- unique_ptr &operator=(unique_ptr &&) { return *this; }
- template <typename T>
- unique_ptr &operator=(unique_ptr<T> &&) { return *this; }
-
-private:
- type *ptr;
-};
-
-} // namespace std
+#include <memory>
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/make-shared-header.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/make-shared-header.cpp
index 0e95d070ae55e..65bf830fd3142 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/make-shared-header.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/make-shared-header.cpp
@@ -2,10 +2,9 @@
// RUN: -config="{CheckOptions: \
// RUN: {modernize-make-shared.MakeSmartPtrFunction: 'my::MakeShared', \
// RUN: modernize-make-shared.MakeSmartPtrFunctionHeader: 'make_shared_util.h' \
-// RUN: }}" \
-// RUN: -- -I %S/Inputs/smart-ptr
+// RUN: }}"
-#include "shared_ptr.h"
+#include <memory>
// CHECK-FIXES: #include "make_shared_util.h"
void f() {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique-cxx11.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique-cxx11.cpp
index e2944b8080c53..539943e7ba749 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique-cxx11.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique-cxx11.cpp
@@ -1,7 +1,7 @@
-// RUN: %check_clang_tidy -std=c++11 %s modernize-make-unique %t -- -- -I %S/Inputs/smart-ptr
+// RUN: %check_clang_tidy -std=c++11 %s modernize-make-unique %t
-#include "unique_ptr.h"
-// CHECK-FIXES: #include "unique_ptr.h"
+#include <memory>
+// CHECK-FIXES: #include <memory>
void f() {
auto my_ptr = std::unique_ptr<int>(new int(1));
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique-default-init.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique-default-init.cpp
index 50e7beda68a43..aec7189fb2b51 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique-default-init.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique-default-init.cpp
@@ -2,12 +2,10 @@
// RUN: -config="{CheckOptions: \
// RUN: {modernize-make-unique.IgnoreDefaultInitialization: \
// RUN: 'false'}} \
-// RUN: }" \
-// RUN: -- -I %S/Inputs/smart-ptr
+// RUN: }"
-#include "unique_ptr.h"
+#include <memory>
#include <vector>
-// CHECK-FIXES: #include <memory>
void basic() {
std::unique_ptr<int> P1 = std::unique_ptr<int>(new int());
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique-header.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique-header.cpp
index 5ffd9483a146a..d58f52c06194e 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique-header.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique-header.cpp
@@ -2,10 +2,9 @@
// RUN: -config="{CheckOptions: \
// RUN: {modernize-make-unique.MakeSmartPtrFunction: 'my::MakeUnique', \
// RUN: modernize-make-unique.MakeSmartPtrFunctionHeader: 'make_unique_util.h' \
-// RUN: }}" \
-// RUN: -- -I %S/Inputs/smart-ptr
+// RUN: }}"
-#include "unique_ptr.h"
+#include <memory>
// CHECK-FIXES: #include "make_unique_util.h"
void f() {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique-macros.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique-macros.cpp
index 78beb911f5a0a..e75daf9938c73 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique-macros.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/make-unique-macros.cpp
@@ -2,7 +2,7 @@
// RUN: -config="{CheckOptions: {modernize-make-unique.IgnoreMacros: false}}" \
// RUN: -- -I %S/Inputs/smart-ptr
-#include "unique_ptr.h"
+#include <memory>
class Foo {};
class Bar {};
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp
index 371f3ddf6d650..68c961d92d2dd 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/replace-auto-ptr.cpp
@@ -1,8 +1,8 @@
-// RUN: %check_clang_tidy %s modernize-replace-auto-ptr %t -- -- -isystem %S/Inputs/replace-auto-ptr
+// RUN: %check_clang_tidy %s modernize-replace-auto-ptr %t
// CHECK-FIXES: #include <utility>
-#include "memory.h"
+#include <memory>
// Instrumentation for auto_ptr_ref test.
struct Base {};
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-emplace.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-emplace.cpp
index 7d88c1be24747..bed5c88ed47d8 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-emplace.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-emplace.cpp
@@ -8,6 +8,7 @@
// RUN: '::std::make_pair; ::std::make_tuple; ::test::MakeSingle'}}"
#include <utility>
+#include <memory>
namespace std {
template <typename E>
@@ -313,12 +314,6 @@ tuple<typename remove_reference<Ts>::type...> make_tuple(Ts &&...) {
return {};
}
-template <typename T>
-class unique_ptr {
-public:
- explicit unique_ptr(T *) {}
- ~unique_ptr();
-};
} // namespace std
namespace llvm {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-ranges.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-ranges.cpp
index 21e6c32720163..80b054b74b49a 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-ranges.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-ranges.cpp
@@ -6,7 +6,7 @@
// CHECK-FIXES: #include <ranges>
#include "use-ranges/fake_std.h"
-#include "smart-ptr/unique_ptr.h"
+#include <memory>
void Positives() {
std::vector<int> I, J;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/inefficient-string-concatenation.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/inefficient-string-concatenation.cpp
index 72080ed39e59b..adc37e4c4bedf 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/performance/inefficient-string-concatenation.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/performance/inefficient-string-concatenation.cpp
@@ -32,5 +32,11 @@ int main() {
f(mystr2 + mystr1);
mystr1 = g(mystr1);
}
+
+ do {
+ mystr1 = mystr1 + mystr2;
+ // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: string concatenation results in allocation of unnecessary temporary strings; consider using 'operator+=' or 'string::append()' instead
+ } while (0);
+
return 0;
}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/move-constructor-init.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/move-constructor-init.cpp
index b8395c1eca7e1..ef42e69b37829 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/performance/move-constructor-init.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/performance/move-constructor-init.cpp
@@ -2,17 +2,17 @@
// RUN: -config='{CheckOptions: \
// RUN: {modernize-pass-by-value.ValuesOnly: true}}' -- -isystem %S/../Inputs/Headers
#include <s.h>
+#include <type_traits>
// CHECK-FIXES: #include <utility>
-template <class T> struct remove_reference {typedef T type;};
-template <class T> struct remove_reference<T&> {typedef T type;};
-template <class T> struct remove_reference<T&&> {typedef T type;};
+namespace std {
template <typename T>
typename remove_reference<T>::type&& move(T&& arg) {
return static_cast<typename remove_reference<T>::type&&>(arg);
}
+} // namespace std
struct C {
C() = default;
@@ -37,7 +37,7 @@ struct D : B {
struct E : B {
E() : B() {}
E(const E &RHS) : B(RHS) {}
- E(E &&RHS) : B(move(RHS)) {} // ok
+ E(E &&RHS) : B(std::move(RHS)) {} // ok
};
struct F {
@@ -81,7 +81,7 @@ struct M {
struct N {
B Mem;
- N(N &&RHS) : Mem(move(RHS.Mem)) {}
+ N(N &&RHS) : Mem(std::move(RHS.Mem)) {}
};
struct O {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/use-std-move.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/use-std-move.cpp
index c7014859adf50..87a5c90030d8f 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/performance/use-std-move.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/performance/use-std-move.cpp
@@ -289,6 +289,21 @@ void NonConvertibleNonTrivialMoveAssignInLoop(NonTrivialMoveAssign& target, NonT
target = source;
}
+// Check moving incomplete definition
+// ----------------------------------
+
+struct fwd_cls;
+struct fwd_cls {
+ void ConvertibleNonTrivialMoveAssignReferecingForwardDecl(fwd_cls src) {
+ // CHECK-MESSAGES: [[@LINE+2]]:13: warning: 'src' could be moved here [performance-use-std-move]
+ // CHECK-FIXES: *this = std::move(src);
+ *this = src;
+ }
+ fwd_cls &operator=(const fwd_cls &C);
+ fwd_cls &operator=(fwd_cls &&);
+};
+
+
// Check moving for invalid / non profitable type or operation
// -----------------------------------------------------------
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/ambiguous-smartptr-reset-call-custom-pointers.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/ambiguous-smartptr-reset-call-custom-pointers.cpp
index df3f16a9cf9ec..679ba48c6d432 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/ambiguous-smartptr-reset-call-custom-pointers.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/ambiguous-smartptr-reset-call-custom-pointers.cpp
@@ -1,10 +1,9 @@
// RUN: %check_clang_tidy %s readability-ambiguous-smartptr-reset-call %t -- \
// RUN: -config='{CheckOptions: \
// RUN: {readability-ambiguous-smartptr-reset-call.SmartPointers: "::std::unique_ptr;::other_ptr"}}' \
-// RUN: --fix-notes -- -I %S/../modernize/Inputs/smart-ptr
+// RUN: --fix-notes
-#include "unique_ptr.h"
-#include "shared_ptr.h"
+#include <memory>
template <typename T>
struct other_ptr {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/ambiguous-smartptr-reset-call.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/ambiguous-smartptr-reset-call.cpp
index e6e7eb9231ec2..1e7bfa0df5e38 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/ambiguous-smartptr-reset-call.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/ambiguous-smartptr-reset-call.cpp
@@ -1,7 +1,6 @@
-// RUN: %check_clang_tidy %s readability-ambiguous-smartptr-reset-call %t --fix-notes -- -I %S/../modernize/Inputs/smart-ptr
+// RUN: %check_clang_tidy %s readability-ambiguous-smartptr-reset-call %t --fix-notes
-#include "unique_ptr.h"
-#include "shared_ptr.h"
+#include <memory>
template <typename T>
struct non_default_reset_ptr {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type.cpp
index d78ea345b3560..c6e02bd0fe990 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type.cpp
@@ -3,14 +3,7 @@
// p# = positive test
// n# = negative test
-namespace std {
-template< class T >
-struct add_cv { typedef const volatile T type; };
-
-template< class T> struct add_const { typedef const T type; };
-
-template< class T> struct add_volatile { typedef volatile T type; };
-}
+#include <type_traits>
const int p1() {
// CHECK-MESSAGES: [[@LINE-1]]:1: warning: return type 'const int' is 'const'-qualified at the top level, which may reduce code readability without improving const correctness
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/container-data-pointer.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/container-data-pointer.cpp
index 70ade83eed0f0..4fd228a554d7d 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/container-data-pointer.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/container-data-pointer.cpp
@@ -2,27 +2,9 @@
// RUN: %check_clang_tidy -check-suffixes=,WITH-CONFIG %s readability-container-data-pointer %t -- -config="{CheckOptions: {readability-container-data-pointer.IgnoredContainers: '::std::basic_string'}}" -- -fno-delayed-template-parsing
#include <string>
+#include <type_traits>
#include <vector>
-#include <utility>
-
-typedef __SIZE_TYPE__ size_t;
-
-namespace std {
-
-template <typename T>
-struct is_integral;
-
-template <>
-struct is_integral<size_t> {
- static const bool value = true;
-};
-
-template <typename T>
-struct unique_ptr {
- T &operator*() const;
- T *operator->() const;
-};
-}
+#include <memory>
template <typename T>
void f(const T *);
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/container-size-empty.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/container-size-empty.cpp
index 93dc00845290d..2b8b3261ac765 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/container-size-empty.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/container-size-empty.cpp
@@ -2,6 +2,7 @@
// RUN: -config="{CheckOptions: {readability-container-size-empty.ExcludedComparisonTypes: '::std::array;::IgnoredDummyType'}}" \
// RUN: -- -fno-delayed-template-parsing
#include <string>
+#include <memory>
namespace std {
template <typename T> struct vector {
@@ -682,14 +683,6 @@ void instantiator() {
instantiatedTemplateWithSizeCall<std::vector<int>>();
}
-namespace std {
-template <typename T>
-struct unique_ptr {
- T *operator->() const;
- T &operator*() const;
-};
-} // namespace std
-
bool call_through_unique_ptr(const std::unique_ptr<std::vector<int>> &ptr) {
return ptr->size() > 0;
// CHECK-MESSAGES: :[[@LINE-1]]:10: warning: the 'empty' method should be used
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/else-after-return-cxx20.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/else-after-return-cxx20.cpp
index 589d2b6e2bb68..c83b48dcfdb8d 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/else-after-return-cxx20.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/else-after-return-cxx20.cpp
@@ -38,4 +38,26 @@ void f() {
// CHECK-MESSAGES: :[[@LINE-1]]:3: warning: do not use 'else' after 'return'
// CHECK-FIXES: {{^}} {{[[][[]}}unlikely{{[]][]]}} // comment-4
g();
+
+ if (false)
+ [[clang::musttail]] return f();
+ else // comment-5
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: do not use 'else' after 'return'
+ // CHECK-FIXES: {{^}} // comment-5
+ g();
+
+ if (false) [[likely]]
+ [[clang::musttail]] return f();
+ else // comment-6
+ // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: do not use 'else' after 'return'
+ // CHECK-FIXES: {{^}} // comment-6
+ g();
+
+ if (false) [[likely]] {
+ [[clang::musttail]] return f();
+ } else { // comment-7
+ // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: do not use 'else' after 'return'
+ // CHECK-FIXES: {{^}} } // comment-7
+ g();
+ }
}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/else-after-return.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/else-after-return.cpp
index 7ab5acfe9d966..1bbbdbc2a5683 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/else-after-return.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/else-after-return.cpp
@@ -437,6 +437,45 @@ void testLabels(bool b) {
// CHECK-FIXES: {{^}} // comment-27
f(0);
}
+
+ if (true) {
+ goto skip_over_return;
+ return;
+skip_over_return:
+ f(0);
+ } else {
+ f(0);
+ }
+
+ if (true) {
+ goto skip_over_return2;
+ return;
+skip_over_return2:
+ // No statement after label. Valid since C++23/C23.
+ } else {
+ f(0);
+ }
+
+ if (true) {
+ goto skip_over_return3;
+ return;
+skip_over_return3:
+ return;
+ } else { // comment-28
+ // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: do not use 'else' after 'return'
+ // CHECK-FIXES: {{^}} } // comment-28
+ f(0);
+ }
+}
+
+void testExcessiveBracing() {
+ if (false) {
+ {{{ return; }}}
+ } else { // comment-29
+ // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: do not use 'else' after 'return'
+ // CHECK-FIXES: {{^}} } // comment-29
+ return;
+ }
}
[[noreturn]] void noReturn();
@@ -448,18 +487,18 @@ struct NoReturnMember {
void testNoReturn() {
if (true) {
noReturn();
- } else { // comment-28
+ } else { // comment-30
// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: do not use 'else' after calling a function that doesn't return
- // CHECK-FIXES: {{^}} } // comment-28
+ // CHECK-FIXES: {{^}} } // comment-30
f(0);
}
if (true) {
NoReturnMember f;
f.noReturn();
- } else { // comment-29
+ } else { // comment-31
// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: do not use 'else' after calling a function that doesn't return
- // CHECK-FIXES: {{^}} } // comment-29
+ // CHECK-FIXES: {{^}} } // comment-31
f(0);
}
}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion-allow-in-conditions.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion-allow-in-conditions.cpp
index ef591940917cd..3c9e14449e366 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion-allow-in-conditions.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion-allow-in-conditions.cpp
@@ -3,11 +3,14 @@
// RUN: {readability-implicit-bool-conversion.AllowIntegerConditions: true, \
// RUN: readability-implicit-bool-conversion.AllowPointerConditions: true}}'
+#include <string>
+
template<typename T>
void functionTaking(T);
int functionReturningInt();
int* functionReturningPointer();
+void* functionReturningPointerWithStringArg(const std::string&);
struct Struct {
int member;
@@ -74,4 +77,9 @@ void implicitConversionPointerToBoolInConditionalsIsAllowed() {
if (memberPointer) {}
int value3 = memberPointer ? 1 : 2;
int value4 = (not memberPointer) ? 1 : 2;
+
+ // Passing a string literal creates a temporary std::string, which causes
+ // Clang to wrap the condition in ExprWithCleanups. This should still be
+ // allowed when AllowPointerConditions is true.
+ if (functionReturningPointerWithStringArg("input")) {}
}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-qualified-alias.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-qualified-alias.cpp
new file mode 100644
index 0000000000000..63fd558084370
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-qualified-alias.cpp
@@ -0,0 +1,203 @@
+// RUN: %check_clang_tidy -std=c++11-or-later %s readability-redundant-qualified-alias %t
+// RUN: %check_clang_tidy -check-suffix=NS -std=c++11-or-later %s readability-redundant-qualified-alias %t -- \
+// RUN: -config='{CheckOptions: { readability-redundant-qualified-alias.OnlyNamespaceScope: true }}'
+// RUN: %check_clang_tidy -check-suffixes=,CXX23 -std=c++23-or-later %s readability-redundant-qualified-alias %t
+// RUN: %check_clang_tidy -check-suffixes=NS,NS-CXX23 -std=c++23-or-later %s readability-redundant-qualified-alias %t -- \
+// RUN: -config='{CheckOptions: { readability-redundant-qualified-alias.OnlyNamespaceScope: true }}'
+
+namespace n1 {
+struct Foo {};
+struct Bar {};
+struct Attr {};
+enum PlainEnum { V0 };
+enum class ScopedEnum { V1 };
+struct Commented {};
+struct AfterType {};
+struct Elab {};
+struct MacroEq {};
+struct MacroType {};
+struct PtrType {};
+struct LocalType {};
+} // namespace n1
+
+namespace n2 {
+namespace n3 {
+struct Deep {};
+} // namespace n3
+} // namespace n2
+
+namespace td {
+typedef n1::Foo TypedefFoo;
+} // namespace td
+
+struct GlobalType {};
+struct Outer {
+ struct Inner {};
+};
+
+using Foo = n1::Foo;
+// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: type alias is redundant; use a using-declaration instead [readability-redundant-qualified-alias]
+// CHECK-MESSAGES-NS: :[[@LINE-2]]:7: warning: type alias is redundant; use a using-declaration instead [readability-redundant-qualified-alias]
+// CHECK-FIXES: using n1::Foo;
+
+using Bar = ::n1::Bar;
+// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS: :[[@LINE-2]]:7: warning: type alias is redundant; use a using-declaration instead
+// CHECK-FIXES: using ::n1::Bar;
+
+using Attr = n1::Attr __attribute__((aligned(8)));
+// CHECK-MESSAGES-NOT: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS-NOT: warning: type alias is redundant; use a using-declaration instead
+
+namespace alias_attr {
+using Foo [[deprecated("alias attr")]] = n1::Foo;
+// CHECK-MESSAGES-NOT: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS-NOT: warning: type alias is redundant; use a using-declaration instead
+} // namespace alias_attr
+
+using Deep = n2::n3::Deep;
+// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS: :[[@LINE-2]]:7: warning: type alias is redundant; use a using-declaration instead
+// CHECK-FIXES: using n2::n3::Deep;
+
+using TypedefFoo = td::TypedefFoo;
+// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS: :[[@LINE-2]]:7: warning: type alias is redundant; use a using-declaration instead
+// CHECK-FIXES: using td::TypedefFoo;
+
+using GlobalType = ::GlobalType;
+// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS: :[[@LINE-2]]:7: warning: type alias is redundant; use a using-declaration instead
+// CHECK-FIXES: using ::GlobalType;
+
+using PlainEnum = n1::PlainEnum;
+// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS: :[[@LINE-2]]:7: warning: type alias is redundant; use a using-declaration instead
+// CHECK-FIXES: using n1::PlainEnum;
+
+using ScopedEnum = n1::ScopedEnum;
+// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS: :[[@LINE-2]]:7: warning: type alias is redundant; use a using-declaration instead
+// CHECK-FIXES: using n1::ScopedEnum;
+
+using Inner = Outer::Inner;
+// CHECK-MESSAGES-NOT: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS-NOT: warning: type alias is redundant; use a using-declaration instead
+
+using Builtin = int;
+// CHECK-MESSAGES-NOT: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS-NOT: warning: type alias is redundant; use a using-declaration instead
+
+using PtrType = n1::PtrType *;
+// CHECK-MESSAGES-NOT: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS-NOT: warning: type alias is redundant; use a using-declaration instead
+
+namespace templ {
+template <typename T>
+struct Vec {};
+} // namespace templ
+
+using Vec = templ::Vec<int>;
+// CHECK-MESSAGES-NOT: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS-NOT: warning: type alias is redundant; use a using-declaration instead
+
+namespace templ_alias {
+template <typename T>
+using Foo = n1::Foo;
+// CHECK-MESSAGES-NOT: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS-NOT: warning: type alias is redundant; use a using-declaration instead
+} // namespace templ_alias
+
+template <typename T>
+struct Dependent {
+ using X = typename T::X;
+ // CHECK-MESSAGES-NOT: warning: type alias is redundant; use a using-declaration instead
+ // CHECK-MESSAGES-NS-NOT: warning: type alias is redundant; use a using-declaration instead
+};
+
+using Elab = class n1::Elab;
+// CHECK-MESSAGES-NOT: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS-NOT: warning: type alias is redundant; use a using-declaration instead
+
+using Commented /*comment*/ = n1::Commented;
+// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS: :[[@LINE-2]]:7: warning: type alias is redundant; use a using-declaration instead
+// CHECK-FIXES: using{{[ ]+}}/*comment*/{{[ ]+}}n1::Commented;
+
+using AfterType = n1::AfterType /*rhs-comment*/;
+// CHECK-MESSAGES: :[[@LINE-1]]:7: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS: :[[@LINE-2]]:7: warning: type alias is redundant; use a using-declaration instead
+// CHECK-FIXES: using n1::AfterType /*rhs-comment*/;
+
+#define DECL_END ;
+using MacroDeclEnd = n1::MacroType DECL_END
+// CHECK-MESSAGES-NOT: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS-NOT: warning: type alias is redundant; use a using-declaration instead
+
+#define ALIAS MacroType
+using ALIAS = n1::MacroType;
+// CHECK-MESSAGES-NOT: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS-NOT: warning: type alias is redundant; use a using-declaration instead
+
+#define RHS n1::MacroType
+using MacroType = RHS;
+// CHECK-MESSAGES-NOT: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS-NOT: warning: type alias is redundant; use a using-declaration instead
+
+#define EQ =
+using MacroEq EQ n1::MacroEq;
+// CHECK-MESSAGES-NOT: warning: type alias is redundant; use a using-declaration instead
+// CHECK-MESSAGES-NS-NOT: warning: type alias is redundant; use a using-declaration instead
+
+struct Base {
+ using T = n1::Foo;
+};
+
+struct Derived : Base {
+ using T = Base::T;
+ // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: type alias is redundant; use a using-declaration instead
+ // CHECK-MESSAGES-NS-NOT: warning: type alias is redundant; use a using-declaration instead
+ // CHECK-FIXES: using Base::T;
+};
+
+struct ClassScopeNamespaceAlias {
+ using Foo = n1::Foo;
+ // CHECK-MESSAGES-NOT: warning: type alias is redundant; use a using-declaration instead
+ // CHECK-MESSAGES-NS-NOT: warning: type alias is redundant; use a using-declaration instead
+};
+
+void local_scope() {
+ using LocalType = n1::LocalType;
+ // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: type alias is redundant; use a using-declaration instead
+ // CHECK-MESSAGES-NS-NOT: warning: type alias is redundant; use a using-declaration instead
+ // CHECK-FIXES: using n1::LocalType;
+}
+
+#if __cplusplus >= 202302L
+void cxx23_init_statement_scope(bool Cond) {
+ if (using Foo = n1::Foo; Cond) {
+ }
+ // CHECK-MESSAGES-CXX23-NOT: warning: type alias is redundant; use a using-declaration instead
+ // CHECK-MESSAGES-NS-CXX23-NOT: warning: type alias is redundant; use a using-declaration instead
+
+ switch (using Bar = ::n1::Bar; 0) {
+ default:
+ break;
+ }
+ // CHECK-MESSAGES-CXX23-NOT: warning: type alias is redundant; use a using-declaration instead
+ // CHECK-MESSAGES-NS-CXX23-NOT: warning: type alias is redundant; use a using-declaration instead
+
+ for (using Deep = n2::n3::Deep; Cond;) {
+ Cond = false;
+ }
+ // CHECK-MESSAGES-CXX23-NOT: warning: type alias is redundant; use a using-declaration instead
+ // CHECK-MESSAGES-NS-CXX23-NOT: warning: type alias is redundant; use a using-declaration instead
+
+ int Values[] = {0};
+ for (using GlobalType = ::GlobalType; int V : Values) {
+ (void)V;
+ }
+ // CHECK-MESSAGES-CXX23-NOT: warning: type alias is redundant; use a using-declaration instead
+ // CHECK-MESSAGES-NS-CXX23-NOT: warning: type alias is redundant; use a using-declaration instead
+}
+#endif // __cplusplus >= 202302L
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-smartptr-get-macros.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-smartptr-get-macros.cpp
index 4c8bb84414355..05b52a67bfc87 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-smartptr-get-macros.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-smartptr-get-macros.cpp
@@ -1,17 +1,7 @@
// RUN: %check_clang_tidy %s readability-redundant-smartptr-get %t -- \
// RUN: -config="{CheckOptions: {readability-redundant-smartptr-get.IgnoreMacros: false}}"
-namespace std {
-
-template <typename T>
-struct shared_ptr {
- T &operator*() const;
- T *operator->() const;
- T *get() const;
- explicit operator bool() const noexcept;
-};
-
-} // namespace std
+#include <memory>
#define MACRO(p) p.get()
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-smartptr-get.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-smartptr-get.cpp
index 2d88281eb8524..b74d28f4873bb 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-smartptr-get.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-smartptr-get.cpp
@@ -1,42 +1,9 @@
// RUN: %check_clang_tidy %s readability-redundant-smartptr-get %t
#include <vector>
+#include <memory>
#define NULL __null
-namespace std {
-
-template <typename T>
-struct unique_ptr {
- T& operator*() const;
- T* operator->() const;
- T* get() const;
- explicit operator bool() const noexcept;
-};
-
-template <typename T>
-struct unique_ptr<T[]> {
- T& operator[](unsigned) const;
- T* get() const;
- explicit operator bool() const noexcept;
-};
-
-template <typename T>
-struct shared_ptr {
- T& operator*() const;
- T* operator->() const;
- T* get() const;
- explicit operator bool() const noexcept;
-};
-
-template <typename T>
-struct shared_ptr<T[]> {
- T& operator[](unsigned) const;
- T* get() const;
- explicit operator bool() const noexcept;
-};
-
-} // namespace std
-
struct Bar {
void Do();
void ConstDo() const;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/uniqueptr-delete-release.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/uniqueptr-delete-release.cpp
index b4695394f6be8..0742b970e7729 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/uniqueptr-delete-release.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/uniqueptr-delete-release.cpp
@@ -1,24 +1,8 @@
// RUN: %check_clang_tidy %s readability-uniqueptr-delete-release %t -check-suffix=NULLPTR
// RUN: %check_clang_tidy %s readability-uniqueptr-delete-release %t -check-suffix=RESET -config='{ \
// RUN: CheckOptions: {readability-uniqueptr-delete-release.PreferResetCall: true}}'
-namespace std {
-template <typename T>
-struct default_delete {};
-
-template <typename T, typename D = default_delete<T>>
-class unique_ptr {
- public:
- unique_ptr();
- ~unique_ptr();
- explicit unique_ptr(T*);
- template <typename U, typename E>
- unique_ptr(unique_ptr<U, E>&&);
- T* release();
- void reset(T *P = nullptr);
- T &operator*() const;
- T *operator->() const;
-};
-} // namespace std
+
+#include <memory>
std::unique_ptr<int>& ReturnsAUnique();
@@ -30,7 +14,7 @@ void Positives() {
// CHECK-FIXES-NULLPTR: P = nullptr;
// CHECK-FIXES-RESET: P.reset();
- auto P2 = P;
+ auto &P2 = P;
delete P2.release();
// CHECK-MESSAGES-NULLPTR: :[[@LINE-1]]:3: warning: prefer '= nullptr' to reset 'unique_ptr<>' objects
// CHECK-MESSAGES-RESET: :[[@LINE-2]]:3: warning: prefer 'reset()' to reset 'unique_ptr<>' objects
diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index f0d10603374b9..b3bc56209fc9e 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -229,6 +229,44 @@ if(GCC_INSTALL_PREFIX AND NOT USE_DEPRECATED_GCC_INSTALL_PREFIX)
"See https://github.com/llvm/llvm-project/pull/77537 for detail.")
endif()
+cmake_dependent_option(CLANG_USE_XCSELECT "Use libxcselect to find the macOS SDK." OFF "APPLE" OFF)
+
+if(CLANG_USE_XCSELECT)
+ if(DEFAULT_SYSROOT)
+ message(FATAL_ERROR "Setting DEFAULT_SYSROOT is incompatible with CLANG_USE_XCSELECT.")
+ endif()
+
+ check_include_file(xcselect.h CLANG_HAVE_XCSELECT_H)
+ if(NOT CLANG_HAVE_XCSELECT_H)
+ message(FATAL_ERROR "CLANG_USE_XCSELECT is enabled but xcselect.h was not found.")
+ endif()
+
+ include(CheckSymbolExists)
+ list(APPEND CMAKE_REQUIRED_LIBRARIES xcselect)
+ check_symbol_exists(xcselect_host_sdk_path xcselect.h CLANG_HAVE_XCSELECT_HOST_SDK_PATH)
+ list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES xcselect)
+
+ if(NOT CLANG_HAVE_XCSELECT_HOST_SDK_PATH)
+ message(FATAL_ERROR "CLANG_USE_XCSELECT is enabled but either libxcselect is not available "
+ "or it is missing xcselect_host_sdk_path.")
+ endif()
+
+ set(XCSELECT_VALID_POLICIES LATEST MATCHING_ONLY MATCHING_PREFERRED)
+ set(CLANG_XCSELECT_HOST_SDK_POLICY "LATEST" CACHE STRING
+ "Policy to use for xcselect. One of: ${XCSELECT_VALID_POLICIES}")
+ set_property(CACHE CLANG_XCSELECT_HOST_SDK_POLICY PROPERTY STRINGS ${XCSELECT_VALID_POLICIES})
+ string(TOUPPER ${CLANG_XCSELECT_HOST_SDK_POLICY} CLANG_XCSELECT_HOST_SDK_POLICY)
+ list(JOIN XCSELECT_VALID_POLICIES "|" XCSELECT_POLICY_REGEX)
+
+ if(NOT CLANG_XCSELECT_HOST_SDK_POLICY MATCHES "^XCSELECT_HOST_SDK_POLICY_(${XCSELECT_POLICY_REGEX})$")
+ if(NOT CLANG_XCSELECT_HOST_SDK_POLICY IN_LIST XCSELECT_VALID_POLICIES)
+ message(FATAL_ERROR
+ "CLANG_XCSELECT_HOST_SDK_POLICY (${CLANG_XCSELECT_HOST_SDK_POLICY}) must be one of: ${XCSELECT_VALID_POLICIES}")
+ endif()
+ set(CLANG_XCSELECT_HOST_SDK_POLICY "XCSELECT_HOST_SDK_POLICY_${CLANG_XCSELECT_HOST_SDK_POLICY}")
+ endif()
+endif()
+
set(ENABLE_LINKER_BUILD_ID OFF CACHE BOOL "pass --build-id to ld")
set(ENABLE_X86_RELAX_RELOCATIONS ON CACHE BOOL
diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst
index e869afbcaef6f..342eb0bdb1279 100644
--- a/clang/docs/ClangFormatStyleOptions.rst
+++ b/clang/docs/ClangFormatStyleOptions.rst
@@ -1899,15 +1899,21 @@ the configuration (without a prefix: ``Auto``).
Dependent on the value, ``int f() { return 0; }`` can be put on a
single line.
- Possible values:
+ Nested configuration flags:
+
+ Different styles for merging short functions containing at most one
+ statement.
+
+ They can also be set as a single value for backwards compatibility. The choices are:
- * ``SFS_None`` (in configuration: ``None``)
+ * ``None``
Never merge functions into a single line.
- * ``SFS_InlineOnly`` (in configuration: ``InlineOnly``)
+ * ``InlineOnly``
Only merge functions defined inside a class. Same as ``inline``,
- except it does not imply ``empty``: i.e. top level empty functions
- are not merged either.
+ except it does not imply ``empty``: i.e. top level empty functions
+ are not merged either. This option is **deprecated** and is retained
+ for backwards compatibility. See ``Inline`` of ``ShortFunctionStyle``.
.. code-block:: c++
@@ -1920,8 +1926,10 @@ the configuration (without a prefix: ``Auto``).
void f() {
}
- * ``SFS_Empty`` (in configuration: ``Empty``)
- Only merge empty functions.
+ * ``Empty``
+ Only merge empty functions. This option is **deprecated** and is
+ retained for backwards compatibility. See ``Empty`` of
+ ``ShortFunctionStyle``.
.. code-block:: c++
@@ -1930,8 +1938,10 @@ the configuration (without a prefix: ``Auto``).
bar2();
}
- * ``SFS_Inline`` (in configuration: ``Inline``)
- Only merge functions defined inside a class. Implies ``empty``.
+ * ``Inline``
+ Only merge functions defined inside a class. Implies ``empty``. This
+ option is **deprecated** and is retained for backwards compatibility.
+ See ``Inline`` and ``Empty`` of ``ShortFunctionStyle``.
.. code-block:: c++
@@ -1943,7 +1953,7 @@ the configuration (without a prefix: ``Auto``).
}
void f() {}
- * ``SFS_All`` (in configuration: ``All``)
+ * ``All``
Merge all functions fitting on a single line.
.. code-block:: c++
@@ -1953,6 +1963,52 @@ the configuration (without a prefix: ``Auto``).
};
void f() { bar(); }
+ Also can be specified as a nested configuration flag:
+
+ .. code-block:: yaml
+
+ # Example of usage:
+ AllowShortFunctionsOnASingleLine: InlineOnly
+
+ # or more granular control:
+ AllowShortFunctionsOnASingleLine:
+ Empty: false
+ Inline: true
+ Other: false
+
+ * ``bool Empty`` Merge top-level empty functions.
+
+ .. code-block:: c++
+
+ void f() {}
+ void f2() {
+ bar2();
+ }
+ void f3() { /* comment */ }
+
+ * ``bool Inline`` Merge functions defined inside a class.
+
+ .. code-block:: c++
+
+ class Foo {
+ void f() { foo(); }
+ void g() {}
+ };
+ void f() {
+ foo();
+ }
+ void f() {
+ }
+
+ * ``bool Other`` Merge all functions fitting on a single line. Please note that this
+ control does not include ``Empty``.
+
+ .. code-block:: c++
+
+ class Foo {
+ void f() { foo(); }
+ };
+ void f() { bar(); }
.. _AllowShortIfStatementsOnASingleLine:
@@ -2087,6 +2143,42 @@ the configuration (without a prefix: ``Auto``).
**AllowShortNamespacesOnASingleLine** (``Boolean``) :versionbadge:`clang-format 20` :ref:`¶ <AllowShortNamespacesOnASingleLine>`
If ``true``, ``namespace a { class b; }`` can be put on a single line.
+.. _AllowShortRecordOnASingleLine:
+
+**AllowShortRecordOnASingleLine** (``ShortRecordStyle``) :versionbadge:`clang-format 23` :ref:`¶ <AllowShortRecordOnASingleLine>`
+ Dependent on the value, ``struct bar { int i; };`` can be put on a single
+ line.
+
+ Possible values:
+
+ * ``SRS_Never`` (in configuration: ``Never``)
+ Never merge records into a single line.
+
+ * ``SRS_EmptyAndAttached`` (in configuration: ``EmptyAndAttached``)
+ Only merge empty records if the opening brace was not wrapped,
+ i.e. the corresponding ``BraceWrapping.After...`` option was not set.
+
+ * ``SRS_Empty`` (in configuration: ``Empty``)
+ Only merge empty records.
+
+ .. code-block:: c++
+
+ struct foo {};
+ struct bar
+ {
+ int i;
+ };
+
+ * ``SRS_Always`` (in configuration: ``Always``)
+ Merge all records that fit on a single line.
+
+ .. code-block:: c++
+
+ struct foo {};
+ struct bar { int i; };
+
+
+
.. _AlwaysBreakAfterDefinitionReturnType:
**AlwaysBreakAfterDefinitionReturnType** (``DefinitionReturnTypeBreakingStyle``) :versionbadge:`clang-format 3.7` :ref:`¶ <AlwaysBreakAfterDefinitionReturnType>`
diff --git a/clang/docs/LibTooling.rst b/clang/docs/LibTooling.rst
index 87d84321ab283..c6687fb9642f9 100644
--- a/clang/docs/LibTooling.rst
+++ b/clang/docs/LibTooling.rst
@@ -75,7 +75,7 @@ and automatic location of the compilation database using source files paths.
auto ExpectedParser = CommonOptionsParser::create(argc, argv, MyToolCategory);
if (!ExpectedParser) {
// Fail gracefully for unsupported options.
- llvm::errs() << ExpectedParser.takeError();
+ llvm::errs() << toString(ExpectedParser.takeError());
return 1;
}
CommonOptionsParser& OptionsParser = ExpectedParser.get();
@@ -142,7 +142,7 @@ version of this example tool is also checked into the clang tree at
int main(int argc, const char **argv) {
auto ExpectedParser = CommonOptionsParser::create(argc, argv, MyToolCategory);
if (!ExpectedParser) {
- llvm::errs() << ExpectedParser.takeError();
+ llvm::errs() << toString(ExpectedParser.takeError());
return 1;
}
CommonOptionsParser& OptionsParser = ExpectedParser.get();
diff --git a/clang/docs/LifetimeSafety.rst b/clang/docs/LifetimeSafety.rst
new file mode 100644
index 0000000000000..ac76231dede67
--- /dev/null
+++ b/clang/docs/LifetimeSafety.rst
@@ -0,0 +1,609 @@
+========================
+Lifetime Safety Analysis
+========================
+
+.. contents::
+ :local:
+
+Introduction
+============
+
+Clang Lifetime Safety Analysis is a C++ language extension which warns about
+potential dangling pointer defects in code. The analysis aims to detect
+when a pointer, reference or view type (such as ``std::string_view``) refers to an object
+that is no longer alive, a condition that leads to use-after-free bugs and
+security vulnerabilities. Common examples include pointers to stack variables
+that have gone out of scope, fields holding views to stack-allocated objects
+(dangling-field), returning pointers/references to stack variables
+(return stack address), or iterators into container elements invalidated by
+container operations (e.g., ``std::vector::push_back``).
+
+The analysis design is inspired by `Polonius, the Rust borrow checker <https://github.com/rust-lang/polonius>`_,
+but adapted to C++ idioms and constraints, such as the lack of exclusivity enforcement (alias-xor-mutability).
+Further details on the analysis method can be found in the `RFC on Discourse <https://discourse.llvm.org/t/rfc-intra-procedural-lifetime-analysis-in-clang/86291/>`_.
+
+This is a compile-time analysis; there is no run-time overhead.
+It tracks pointer validity through intra-procedural data-flow analysis. While it does
+not require lifetime annotations to get started, in their absence, the analysis
+treats function calls optimistically, assuming no lifetime effects, thereby potentially missing dangling pointer issues. As more functions are annotated
+with attributes like `clang::lifetimebound <https://clang.llvm.org/docs/AttributeReference.html#lifetimebound>`_, `gsl::Owner <https://clang.llvm.org/docs/AttributeReference.html#gsl-owner>`_, and
+`gsl::Pointer <https://clang.llvm.org/docs/AttributeReference.html#gsl-pointer>`_, the analysis can see through these lifetime contracts and enforce
+lifetime safety at call sites with higher accuracy. This approach supports
+gradual adoption in existing codebases.
+
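+For example, a call to a function without lifetime annotations is assumed to
+have no lifetime effects, so a dangling result can go undiagnosed
+(``make_view`` below is a hypothetical, unannotated function used only for
+illustration):
+
+.. code-block:: c++
+
+ #include <string>
+ #include <string_view>
+
+ // Not annotated with [[clang::lifetimebound]]: the analysis optimistically
+ // assumes the result does not depend on the argument's lifetime.
+ std::string_view make_view(const std::string &s);
+
+ std::string_view optimistic() {
+   std::string local = "temporary";
+   return make_view(local); // may dangle, but not diagnosed without annotations
+ }
+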
+.. note::
+ This analysis is designed for bug finding, not verification. It may miss some
+ lifetime issues and can produce false positives. It does not guarantee the
+ absence of all lifetime bugs.
+
+Getting Started
+----------------
+
+.. code-block:: c++
+
+ #include <iostream>
+ #include <string>
+ #include <string_view>
+
+ void simple_dangle() {
+ std::string_view v;
+ {
+ std::string s = "hello";
+ v = s; // warning: object whose reference is captured does not live long enough
+ } // note: destroyed here
+ std::cout << v; // note: later used here
+ }
+
+This example demonstrates
+a basic use-after-scope bug. The ``std::string_view`` object ``v`` holds a
+reference to ``s``, a ``std::string``. The lifetime of ``s`` ends at the end of
+the inner block, causing ``v`` to become a dangling reference.
+The analysis flags the assignment ``v = s`` as defective because ``s`` is
+destroyed while ``v`` is still alive and points to ``s``, and adds a note
+pointing to where ``v`` is used after ``s`` has been destroyed.
+
+Running The Analysis
+--------------------
+
+To run the analysis, compile with the ``-Wlifetime-safety-permissive`` flag, e.g.
+
+.. code-block:: bash
+
+ clang -c -Wlifetime-safety-permissive example.cpp
+
+This flag enables a core set of lifetime safety checks. For more fine-grained
+control over warnings, see :ref:`warning_flags`.
+
+Lifetime Annotations
+====================
+
+While lifetime analysis can detect many issues without annotations, its
+precision increases significantly when types and functions are annotated with
+lifetime contracts. These annotations clarify ownership semantics and lifetime
+dependencies, enabling the analysis to reason more accurately about pointer
+validity across function calls.
+
+Owner and Pointer Types
+-----------------------
+
+Lifetime analysis distinguishes between types that own the data they point to
+(Owners) and types that are non-owning views or references to data owned by
+others (Pointers). This distinction is made using GSL-style attributes:
+
+* ``[[gsl::Owner]]``: For types that manage the lifetime of a resource,
+ like ``std::string``, ``std::vector``, ``std::unique_ptr``.
+* ``[[gsl::Pointer]]``: For non-owning types that borrow resources,
+ like ``std::string_view``, or raw pointers (which are
+ implicitly treated as pointers).
+
+Many common STL types, such as ``std::string_view`` and container iterators,
+are automatically recognized as Pointers or Owners. You can annotate your own
+types using these attributes:
+
+.. code-block:: c++
+
+ #include <string>
+ #include <string_view>
+
+ // Owner type
+ struct [[gsl::Owner]] MyObj {
+ std::string Data = "Hello";
+ };
+
+ // View type
+ struct [[gsl::Pointer]] View {
+ std::string_view SV;
+ View() = default;
+ View(const MyObj& O) : SV(O.Data) {}
+ void use() const {}
+ };
+
+ void test() {
+ View v;
+ {
+ MyObj o;
+ v = o; // warning: object whose reference is captured does not live long enough
+ } // note: destroyed here
+ v.use(); // note: later used here
+ }
+
+Without these annotations, the analysis may not be able to determine whether a
+type is owning or borrowing, which can affect analysis precision. For more
+details on these attributes, see the Clang attribute reference for
+`gsl::Owner <https://clang.llvm.org/docs/AttributeReference.html#gsl-owner>`_ and
+`gsl::Pointer <https://clang.llvm.org/docs/AttributeReference.html#gsl-pointer>`_.
+
+.. note::
+ Types with mixed ownership semantics (owning some data while holding views to
+ other data) or types with multiple view fields with different lifetimes should
+ not be annotated. The analysis does not yet support expressing such nuanced
+ lifetime relationships.
+ Future enhancements, such as named lifetimes, may provide better support for
+ these patterns.
+
+LifetimeBound
+-------------
+
+The ``[[clang::lifetimebound]]`` attribute can be applied to function parameters
+or to the implicit ``this`` parameter of a method (by placing it after the
+method declarator). It indicates that the returned pointer or reference becomes
+invalid when the attributed parameter or ``this`` object is destroyed.
+This is crucial for functions that return views or references to their
+arguments.
+
+.. code-block:: c++
+
+ #include <string>
+ #include <string_view>
+
+ struct MyOwner {
+ std::string s;
+ std::string_view getView() const [[clang::lifetimebound]] { return s; }
+ };
+
+ void test_lifetimebound() {
+ std::string_view sv;
+ sv = MyOwner().getView(); // getView() is called on a temporary MyOwner
+ // warning: object whose reference is captured does not live long enough
+ // note: destroyed here
+ (void)sv; // note: later used here
+ }
+
+Without ``[[clang::lifetimebound]]`` on ``getView()``, the analysis would not
+know that the value returned by ``getView()`` depends on the temporary
+``MyOwner`` object, and it would not be able to diagnose the dangling ``sv``.
+
+For more details, see `lifetimebound <https://clang.llvm.org/docs/AttributeReference.html#lifetimebound>`_.
+
+NoEscape
+--------
+
+The ``[[clang::noescape]]`` attribute can be applied to function parameters of
+pointer or reference type. It indicates that the function will not allow the
+parameter to escape its scope, for example, by returning it or assigning it to
+a field or global variable. This is useful for parameters passed to callbacks
+or visitors that are only used during the call and not stored.
+
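+As a rough sketch (``sum_range`` and ``register_buffer`` are hypothetical
+functions, not part of any library), the attribute is placed on the parameter
+that the function promises not to retain beyond the call:
+
+.. code-block:: c++
+
+  // 'p' is only read during the call and is never stored.
+  int sum_range(const int *p [[clang::noescape]], int n);
+
+  // Without the annotation, the analysis must assume 'q' may be stored
+  // somewhere and escape the call.
+  void register_buffer(int *q);
+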
+For more details, see `noescape <https://clang.llvm.org/docs/AttributeReference.html#noescape>`_.
+
+Checks Performed
+================
+
+
+.. raw:: html
+
+ <style>
+ /* Align text to left and add red/green colors */
+ table.colored-code-table td, table.colored-code-table th { text-align: left !important; }
+ table.colored-code-table td:first-child, table.colored-code-table th:first-child { background-color: #ffeaea !important; }
+ table.colored-code-table td:nth-child(2), table.colored-code-table th:nth-child(2) { background-color: #eafaea !important; }
+ table.colored-code-table td .highlight, table.colored-code-table td pre { background-color: transparent !important; border: none !important; }
+
+ div.bad-code { background-color: #ffeaea !important; padding: 5px; border-left: 4px solid #ff6b6b; text-align: left !important; }
+ div.bad-code .highlight, div.bad-code pre { background-color: transparent !important; border: none !important; }
+
+ div.good-code { background-color: #eafaea !important; padding: 5px; border-left: 4px solid #51cf66; text-align: left !important; }
+ div.good-code .highlight, div.good-code pre { background-color: transparent !important; border: none !important; }
+ </style>
+
+Use after scope
+---------------
+
+This check warns when a pointer or reference is used after the stack variable
+it refers to has gone out of scope.
+
+.. list-table::
+ :widths: 50 50
+ :header-rows: 1
+ :class: colored-code-table
+
+ * - Use after scope
+ - Correct
+ * -
+ .. code-block:: c++
+
+ void foo() {
+ int* p;
+ {
+ int i = 0;
+ p = &i; // warning: 'p' does not live long enough
+ } // note: destroyed here
+ (void)*p; // note: later used here
+ }
+ -
+ .. code-block:: c++
+
+ void foo() {
+ int i = 0;
+ int* p;
+ {
+ p = &i; // OK!
+ }
+ (void)*p;
+ }
+
+Return of stack address
+-----------------------
+
+This check warns when a function returns a pointer or reference to a
+stack-allocated variable, which will be destroyed when the function returns,
+leaving the caller with a dangling pointer.
+
+.. list-table::
+ :widths: 50 50
+ :header-rows: 1
+ :class: colored-code-table
+
+ * - Return of stack address
+ - Correct
+ * -
+ .. code-block:: c++
+
+ #include <string>
+ #include <string_view>
+
+ std::string_view bar() {
+ std::string s = "on stack";
+ std::string_view result = s;
+ // warning: address of stack variable 's' is returned later
+ return result; // note: returned here
+ }
+ -
+ .. code-block:: c++
+
+ #include <string>
+ #include <string_view>
+
+ std::string bar() {
+ std::string s = "on stack";
+ std::string result = s;
+ return result; // OK!
+ }
+
+
+Dangling field
+--------------
+
+This check warns when a constructor or method assigns a pointer to a
+stack-allocated variable or temporary to a field of the class.
+
+.. list-table::
+ :widths: 50 50
+ :header-rows: 1
+ :class: colored-code-table
+
+
+ * - Dangling field
+ - Correct
+ * -
+ .. code-block:: c++
+
+ #include <string>
+ #include <string_view>
+
+ // Constructor finishes, leaving 'field' dangling.
+ struct DanglingField {
+ std::string_view field; // note: this field dangles
+ DanglingField(std::string s) {
+ field = s; // warning: stack variable 's' escapes to a field
+ }
+ };
+ -
+ .. code-block:: c++
+
+ // Make the field an owner.
+ struct DanglingField {
+ std::string field;
+ DanglingField(std::string s) {
+ field = s;
+ }
+ };
+ // Or take a string_view parameter.
+ struct DanglingField {
+ std::string_view field;
+ DanglingField(std::string_view s [[clang::lifetimebound]]) {
+ field = s;
+ }
+ };
+
+
+Use after invalidation (experimental)
+-------------------------------------
+
+This check warns when a reference to a container element (such as an iterator,
+pointer or reference) is used after a container operation that may have
+invalidated it. For example, adding elements to ``std::vector`` may cause
+reallocation, invalidating all existing iterators, pointers and references to
+its elements.
+
+.. note::
+ Container invalidation checking is highly experimental and may produce false
+ positives.
+
+.. list-table::
+ :widths: 50 50
+ :header-rows: 1
+ :class: colored-code-table
+
+
+ * - Use after invalidation (experimental)
+ - Correct
+ * -
+ .. code-block:: c++
+
+ #include <vector>
+
+ void baz(std::vector<int>& v) {
+ int* p = &v[0]; // warning: 'v' is later invalidated
+ v.push_back(4); // note: invalidated here
+ *p = 10; // note: later used here
+ }
+ -
+ .. code-block:: c++
+
+ #include <vector>
+
+ void baz(std::vector<int>& v) {
+ v.push_back(4);
+ int* p = &v[0]; // OK!
+ *p = 10;
+ }
+
+
+Annotation Inference and Suggestions
+====================================
+
+In addition to detecting lifetime violations, the analysis can suggest adding
+``[[clang::lifetimebound]]`` to function parameters or methods when it detects
+that a pointer/reference to a parameter or ``this`` escapes via the return
+value. This helps improve API contracts and allows the analysis to perform
+more accurate checks in calling code.
+
+To enable annotation suggestions, use ``-Wlifetime-safety-suggestions``.
+
+.. code-block:: c++
+
+ #include <string_view>
+
+ // The analysis will suggest adding [[clang::lifetimebound]] to 'a'.
+ std::string_view return_view(std::string_view a) {
+ // ^^^^^^^^^^^^^^^^^^
+ // warning: parameter 'a' should be marked [[clang::lifetimebound]]
+ return a; // note: param returned here
+ }
+
+Translation-Unit-Wide Analysis and Inference
+--------------------------------------------
+
+By default, lifetime analysis is intra-procedural for error checking.
+However, for annotation inference to be effective, lifetime information needs
+to propagate across function calls. You can enable experimental
+translation-unit-wide analysis using:
+
+* ``-flifetime-safety-inference``: Enables inference of ``lifetimebound``
+ attributes across functions in a TU.
+* ``-fexperimental-lifetime-safety-tu-analysis``: Enables TU-wide analysis
+ for better inference results.
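+
+As a rough illustration (``first_word`` is a hypothetical helper; the exact
+diagnostics depend on what the inference derives), TU-wide analysis lets
+information learned from one function improve checking in its callers:
+
+.. code-block:: c++
+
+  #include <string>
+  #include <string_view>
+
+  // Not annotated, but the returned view is derived from 'a'. TU-wide
+  // inference can discover this relationship, as if the parameter were
+  // marked [[clang::lifetimebound]].
+  std::string_view first_word(std::string_view a) {
+    return a.substr(0, a.find(' '));
+  }
+
+  void caller() {
+    // The temporary std::string dies at the end of this statement, so the
+    // view returned by first_word() dangles; with inference enabled, the
+    // analysis can connect the two and diagnose the later use.
+    std::string_view sv = first_word(std::string("hello world"));
+    (void)sv;
+  }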
+
+.. _warning_flags:
+
+Warning flags
+=============
+
+Lifetime safety warnings are organized into hierarchical groups, allowing users to
+enable categories of checks incrementally. For example, ``-Wlifetime-safety``
+enables all dangling pointer checks, while ``-Wlifetime-safety-permissive``
+enables only the high-confidence subset of these checks.
+
+* ``-Wlifetime-safety-all``: Enables all lifetime safety warnings, including
+ dangling pointer checks, annotation suggestions, and annotation validations.
+
+* ``-Wlifetime-safety``: Enables dangling pointer checks from both the ``permissive`` and ``strict`` groups listed below.
+
+ * ``-Wlifetime-safety-permissive``: Enables high-confidence checks for dangling pointers. **Recommended for initial adoption.**
+
+ * ``-Wlifetime-safety-use-after-scope``: Warns when a pointer to a stack variable is used after the variable's lifetime has ended.
+ * ``-Wlifetime-safety-return-stack-addr``: Warns when a function returns a pointer or reference to one of its local stack variables.
+ * ``-Wlifetime-safety-dangling-field``: Warns when a class field is assigned a pointer to a temporary or stack variable whose lifetime is shorter than the class instance.
+
+ * ``-Wlifetime-safety-strict``: Enables stricter and experimental checks. These may produce false positives in code that uses move semantics heavily, as the analysis might conservatively assume a use-after-free even if ownership was transferred.
+
+ * ``-Wlifetime-safety-use-after-scope-moved``: Same as ``-Wlifetime-safety-use-after-scope`` but for cases where the variable may have been moved from before its destruction.
+ * ``-Wlifetime-safety-return-stack-addr-moved``: Same as ``-Wlifetime-safety-return-stack-addr`` but for cases where the variable may have been moved from.
+ * ``-Wlifetime-safety-dangling-field-moved``: Same as ``-Wlifetime-safety-dangling-field`` but for cases where the variable may have been moved from.
+ * ``-Wlifetime-safety-invalidation``: Warns when a container iterator or reference to an element is used after an operation that may invalidate it (Experimental).
+
+* ``-Wlifetime-safety-suggestions``: Enables suggestions to add ``[[clang::lifetimebound]]`` to function parameters and ``this`` parameters.
+
+ * ``-Wlifetime-safety-intra-tu-suggestions``: Suggestions for functions local to the translation unit.
+ * ``-Wlifetime-safety-cross-tu-suggestions``: Suggestions for functions visible across translation units (e.g., in headers).
+
+* ``-Wlifetime-safety-validations``: Enables checks that validate existing lifetime annotations.
+
+ * ``-Wlifetime-safety-noescape``: Warns when a parameter marked with ``[[clang::noescape]]`` escapes the function.
+
+Limitations
+===========
+
+Move Semantics
+--------------
+The analysis does not currently track ownership transfers through move operations.
+Instead, it uses scope-based lifetime tracking: when an owner goes out of scope,
+the analysis assumes the resource is destroyed, even if ownership was transferred
+via ``std::move()`` or ``std::unique_ptr::release()``.
+
+This means that if a pointer or view is created from an owner, and that owner is
+later moved-from and goes out of scope, the analysis will issue a
+``-Wlifetime-safety-*-moved`` warning. This warning indicates that the pointer
+may be dangling, even though the resource may still be alive under a new owner.
+These are often false positives when ownership has been safely transferred.
+
+To avoid these warnings and ensure correctness, follow the
+**"move-first-then-alias"** pattern: create views or raw pointers *after* the
+ownership transfer, sourcing them from the new owner rather than the original
+owner that will go out of scope.
+
+For example:
+
+.. list-table::
+ :widths: 50 50
+ :header-rows: 1
+ :align: left
+ :class: colored-code-table
+
+ * - Anti-Pattern: Aliasing Before Move
+ - Good Practice: Move-First-Then-Alias
+ * -
+ .. code-block:: c++
+
+ #include <memory>
+
+ void use(int*);
+
+ void bar() {
+ std::unique_ptr<int> b;
+ int* p;
+ {
+ auto a = std::make_unique<int>(42);
+ p = a.get(); // warning!
+ b = std::move(a);
+ }
+ use(p);
+ }
+ -
+ .. code-block:: c++
+
+ #include <memory>
+
+ void use(int*);
+
+ void bar() {
+ std::unique_ptr<int> b;
+ int* p;
+ {
+ auto a = std::make_unique<int>(42);
+ b = std::move(a);
+ p = b.get(); // OK!
+ }
+ use(p);
+ }
+
+The same principle applies when moving ownership using ``std::unique_ptr::release()``:
+
+.. code-block:: c++
+ :class: bad-code
+
+ #include <memory>
+ #include <utility>
+
+ void use(int*);
+ void take_ownership(int*);
+
+ void test_aliasing_before_release() {
+ int* p;
+ {
+ auto u = std::make_unique<int>(1);
+ p = u.get();
+ // ^ warning: 'u' does not live long enough!
+ take_ownership(u.release());
+ }
+ use(p);
+ }
+
+``std::unique_ptr`` with custom deleters
+----------------------------------------
+The analysis assumes standard ownership semantics for owner types like
+``std::unique_ptr``: when a ``unique_ptr`` goes out of scope, it is assumed
+that the owned object is destroyed and its memory is deallocated.
+However, ``std::unique_ptr`` can be used with a custom deleter that modifies
+this behavior. For example, a custom deleter might keep the memory alive
+by transferring it to a memory pool, or simply do nothing, allowing
+another system to manage the lifetime.
+
+Because the analysis relies on scope-based lifetime for owners, it does not
+support custom deleters that extend the lifetime of the owned object beyond
+the lifetime of the ``std::unique_ptr``. In such cases, the analysis will
+assume the object is destroyed when the ``std::unique_ptr`` goes out of scope,
+leading to false positive warnings if pointers to the object are used afterward.
+
+.. code-block:: c++
+
+ #include <memory>
+
+ void use(int*);
+
+ struct NoOpDeleter {
+ void operator()(int* p) const {
+ // Do not delete p, memory is managed elsewhere.
+ }
+ };
+
+ void test_custom_deleter() {
+ int* p;
+ {
+ std::unique_ptr<int, NoOpDeleter> u(new int(42));
+ p = u.get(); // warning: object whose reference is captured does not live long enough
+ } // note: destroyed here
+ // With NoOpDeleter, p would still be valid here, but the analysis
+ // assumes standard unique_ptr semantics and treats the memory as freed.
+ use(p); // note: later used here
+ }
+
+Dangling Fields
+---------------
+The lifetime analysis is intra-procedural. It analyzes one function or method at
+a time.
+This means if a field is assigned a pointer to a local variable or temporary
+inside a constructor or method, and that local's lifetime ends before the method
+returns, the analysis will issue a ``-Wlifetime-safety-dangling-field`` warning.
+It must do so even if no *other* method of the class ever accesses this field,
+because it cannot see how other methods are implemented or used.
+
+.. code-block:: c++
+
+ #include <string>
+ #include <string_view>
+
+ struct MyWidget {
+ std::string_view name_; // note: this field dangles
+ MyWidget(std::string name) : name_(name) {} // warning: address of stack memory escapes to a field
+ const char* data() { return name_.data(); } // Potential use-after-free if called
+ };
+
+In this case, ``name_`` dangles after the constructor finishes.
+Even if ``data()`` is never called, the analysis flags the dangling assignment
+in the constructor because it represents a latent bug.
+The recommended approach is to ensure fields only point to objects that outlive
+the field itself, for example by storing an owned object (e.g., ``std::string``)
+or ensuring the borrowed object (e.g., one passed by ``const&``) has a
+sufficient lifetime.
+
+
+Performance
+===========
+
+Lifetime analysis relies on Clang's CFG (Control Flow Graph). For functions
+with very large or complex CFGs, analysis time can sometimes be significant. To
+mitigate this, the analysis skips functions whose number of CFG blocks exceeds a
+threshold, controlled by the ``-flifetime-safety-max-cfg-blocks=N`` language
+option.
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index bacf2761ca816..7ed56ce6ae6a6 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -286,6 +286,9 @@ Improvements to Clang's diagnostics
when parsing alias declarations involving a token-split ``>>`` sequence
(for example, ``using A = X<int>>;``). (#GH184425)
+- Fixed an incorrect ``implicitly deleted`` diagnostic for an explicitly deleted
+ candidate function. (#GH185693)
+
- The ``-Wloop-analysis`` warning has been extended to catch more cases of
variable modification inside lambda expressions (#GH132038).
@@ -327,6 +330,7 @@ Bug Fixes in This Version
- Fixed a bug with multiple-include optimization (MIOpt) state not being preserved in some cases during lexing, which could suppress header-guard mismatch diagnostics and interfere with include-guard optimization. (#GH180155)
- Fixed a crash when normalizing constraints involving concept template parameters whose index coincided with non-concept template parameters in the same parameter mapping.
- Fixed a crash caused by accessing dependent diagnostics of a non-dependent context.
+- Fixed a crash when substituting into a non-type template parameter that has a type containing an undeduced placeholder type.
Bug Fixes to Compiler Builtins
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -345,6 +349,7 @@ Bug Fixes to C++ Support
template parameters when one of its parameters is also a pack. (#GH181166)
- Fixed a crash when a default argument is passed to an explicit object parameter. (#GH176639)
- Fixed a crash when diagnosing an invalid static member function with an explicit object parameter (#GH177741)
+- Fixed a crash when instantiating an invalid out-of-line static data member definition in a local class. (#GH176152)
- Fixed a crash when pack expansions are used as arguments for non-pack parameters of built-in templates. (#GH180307)
- Fixed a bug where captured variables in non-mutable lambdas were incorrectly treated as mutable
when used inside decltype in the return type. (#GH180460)
@@ -352,6 +357,7 @@ Bug Fixes to C++ Support
- Fixed a crash when `explicit(bool)` is used with an incomplete enumeration. (#GH183887)
- Fixed a crash on ``typeid`` of incomplete local types during template instantiation. (#GH63242), (#GH176397)
- Fixed a crash when an immediate-invoked ``consteval`` lambda is used as an invalid initializer. (#GH185270)
+- Fixed an assertion failure when using a global destructor with a target with a non-default program address space. (#GH186484)
- Inherited constructors in ``dllexport`` classes are now exported for ABI-compatible cases, matching
MSVC behavior. Constructors with variadic arguments or callee-cleanup parameters are not yet supported
@@ -481,6 +487,7 @@ clang-format
constructor initializers after commas, keeping the colon on the same line.
- Extend ``BreakBinaryOperations`` to accept a structured configuration with
per-operator break rules and minimum chain length gating via ``PerOperator``.
+- Add ``AllowShortRecordOnASingleLine`` option and set it to ``EmptyAndAttached`` for LLVM style.
libclang
--------
diff --git a/clang/docs/SanitizerCoverage.rst b/clang/docs/SanitizerCoverage.rst
index 4ab2d09366f4f..c01863adebb2d 100644
--- a/clang/docs/SanitizerCoverage.rst
+++ b/clang/docs/SanitizerCoverage.rst
@@ -201,6 +201,22 @@ With ``-fsanitize-coverage=trace-pc`` the compiler will insert
``__sanitizer_cov_trace_pc()`` on every edge.
With an additional ``...=trace-pc,indirect-calls`` flag
``__sanitizer_cov_trace_pc_indirect(void *callee)`` will be inserted on every indirect call.
+
+With ``-fsanitize-coverage=trace-pc-entry-exit`` the compiler will insert
+``__sanitizer_cov_trace_pc_entry()`` on function entry, and insert
+``__sanitizer_cov_trace_pc_exit()`` on function return;
+``-fsanitize-coverage=trace-pc`` or ``-fsanitize-coverage=trace-pc-guard`` must
+still be passed to instrument all basic blocks.
+
+With the combination ``-fsanitize-coverage=trace-pc-entry-exit,trace-pc``,
+``__sanitizer_cov_trace_pc()`` will be omitted in the entry basic block because
+the block is already covered by ``__sanitizer_cov_trace_pc_entry()``, which can
+be used to both record that the function has been entered and record coverage of
+the entry basic block.
+However, with ``-fsanitize-coverage=trace-pc-entry-exit,trace-pc-guard``, both
+callbacks are called for the entry block because
+``__sanitizer_cov_trace_pc_entry()`` does not provide a `guard_variable`.
+
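+For example, a user of ``trace-pc-entry-exit`` might provide the two callbacks
+like this (a minimal sketch assuming the zero-argument signatures shown above;
+a real implementation would record coverage data somewhere):
+
+.. code-block:: c++
+
+  extern "C" void __sanitizer_cov_trace_pc_entry() {
+    // Called on every function entry.
+  }
+
+  extern "C" void __sanitizer_cov_trace_pc_exit() {
+    // Called on every function return.
+  }
+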
These callbacks are not implemented in the Sanitizer run-time and should be defined
by the user.
This mechanism is used for fuzzing the Linux kernel
diff --git a/clang/docs/ScalableStaticAnalysisFramework/user-docs/SummaryExtraction.rst b/clang/docs/ScalableStaticAnalysisFramework/user-docs/SummaryExtraction.rst
index f7472bfd8643a..96892cfa5fdea 100644
--- a/clang/docs/ScalableStaticAnalysisFramework/user-docs/SummaryExtraction.rst
+++ b/clang/docs/ScalableStaticAnalysisFramework/user-docs/SummaryExtraction.rst
@@ -13,6 +13,8 @@ Two flags control summary extraction:
- ``--ssaf-extract-summaries=<name1>,<name2>,...``: Comma-separated list of summary extractor names to enable.
- ``--ssaf-tu-summary-file=<path>.<format>``: Output file for the extracted summaries. The file extension selects the serialization format (e.g. ``.json``).
+- ``--ssaf-list-extractors``: List the available summary extractors.
+- ``--ssaf-list-formats``: List the available serialization formats.
Example invocation:
@@ -22,6 +24,8 @@ Example invocation:
--ssaf-tu-summary-file=my-tu-summary.json \
-c input.cpp -o input.o
+ clang --ssaf-list-extractors --ssaf-list-formats
+
Diagnostics
***********
diff --git a/clang/docs/index.rst b/clang/docs/index.rst
index c2974a4b2f9ea..89ca6d73d9d8d 100644
--- a/clang/docs/index.rst
+++ b/clang/docs/index.rst
@@ -27,6 +27,7 @@ Using Clang as a Compiler
CrossCompilation
ClangStaticAnalyzer
ThreadSafetyAnalysis
+ LifetimeSafety
SafeBuffers
ScalableStaticAnalysisFramework/index
DataFlowAnalysisIntro
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 05302c30d18d1..8a3d202871cf8 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -389,8 +389,9 @@ class ASTContext : public RefCountedBase<ASTContext> {
mutable llvm::DenseMap<const CXXDestructorDecl *, FunctionDecl *>
GlobalArrayOperatorDeletesForVirtualDtor;
- /// To remember which types did require a vector deleting dtor.
- llvm::DenseSet<const CXXRecordDecl *> RequireVectorDeletingDtor;
+ /// To remember the types for which we have seen a new[] call; these
+ /// potentially require a vector deleting dtor.
+ llvm::DenseSet<const CXXRecordDecl *> MaybeRequireVectorDeletingDtor;
/// The next string literal "version" to allocate during constant evaluation.
/// This is used to distinguish between repeated evaluations of the same
@@ -3561,8 +3562,8 @@ class ASTContext : public RefCountedBase<ASTContext> {
OperatorDeleteKind K) const;
bool dtorHasOperatorDelete(const CXXDestructorDecl *Dtor,
OperatorDeleteKind K) const;
- void setClassNeedsVectorDeletingDestructor(const CXXRecordDecl *RD);
- bool classNeedsVectorDeletingDestructor(const CXXRecordDecl *RD);
+ void setClassMaybeNeedsVectorDeletingDestructor(const CXXRecordDecl *RD);
+ bool classMaybeNeedsVectorDeletingDestructor(const CXXRecordDecl *RD);
/// Retrieve the context for computing mangling numbers in the given
/// DeclContext.
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 8ab4aaa6f5781..61194e3c2c940 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -4715,17 +4715,15 @@ def OMPDeclareTargetDecl : InheritableAttr {
let SemaHandler = 0;
let Subjects = SubjectList<[Function, SharedVar]>;
let Documentation = [OMPDeclareTargetDocs];
- let Args = [
- EnumArgument<"MapType", "MapTypeTy", /*is_string=*/false,
- [ "to", "enter", "link" ],
- [ "MT_To", "MT_Enter", "MT_Link" ]>,
- EnumArgument<"DevType", "DevTypeTy", /*is_string=*/false,
- [ "host", "nohost", "any" ],
- [ "DT_Host", "DT_NoHost", "DT_Any" ]>,
- ExprArgument<"IndirectExpr">,
- BoolArgument<"Indirect">,
- UnsignedArgument<"Level">
- ];
+ let Args = [EnumArgument<
+ "MapType", "MapTypeTy",
+ /*is_string=*/false, ["to", "enter", "link", "local"],
+ ["MT_To", "MT_Enter", "MT_Link", "MT_Local"]>,
+ EnumArgument<"DevType", "DevTypeTy",
+ /*is_string=*/false, ["host", "nohost", "any"],
+ ["DT_Host", "DT_NoHost", "DT_Any"]>,
+ ExprArgument<"IndirectExpr">, BoolArgument<"Indirect">,
+ UnsignedArgument<"Level">];
let AdditionalMembers = [{
void printPrettyPragma(raw_ostream &OS, const PrintingPolicy &Policy) const;
static std::optional<MapTypeTy>
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 718df8c7154a2..815ed11ee3b22 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -2686,8 +2686,10 @@ path of execution, but that can be confusing:
.. code-block:: c++
if (b) {
- [[unlikely]] --b; // In the path of execution,
- // this branch is considered unlikely.
+ [[unlikely]] --b; // Per the standard this is in the path of
+ // execution, so this branch should be considered
+ // unlikely. However, Clang ignores the attribute
+ // here since it is not on the substatement.
}
if (b) {
@@ -6011,6 +6013,25 @@ TUs where it is used.
This attribute can be used on static and non-static member functions of class
templates, static data members of class templates and member classes of class
templates.
+
+**Interaction with __declspec(dllexport/dllimport)**
+
+For a DLL platform (i.e., Windows), this attribute also means "this member will
+never be exported or imported". Despite its name, this behavior also applies to
+implicit instantiations and non-template entities.
+
+ .. code-block:: c++
+
+ // in <exception>
+ class __declspec(dllimport) nested_exception {
+ ...
+ public:
+ __attribute__((exclude_from_explicit_instantiation))
+ exception_ptr nested_ptr() const noexcept { ... }
+ };
+
+In this case, the compiler will never attempt to import
+``nested_exception::nested_ptr``.
}];
}
diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 857fe35e12b2b..d883552ea2198 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -308,6 +308,8 @@ CODEGENOPT(SanitizeCoverage8bitCounters, 1, 0, Benign) ///< Use 8-bit frequency
///< in sanitizer coverage.
CODEGENOPT(SanitizeCoverageTracePC, 1, 0, Benign) ///< Enable PC tracing
///< in sanitizer coverage.
+CODEGENOPT(SanitizeCoverageTracePCEntryExit, 1, 0, Benign) ///< Trace function entry/exit
+ ///< in sanitizer coverage.
CODEGENOPT(SanitizeCoverageTracePCGuard, 1, 0, Benign) ///< Enable PC tracing with guard
///< in sanitizer coverage.
CODEGENOPT(SanitizeCoverageInline8bitCounters, 1, 0, Benign) ///< Use inline 8bit counters.
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index 5c62bb70ebd0f..00db1e7ee5afa 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -379,6 +379,29 @@ def warn_profile_data_misexpect : Warning<
BackendInfo, InGroup<MisExpect>;
} // end of instrumentation issue category
+def warn_ssaf_extract_tu_summary_file_unknown_output_format :
+ Warning<"unknown output summary file format '%0' "
+ "specified by '--ssaf-tu-summary-file=%1'">,
+ InGroup<ScalableStaticAnalysisFramework>, DefaultError;
+
+def warn_ssaf_extract_tu_summary_file_unknown_format :
+ Warning<"failed to parse the value of '--ssaf-tu-summary-file=%0' "
+ "the value must follow the '<path>.<format>' pattern">,
+ InGroup<ScalableStaticAnalysisFramework>, DefaultError;
+
+def warn_ssaf_must_enable_summary_extractors :
+ Warning<"must enable some summary extractors using the "
+ "'--ssaf-extract-summaries=' option">,
+ InGroup<ScalableStaticAnalysisFramework>, DefaultError;
+
+def warn_ssaf_extract_summary_unknown_extractor_name :
+ Warning<"no summary extractor%s0 %plural{1:was|:were}0 registered with name: %1">,
+ InGroup<ScalableStaticAnalysisFramework>, DefaultError;
+
+def warn_ssaf_write_tu_summary_failed :
+ Warning<"failed to write TU summary to '%0': %1">,
+ InGroup<ScalableStaticAnalysisFramework>, DefaultError;
+
def err_extract_api_ignores_file_not_found :
Error<"file '%0' specified by '--extract-api-ignores=' not found">, DefaultFatal;
diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td
index 5d39f12d5c00f..e440c9d2fb982 100644
--- a/clang/include/clang/Basic/DiagnosticGroups.td
+++ b/clang/include/clang/Basic/DiagnosticGroups.td
@@ -1907,6 +1907,9 @@ def BitIntExtension : DiagGroup<"bit-int-extension">;
// Warnings about misuse of ExtractAPI options.
def ExtractAPIMisuse : DiagGroup<"extractapi-misuse">;
+// Warnings related to the "Scalable Static Analysis Framework" - SSAF.
+def ScalableStaticAnalysisFramework : DiagGroup<"scalable-static-analysis-framework">;
+
// Warnings about using the non-standard extension having an explicit specialization
// with a storage class specifier.
def ExplicitSpecializationStorageClass : DiagGroup<"explicit-specialization-storage-class">;
diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td
index de10dbe5d0628..19edc7f7a3b23 100644
--- a/clang/include/clang/Basic/DiagnosticParseKinds.td
+++ b/clang/include/clang/Basic/DiagnosticParseKinds.td
@@ -1558,18 +1558,21 @@ def note_omp_assumption_clause_continue_here
: Note<"the ignored tokens spans until here">;
def err_omp_declare_target_unexpected_clause: Error<
"unexpected '%0' clause, only %select{'device_type'|'to' or 'link'|'to', 'link' or 'device_type'|'device_type', 'indirect'|'to', 'link', 'device_type' or 'indirect'}1 clauses expected">;
-def err_omp_declare_target_unexpected_clause_52: Error<
- "unexpected '%0' clause, only %select{'device_type'|'enter' or 'link'|'enter', 'link' or 'device_type'|'device_type', 'indirect'|'enter', 'link', 'device_type' or 'indirect'}1 clauses expected">;
+def err_omp_declare_target_unexpected_clause_52
+ : Error<"unexpected '%0' clause, only %select{'device_type'|'enter' or "
+ "'link'|'enter', 'link' or 'device_type'|'device_type', "
+ "'indirect'|'enter', 'link', 'device_type' or 'indirect'|'enter', "
+ "'link', 'device_type', 'indirect' or 'local'}1 clauses expected">;
def err_omp_begin_declare_target_unexpected_implicit_to_clause: Error<
"unexpected '(', only 'to', 'link' or 'device_type' clauses expected for 'begin declare target' directive">;
def err_omp_declare_target_wrong_clause_after_implicit_to: Error<
"unexpected clause after an implicit 'to' clause">;
def err_omp_declare_target_wrong_clause_after_implicit_enter: Error<
"unexpected clause after an implicit 'enter' clause">;
-def err_omp_declare_target_missing_to_or_link_clause: Error<
- "expected at least one %select{'to' or 'link'|'to', 'link' or 'indirect'}0 clause">;
-def err_omp_declare_target_missing_enter_or_link_clause: Error<
- "expected at least one %select{'enter' or 'link'|'enter', 'link' or 'indirect'}0 clause">;
+def err_omp_declare_target_missing_required_clause
+ : Error<"expected at least one %select{'to' or 'link'|'to', 'link' or "
+ "'indirect'|'enter', 'link' or 'indirect'|'enter', 'link', "
+ "'indirect' or 'local'}0 clause">;
def err_omp_declare_target_unexpected_to_clause: Error<
"unexpected 'to' clause, use 'enter' instead">;
def err_omp_declare_target_unexpected_enter_clause: Error<
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index fae63cc0ba139..d4d09a8ecef36 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3799,12 +3799,6 @@ def warn_nothrow_attribute_ignored : Warning<"'nothrow' attribute conflicts with
def warn_dllattr_ignored_exclusion_takes_precedence : Warning<
"%0 attribute ignored; %1 takes precedence">,
InGroup<IgnoredAttributes>;
-def warn_attribute_ignored_on_non_member :
- Warning<"%0 attribute ignored on a non-member declaration">,
- InGroup<IgnoredAttributes>;
-def warn_attribute_ignored_in_non_template :
- Warning<"%0 attribute ignored in a non-template context">,
- InGroup<IgnoredAttributes>;
def warn_attribute_ignored_on_non_definition :
Warning<"%0 attribute ignored on a non-definition declaration">,
InGroup<IgnoredAttributes>;
@@ -12128,13 +12122,16 @@ def warn_omp_alignment_not_power_of_two : Warning<
InGroup<OpenMPClauses>;
def err_omp_invalid_target_decl : Error<
"%0 used in declare target directive is not a variable or a function name">;
-def err_omp_declare_target_to_and_link : Error<
- "%0 must not appear in both clauses 'to' and 'link'">;
+def err_omp_declare_target_var_in_both_clauses
+ : Error<"%0 must not appear in both clauses '%1' and '%2'">;
+def err_omp_declare_target_local_host_only
+ : Error<"'local' clause is incompatible with 'device_type(host)'; "
+ "local variables exist only on the device">;
def warn_omp_not_in_target_context : Warning<
"declaration is not declared in any declare target region">,
InGroup<OpenMPTarget>;
-def err_omp_function_in_link_clause : Error<
- "function name is not allowed in 'link' clause">;
+def err_omp_function_in_target_clause_list
+ : Error<"function name is not allowed in '%0' clause">;
def err_omp_aligned_expected_array_or_ptr : Error<
"argument of aligned clause should be array"
"%select{ or pointer|, pointer, reference to array or reference to pointer}1"
@@ -12550,6 +12547,11 @@ def err_omp_declare_target_has_local_vars : Error<
def warn_omp_declare_target_after_first_use : Warning<
"declaration marked as declare target after first use, it may lead to incorrect results">,
InGroup<OpenMPTarget>;
+def warn_omp_declare_target_local_not_implemented
+ : Warning<"'local' clause on 'declare_target' directive is not yet fully "
+ "implemented; "
+ "variable will be treated as 'enter'">,
+ InGroup<OpenMPTarget>;
def err_omp_declare_variant_incompat_attributes : Error<
"'#pragma omp declare variant' is not compatible with any target-specific attributes">;
def warn_omp_declare_variant_score_not_constant
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index e60288c40132f..8357e0dc73eb8 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -231,9 +231,12 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
return createLoad(loc, ptr, /*isVolatile=*/false, alignment);
}
+ mlir::Value createNot(mlir::Location loc, mlir::Value value) {
+ return cir::NotOp::create(*this, loc, value);
+ }
+
mlir::Value createNot(mlir::Value value) {
- return cir::UnaryOp::create(*this, value.getLoc(), value.getType(),
- cir::UnaryOpKind::Not, value);
+ return createNot(value.getLoc(), value);
}
/// Create a do-while operation.
@@ -272,9 +275,19 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
return cir::ContinueOp::create(*this, loc);
}
- mlir::Value createUnaryOp(mlir::Location loc, cir::UnaryOpKind kind,
- mlir::Value operand) {
- return cir::UnaryOp::create(*this, loc, kind, operand);
+ mlir::Value createInc(mlir::Location loc, mlir::Value input,
+ bool nsw = false) {
+ return cir::IncOp::create(*this, loc, input, nsw);
+ }
+
+ mlir::Value createDec(mlir::Location loc, mlir::Value input,
+ bool nsw = false) {
+ return cir::DecOp::create(*this, loc, input, nsw);
+ }
+
+ mlir::Value createMinus(mlir::Location loc, mlir::Value input,
+ bool nsw = false) {
+ return cir::MinusOp::create(*this, loc, input, nsw);
}
mlir::TypedAttr getConstPtrAttr(mlir::Type type, int64_t value) {
diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index a9b98b1f43b3f..d4f0b96cf215c 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -1702,50 +1702,132 @@ def CIR_LabelOp : CIR_Op<"label", [AlwaysSpeculatable]> {
}
//===----------------------------------------------------------------------===//
-// UnaryOp
+// Unary Operations
//===----------------------------------------------------------------------===//
-def CIR_UnaryOpKind : CIR_I32EnumAttr<"UnaryOpKind", "unary operation kind", [
- I32EnumAttrCase<"Inc", 0, "inc">,
- I32EnumAttrCase<"Dec", 1, "dec">,
- I32EnumAttrCase<"Plus", 2, "plus">,
- I32EnumAttrCase<"Minus", 3, "minus">,
- I32EnumAttrCase<"Not", 4, "not">
-]>;
+// Base class for all CIR unary operations.
+// `type` constrains the input/result type.
+class CIR_UnaryOp<string mnemonic, Type type, list<Trait> traits = []>
+ : CIR_Op<mnemonic, !listconcat([
+ Pure, SameOperandsAndResultType,
+ DeclareOpInterfaceMethods<CIR_UnaryOpInterface>
+ ], traits)>
+{
+ let arguments = (ins type:$input);
+
+ let results = (outs type:$result);
-def CIR_UnaryOp : CIR_Op<"unary", [Pure, SameOperandsAndResultType]> {
- let summary = "Unary operations";
+ let assemblyFormat = [{
+ $input `:` type($input) attr-dict
+ }];
+}
+
+// Base class for unary ops that support the nsw (no signed wrap) flag.
+class CIR_UnaryOpWithOverflowFlag<string mnemonic, Type type,
+ list<Trait> traits = []>
+ : CIR_UnaryOp<mnemonic, type, traits>
+{
+ let arguments = (ins type:$input, UnitProp:$no_signed_wrap);
+
+ let assemblyFormat = [{
+ (`nsw` $no_signed_wrap^)?
+ $input `:` type($input) attr-dict
+ }];
+}
+
+//===----------------------------------------------------------------------===//
+// IncOp
+//===----------------------------------------------------------------------===//
+
+def CIR_IncOp : CIR_UnaryOpWithOverflowFlag<"inc", CIR_AnyType> {
+ let summary = "Increment by one";
let description = [{
- `cir.unary` performs the unary operation according to
- the specified opcode kind: [inc, dec, plus, minus, not].
+ The `cir.inc` operation increments the operand by one. The operand and
+ result must have the same type.
- It requires one input operand and has one result, both types
- should be the same.
+ The optional `nsw` (no signed wrap) attribute indicates that the result
+ is poison if signed overflow occurs.
- If the `nsw` (no signed wrap) attribute is present, the result is poison if
- signed overflow occurs.
+ Example:
```mlir
- %7 = cir.unary(inc, %1) : i32 -> i32
- %8 = cir.unary(dec, %2) nsw : i32 -> i32
+ %1 = cir.inc %0 : !s32i
+ %3 = cir.inc nsw %2 : !s32i
```
}];
- let arguments = (ins
- Arg<CIR_UnaryOpKind, "unary op kind">:$kind,
- Arg<CIR_AnyType>:$input,
- UnitAttr:$no_signed_wrap
- );
+ let hasFolder = 1;
+}
- let results = (outs CIR_AnyType:$result);
+//===----------------------------------------------------------------------===//
+// DecOp
+//===----------------------------------------------------------------------===//
- let assemblyFormat = [{
- `(` $kind `,` $input `)`
- (`nsw` $no_signed_wrap^)?
- `:` type($input) `,` type($result) attr-dict
+def CIR_DecOp : CIR_UnaryOpWithOverflowFlag<"dec", CIR_AnyType> {
+ let summary = "Decrement by one";
+ let description = [{
+ The `cir.dec` operation decrements the operand by one. The operand and
+ result must have the same type.
+
+ The optional `nsw` (no signed wrap) attribute indicates that the result
+ is poison if signed overflow occurs.
+
+ Example:
+
+ ```mlir
+ %1 = cir.dec %0 : !s32i
+ %3 = cir.dec nsw %2 : !s32i
+ ```
+ }];
+
+ let hasFolder = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// MinusOp
+//===----------------------------------------------------------------------===//
+
+def CIR_MinusOp : CIR_UnaryOpWithOverflowFlag<"minus", CIR_AnyType> {
+ let summary = "Unary minus (negation)";
+ let description = [{
+ The `cir.minus` operation negates the operand. The operand and result
+ must have the same type.
+
+ The optional `nsw` (no signed wrap) attribute indicates that the result
+ is poison if signed overflow occurs (e.g. negating the minimum signed
+ integer).
+
+ Example:
+
+ ```mlir
+ %1 = cir.minus %0 : !s32i
+ %3 = cir.minus nsw %2 : !s32i
+ %5 = cir.minus %4 : !cir.float
+ ```
+ }];
+
+ let hasFolder = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// NotOp
+//===----------------------------------------------------------------------===//
+
+def CIR_NotOp : CIR_UnaryOp<"not", CIR_AnyType, [Involution]> {
+ let summary = "Bitwise NOT / logical NOT";
+ let description = [{
+ The `cir.not` operation performs a bitwise NOT on integer types or a
+ logical NOT on boolean types. The operand and result must have the same
+ type.
+
+ Example:
+
+ ```mlir
+ %1 = cir.not %0 : !s32i
+ %3 = cir.not %2 : !cir.bool
+ ```
}];
- let hasVerifier = 1;
let hasFolder = 1;
}
@@ -2333,8 +2415,9 @@ def CIR_RemOp : CIR_BinaryOp<"rem", CIR_AnyArithType> {
// AndOp
//===----------------------------------------------------------------------===//
-// FIXME: Commutative, Idempotent traits
-def CIR_AndOp : CIR_BinaryOp<"and", CIR_AnyBitwiseType> {
+def CIR_AndOp : CIR_BinaryOp<"and", CIR_AnyBitwiseType, [
+ Commutative, Idempotent
+]> {
let summary = "Bitwise AND";
let description = [{
The `cir.and` operation performs a bitwise AND on integer operands.
@@ -2353,8 +2436,9 @@ def CIR_AndOp : CIR_BinaryOp<"and", CIR_AnyBitwiseType> {
// OrOp
//===----------------------------------------------------------------------===//
-// FIXME: Commutative, Idempotent traits
-def CIR_OrOp : CIR_BinaryOp<"or", CIR_AnyBitwiseType> {
+def CIR_OrOp : CIR_BinaryOp<"or", CIR_AnyBitwiseType, [
+ Commutative, Idempotent
+]> {
let summary = "Bitwise OR";
let description = [{
The `cir.or` operation performs a bitwise OR on integer operands.
@@ -2373,7 +2457,7 @@ def CIR_OrOp : CIR_BinaryOp<"or", CIR_AnyBitwiseType> {
// XorOp
//===----------------------------------------------------------------------===//
-def CIR_XorOp : CIR_BinaryOp<"xor", CIR_AnyBitwiseType> {
+def CIR_XorOp : CIR_BinaryOp<"xor", CIR_AnyBitwiseType, [Commutative]> {
let summary = "Bitwise XOR";
let description = [{
The `cir.xor` operation performs a bitwise XOR on integer operands.
@@ -2392,17 +2476,44 @@ def CIR_XorOp : CIR_BinaryOp<"xor", CIR_AnyBitwiseType> {
// MaxOp
//===----------------------------------------------------------------------===//
-def CIR_MaxOp : CIR_BinaryOp<"max", CIR_AnyIntOrVecOfIntType> {
+def CIR_MaxOp : CIR_BinaryOp<"max", CIR_AnyIntOrVecOfIntType, [
+ Commutative, Idempotent
+]> {
let summary = "Integer maximum";
let description = [{
The `cir.max` operation computes the maximum of two integer operands.
- Both operands and the result must have the same integer type.
+ Both operands and the result must have the same integer type or vector of
+ integer type.
Example:
```mlir
%0 = cir.max %a, %b : !s32i
%1 = cir.max %a, %b : !u32i
+ %2 = cir.max %a, %b : !cir.vector<4 x !s32i>
+ ```
+ }];
+}
+
+//===----------------------------------------------------------------------===//
+// MinOp
+//===----------------------------------------------------------------------===//
+
+def CIR_MinOp : CIR_BinaryOp<"min", CIR_AnyIntOrVecOfIntType, [
+ Commutative, Idempotent
+]> {
+ let summary = "Integer minimum";
+ let description = [{
+ The `cir.min` operation computes the minimum of two integer operands.
+ Both operands and the result must have the same integer type or vector of
+ integer type.
+
+ Example:
+
+ ```mlir
+ %0 = cir.min %a, %b : !s32i
+ %1 = cir.min %a, %b : !u32i
+ %2 = cir.min %a, %b : !cir.vector<4 x !s32i>
```
}];
}
@@ -5595,7 +5706,7 @@ def FPClassTestEnum : CIR_I32EnumAttr<"FPClassTest", "floating-point class test
let cppNamespace = "::cir";
}
-def CIR_IsFPClassOp : CIR_Op<"is_fp_class"> {
+def CIR_IsFPClassOp : CIR_Op<"is_fp_class", [Pure]> {
let summary = "Corresponding to the `__builtin_fpclassify` builtin function in clang";
let description = [{
diff --git a/clang/include/clang/CIR/Interfaces/CIROpInterfaces.td b/clang/include/clang/CIR/Interfaces/CIROpInterfaces.td
index c715e2e03a7b9..41885d4df8f20 100644
--- a/clang/include/clang/CIR/Interfaces/CIROpInterfaces.td
+++ b/clang/include/clang/CIR/Interfaces/CIROpInterfaces.td
@@ -189,6 +189,27 @@ let cppNamespace = "::cir" in {
];
}
+ def CIR_UnaryOpInterface : OpInterface<"UnaryOpInterface"> {
+ let description = [{
+ Common interface for CIR unary operations.
+ Provides uniform access to the input operand and result of any unary
+ operation in the CIR dialect.
+ }];
+
+ let methods = [
+ InterfaceMethod<"Return the input operand.",
+ "mlir::Value", "getInput", (ins), [{}],
+ /*defaultImplementation=*/[{
+ return $_op.getInput();
+ }]>,
+ InterfaceMethod<"Return the result value.",
+ "mlir::Value", "getResult", (ins), [{}],
+ /*defaultImplementation=*/[{
+ return $_op.getResult();
+ }]>,
+ ];
+ }
+
} // namespace cir
#endif // CLANG_CIR_INTERFACES_CIROPINTERFACES_TD
diff --git a/clang/include/clang/Config/config.h.cmake b/clang/include/clang/Config/config.h.cmake
index 00c352b458c34..05e4deb671a7e 100644
--- a/clang/include/clang/Config/config.h.cmake
+++ b/clang/include/clang/Config/config.h.cmake
@@ -85,4 +85,10 @@
/* Whether CIR is built into Clang */
#cmakedefine01 CLANG_ENABLE_CIR
+/* Whether to use xcselect to find the macOS SDK */
+#cmakedefine CLANG_USE_XCSELECT
+
+/* Policy to use for xcselect */
+#cmakedefine CLANG_XCSELECT_HOST_SDK_POLICY ${CLANG_XCSELECT_HOST_SDK_POLICY}
+
#endif
diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h
index aea18a836328f..45369e2e142c2 100644
--- a/clang/include/clang/Format/Format.h
+++ b/clang/include/clang/Format/Format.h
@@ -823,50 +823,126 @@ struct FormatStyle {
/// Different styles for merging short functions containing at most one
/// statement.
- enum ShortFunctionStyle : int8_t {
- /// Never merge functions into a single line.
- SFS_None,
- /// Only merge functions defined inside a class. Same as ``inline``,
- /// except it does not imply ``empty``: i.e. top level empty functions
- /// are not merged either.
- /// \code
- /// class Foo {
- /// void f() { foo(); }
- /// };
- /// void f() {
- /// foo();
- /// }
- /// void f() {
- /// }
- /// \endcode
- SFS_InlineOnly,
- /// Only merge empty functions.
+ ///
+ /// For backwards compatibility, these options can still be specified as a
+ /// single value. The choices are:
+ ///
+ /// * ``None``
+ /// Never merge functions into a single line.
+ ///
+ /// * ``InlineOnly``
+ /// Only merge functions defined inside a class. Same as ``inline``,
+ /// except it does not imply ``empty``: i.e. top level empty functions
+ /// are not merged either. This option is **deprecated** and is retained
+ /// for backwards compatibility. See ``Inline`` of ``ShortFunctionStyle``.
+ /// \code
+ /// class Foo {
+ /// void f() { foo(); }
+ /// };
+ /// void f() {
+ /// foo();
+ /// }
+ /// void f() {
+ /// }
+ /// \endcode
+ ///
+ /// * ``Empty``
+ /// Only merge empty functions. This option is **deprecated** and is
+ /// retained for backwards compatibility. See ``Empty`` of
+ /// ``ShortFunctionStyle``.
+ /// \code
+ /// void f() {}
+ /// void f2() {
+ /// bar2();
+ /// }
+ /// \endcode
+ ///
+ /// * ``Inline``
+ /// Only merge functions defined inside a class. Implies ``empty``. This
+ /// option is **deprecated** and is retained for backwards compatibility.
+ /// See ``Inline`` and ``Empty`` of ``ShortFunctionStyle``.
+ /// \code
+ /// class Foo {
+ /// void f() { foo(); }
+ /// };
+ /// void f() {
+ /// foo();
+ /// }
+ /// void f() {}
+ /// \endcode
+ ///
+ /// * ``All``
+ /// Merge all functions fitting on a single line.
+ /// \code
+ /// class Foo {
+ /// void f() { foo(); }
+ /// };
+ /// void f() { bar(); }
+ /// \endcode
+ ///
+ /// Also can be specified as a nested configuration flag:
+ /// \code
+ /// # Example of usage:
+ /// AllowShortFunctionsOnASingleLine: InlineOnly
+ ///
+ /// # or more granular control:
+ /// AllowShortFunctionsOnASingleLine:
+ /// Empty: false
+ /// Inline: true
+ /// Other: false
+ /// \endcode
+ struct ShortFunctionStyle {
+ /// Merge top-level empty functions.
/// \code
/// void f() {}
/// void f2() {
/// bar2();
/// }
+ /// void f3() { /* comment */ }
/// \endcode
- SFS_Empty,
- /// Only merge functions defined inside a class. Implies ``empty``.
+ bool Empty;
+ /// Merge functions defined inside a class.
/// \code
/// class Foo {
/// void f() { foo(); }
+ /// void g() {}
/// };
/// void f() {
/// foo();
/// }
- /// void f() {}
+ /// void f() {
+ /// }
/// \endcode
- SFS_Inline,
- /// Merge all functions fitting on a single line.
+ bool Inline;
+ /// Merge all functions fitting on a single line. Please note that this
+ /// control does not include ``Empty``.
/// \code
/// class Foo {
/// void f() { foo(); }
/// };
/// void f() { bar(); }
/// \endcode
- SFS_All,
+ bool Other;
+
+ bool operator==(const ShortFunctionStyle &R) const {
+ return Empty == R.Empty && Inline == R.Inline && Other == R.Other;
+ }
+ bool operator!=(const ShortFunctionStyle &R) const { return !(*this == R); }
+ ShortFunctionStyle() : Empty(false), Inline(false), Other(false) {}
+ ShortFunctionStyle(bool Empty, bool Inline, bool Other)
+ : Empty(Empty), Inline(Inline), Other(Other) {}
+ bool isAll() const { return Empty && Inline && Other; }
+ static ShortFunctionStyle setEmptyOnly() {
+ return ShortFunctionStyle(true, false, false);
+ }
+ static ShortFunctionStyle setEmptyAndInline() {
+ return ShortFunctionStyle(true, true, false);
+ }
+ static ShortFunctionStyle setInlineOnly() {
+ return ShortFunctionStyle(false, true, false);
+ }
+ static ShortFunctionStyle setAll() {
+ return ShortFunctionStyle(true, true, true);
+ }
};
/// Dependent on the value, ``int f() { return 0; }`` can be put on a
@@ -988,6 +1064,36 @@ struct FormatStyle {
/// \version 20
bool AllowShortNamespacesOnASingleLine;
+ /// Different styles for merging short records (``class``,``struct``, and
+ /// ``union``).
+ enum ShortRecordStyle : int8_t {
+ /// Never merge records into a single line.
+ SRS_Never,
+ /// Only merge empty records if the opening brace was not wrapped,
+ /// i.e. the corresponding ``BraceWrapping.After...`` option was not set.
+ SRS_EmptyAndAttached,
+ /// Only merge empty records.
+ /// \code
+ /// struct foo {};
+ /// struct bar
+ /// {
+ /// int i;
+ /// };
+ /// \endcode
+ SRS_Empty,
+ /// Merge all records that fit on a single line.
+ /// \code
+ /// struct foo {};
+ /// struct bar { int i; };
+ /// \endcode
+ SRS_Always
+ };
+
+ /// Dependent on the value, ``struct bar { int i; };`` can be put on a single
+ /// line.
+ /// \version 23
+ ShortRecordStyle AllowShortRecordOnASingleLine;
+
/// Different ways to break after the function definition return type.
/// This option is **deprecated** and is retained for backwards compatibility.
enum DefinitionReturnTypeBreakingStyle : int8_t {
@@ -5812,6 +5918,7 @@ struct FormatStyle {
AllowShortLoopsOnASingleLine == R.AllowShortLoopsOnASingleLine &&
AllowShortNamespacesOnASingleLine ==
R.AllowShortNamespacesOnASingleLine &&
+ AllowShortRecordOnASingleLine == R.AllowShortRecordOnASingleLine &&
AlwaysBreakBeforeMultilineStrings ==
R.AlwaysBreakBeforeMultilineStrings &&
AttributeMacros == R.AttributeMacros &&
diff --git a/clang/include/clang/Frontend/FrontendOptions.h b/clang/include/clang/Frontend/FrontendOptions.h
index 9e05181ac916c..f7f51bc37c98d 100644
--- a/clang/include/clang/Frontend/FrontendOptions.h
+++ b/clang/include/clang/Frontend/FrontendOptions.h
@@ -543,6 +543,21 @@ class FrontendOptions {
/// minimization hints.
std::string DumpMinimizationHintsPath;
+ /// List of SSAF extractors to enable.
+ std::vector<std::string> SSAFExtractSummaries;
+
+ /// The TU summary output file with the file extension representing the file
+ /// format.
+ std::string SSAFTUSummaryFile;
+
+ /// Show available SSAF summary extractors.
+ LLVM_PREFERRED_TYPE(bool)
+ unsigned SSAFShowExtractors : 1;
+
+ /// Show available SSAF serialization formats.
+ LLVM_PREFERRED_TYPE(bool)
+ unsigned SSAFShowFormats : 1;
+
public:
FrontendOptions()
: DisableFree(false), RelocatablePCH(false), ShowHelp(false),
@@ -560,7 +575,8 @@ class FrontendOptions {
EmitPrettySymbolGraphs(false), GenReducedBMI(false),
UseClangIRPipeline(false), ClangIRDisablePasses(false),
ClangIRDisableCIRVerifier(false), ClangIREnableIdiomRecognizer(false),
- TimeTraceGranularity(500), TimeTraceVerbose(false) {}
+ TimeTraceGranularity(500), TimeTraceVerbose(false),
+ SSAFShowExtractors(false), SSAFShowFormats(false) {}
/// getInputKindForExtension - Return the appropriate input kind for a file
/// extension. For example, "c" would return Language::C.
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 8e17cd5ae15b5..6fc52384a6d1d 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -62,7 +62,7 @@ def LinkOption : OptionFlag;
// target will lead to an err_drv_unsupported_opt_for_target error.
def TargetSpecific : OptionFlag;
-// Indicates that this warning is ignored, but accepted with a warning for
+// Indicates that this option is ignored, but accepted with a warning for
// GCC compatibility.
class IgnoredGCCCompat : Flags<[HelpHidden]> {}
@@ -274,6 +274,10 @@ def StaticAnalyzer_Group : OptionGroup<"<Static analyzer group>">,
DocName<"Static analyzer options">, DocBrief<[{
Flags controlling the behavior of the Clang Static Analyzer.}]>;
+def SSAF_Group : OptionGroup<"<ssaf options>">,
+ DocName<"SSAF options">, DocBrief<[{
+Flags controlling the behavior of the Scalable Static Analysis Framework (SSAF).}]>;
+
// gfortran options that we recognize in the driver and pass along when
// invoking GCC to compile Fortran code.
def gfortran_Group : OptionGroup<"<gfortran group>">,
@@ -784,7 +788,7 @@ def fshow_skipped_includes : Flag<["-"], "fshow-skipped-includes">,
or #pragma once. This flag makes -H show also such includes.}]>,
MarshallingInfoFlag<DependencyOutputOpts<"ShowSkippedHeaderIncludes">>;
-def I_ : Flag<["-"], "I-">, Group<I_Group>,
+def I_ : Flag<["-"], "I-">, Group<I_Group>, IgnoredGCCCompat,
HelpText<"Restrict all prior -I flags to double-quoted inclusion and "
"remove current directory from include path">;
def I : JoinedOrSeparate<["-"], "I">, Group<I_Group>,
@@ -941,6 +945,34 @@ def W_Joined : Joined<["-"], "W">, Group<W_Group>,
def Xanalyzer : Separate<["-"], "Xanalyzer">,
HelpText<"Pass <arg> to the static analyzer">, MetaVarName<"<arg>">,
Group<StaticAnalyzer_Group>;
+def _ssaf_extract_summaries :
+ CommaJoined<["--"], "ssaf-extract-summaries=">,
+ MetaVarName<"<summary-names>">,
+ Group<SSAF_Group>,
+ Visibility<[ClangOption, CC1Option]>,
+ HelpText<"Comma-separated list of summary names to extract">,
+ MarshallingInfoStringVector<FrontendOpts<"SSAFExtractSummaries">>;
+def _ssaf_tu_summary_file :
+ Joined<["--"], "ssaf-tu-summary-file=">,
+ MetaVarName<"<path>.<format>">,
+ Group<SSAF_Group>,
+ Visibility<[ClangOption, CC1Option]>,
+ HelpText<
+ "The output file for the extracted summaries. "
+ "The extension selects which file format to use.">,
+ MarshallingInfoString<FrontendOpts<"SSAFTUSummaryFile">>;
+def _ssaf_list_extractors :
+ Flag<["--"], "ssaf-list-extractors">,
+ Group<SSAF_Group>,
+ Visibility<[ClangOption, CC1Option]>,
+ HelpText<"Display the list of available SSAF summary extractors">,
+ MarshallingInfoFlag<FrontendOpts<"SSAFShowExtractors">>;
+def _ssaf_list_formats :
+ Flag<["--"], "ssaf-list-formats">,
+ Group<SSAF_Group>,
+ Visibility<[ClangOption, CC1Option]>,
+ HelpText<"Display the list of available SSAF serialization formats">,
+ MarshallingInfoFlag<FrontendOpts<"SSAFShowFormats">>;
def Xarch__
: JoinedAndSeparate<["-"], "Xarch_">,
Flags<[NoXarchOption]>,
@@ -2485,7 +2517,7 @@ def fno_sanitize_coverage : CommaJoined<["-"], "fno-sanitize-coverage=">,
Group<fsan_cov_Group>, Visibility<[ClangOption, CLOption]>,
HelpText<"Disable features of coverage instrumentation for Sanitizers">,
Values<"func,bb,edge,indirect-calls,trace-bb,trace-cmp,trace-div,trace-gep,"
- "8bit-counters,trace-pc,trace-pc-guard,no-prune,inline-8bit-counters,"
+ "8bit-counters,trace-pc,trace-pc-entry-exit,trace-pc-guard,no-prune,inline-8bit-counters,"
"inline-bool-flag">;
def fsanitize_coverage_allowlist : Joined<["-"], "fsanitize-coverage-allowlist=">,
Group<fsan_cov_Group>, Visibility<[ClangOption, CLOption]>,
@@ -7175,9 +7207,9 @@ def mno_gather : Flag<["-"], "mno-gather">, Group<m_Group>,
def mno_scatter : Flag<["-"], "mno-scatter">, Group<m_Group>,
HelpText<"Disable generation of scatter instructions in auto-vectorization(x86 only)">;
def mapx_features_EQ : CommaJoined<["-"], "mapx-features=">, Group<m_x86_Features_Group>,
- HelpText<"Enable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu">, Visibility<[ClangOption, CLOption, FlangOption]>;
+ HelpText<"Enable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu,jmpabs">, Visibility<[ClangOption, CLOption, FlangOption]>;
def mno_apx_features_EQ : CommaJoined<["-"], "mno-apx-features=">, Group<m_x86_Features_Group>,
- HelpText<"Disable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu">, Visibility<[ClangOption, CLOption, FlangOption]>;
+ HelpText<"Disable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu,jmpabs">, Visibility<[ClangOption, CLOption, FlangOption]>;
def mapxf : Flag<["-"], "mapxf">, Group<m_x86_Features_Group>;
def mno_apxf : Flag<["-"], "mno-apxf">, Group<m_x86_Features_Group>;
def mapx_inline_asm_use_gpr32 : Flag<["-"], "mapx-inline-asm-use-gpr32">, Group<m_Group>,
@@ -8267,6 +8299,11 @@ def fsanitize_coverage_trace_pc
Group<fsan_cov_Group>,
HelpText<"Enable PC tracing in sanitizer coverage">,
MarshallingInfoFlag<CodeGenOpts<"SanitizeCoverageTracePC">>;
+def fsanitize_coverage_trace_pc_entry_exit
+ : Flag<["-"], "fsanitize-coverage-trace-pc-entry-exit">,
+ Group<fsan_cov_Group>,
+ HelpText<"Enable function entry/exit tracing in sanitizer coverage">,
+ MarshallingInfoFlag<CodeGenOpts<"SanitizeCoverageTracePCEntryExit">>;
def fsanitize_coverage_trace_pc_guard
: Flag<["-"], "fsanitize-coverage-trace-pc-guard">,
Group<fsan_cov_Group>,
diff --git a/clang/include/clang/ScalableStaticAnalysisFramework/Core/Serialization/JSONFormat.h b/clang/include/clang/ScalableStaticAnalysisFramework/Core/Serialization/JSONFormat.h
index 6c5303c928661..47b46cbe42698 100644
--- a/clang/include/clang/ScalableStaticAnalysisFramework/Core/Serialization/JSONFormat.h
+++ b/clang/include/clang/ScalableStaticAnalysisFramework/Core/Serialization/JSONFormat.h
@@ -28,10 +28,6 @@ class EntityIdTable;
class EntitySummary;
class SummaryName;
-/// Call this from main() to prevent the linker from dead-stripping the
-/// JSONFormat library and its static registration objects.
-void initializeJSONFormat();
-
class JSONFormat final : public SerializationFormat {
using Array = llvm::json::Array;
using Object = llvm::json::Object;
diff --git a/clang/include/clang/ScalableStaticAnalysisFramework/Core/Serialization/SerializationFormatRegistry.h b/clang/include/clang/ScalableStaticAnalysisFramework/Core/Serialization/SerializationFormatRegistry.h
index 9f83955b884e8..a1955e64d5137 100644
--- a/clang/include/clang/ScalableStaticAnalysisFramework/Core/Serialization/SerializationFormatRegistry.h
+++ b/clang/include/clang/ScalableStaticAnalysisFramework/Core/Serialization/SerializationFormatRegistry.h
@@ -24,10 +24,11 @@
//
// Insert this code to the cpp file:
//
-// LLVM_INSTANTIATE_REGISTRY(llvm::Registry<MyFormat::FormatInfo>)
-//
+// // NOLINTNEXTLINE(misc-use-internal-linkage)
+// volatile int SSAFMyFormatAnchorSource = 0;
// static SerializationFormatRegistry::Add<MyFormat>
// RegisterFormat("MyFormat", "My awesome serialization format");
+// LLVM_INSTANTIATE_REGISTRY(llvm::Registry<MyFormat::FormatInfo>)
//
// Then implement the formatter for the specific analysis and register the
// format info for it:
@@ -49,6 +50,15 @@
// "The MyFormat format info implementation for MyAnalysis"
// );
//
+// Finally, insert a use of the new anchor symbol into the force-linker header:
+// clang/include/clang/ScalableStaticAnalysisFramework/SSAFBuiltinForceLinker.h:
+//
+// This anchor is used to force the linker to link the MyFormat registration.
+//
+// extern volatile int SSAFMyFormatAnchorSource;
+// [[maybe_unused]] static int SSAFMyFormatAnchorDestination =
+// SSAFMyFormatAnchorSource;
+//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_SCALABLESTATICANALYSISFRAMEWORK_CORE_SERIALIZATION_SERIALIZATIONFORMATREGISTRY_H
@@ -58,6 +68,7 @@
#include "clang/Support/Compiler.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Registry.h"
+#include "llvm/Support/raw_ostream.h"
namespace clang::ssaf {
@@ -70,6 +81,9 @@ bool isFormatRegistered(llvm::StringRef FormatName);
/// It's a fatal error if there is no format registered with the name.
std::unique_ptr<SerializationFormat> makeFormat(llvm::StringRef FormatName);
+/// Print the list of available serialization formats.
+void printAvailableFormats(llvm::raw_ostream &OS);
+
// Registry for adding new SerializationFormat implementations.
using SerializationFormatRegistry = llvm::Registry<SerializationFormat>;
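To illustrate how a driver is expected to consume the registry API declared above, here is a minimal sketch. It assumes the SSAF headers added in this patch are on the include path; the helper listOrMakeFormat is hypothetical and only mirrors what the new --ssaf-list-formats option needs.

    // Sketch only: driver-side use of the format registry (hypothetical helper,
    // not part of this patch).
    #include "clang/ScalableStaticAnalysisFramework/Core/Serialization/SerializationFormatRegistry.h"
    #include "clang/ScalableStaticAnalysisFramework/SSAFForceLinker.h" // IWYU pragma: keep
    #include "llvm/Support/raw_ostream.h"
    #include <memory>

    std::unique_ptr<clang::ssaf::SerializationFormat>
    listOrMakeFormat(llvm::StringRef FormatName, bool ListOnly) {
      if (ListOnly) {
        // This is the query behind the new --ssaf-list-formats flag.
        clang::ssaf::printAvailableFormats(llvm::outs());
        return nullptr;
      }
      // makeFormat() is a fatal error for unknown names, so check first.
      if (!clang::ssaf::isFormatRegistered(FormatName))
        return nullptr;
      return clang::ssaf::makeFormat(FormatName);
    }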
diff --git a/clang/include/clang/ScalableStaticAnalysisFramework/Core/SummaryData/SummaryDataStore.h b/clang/include/clang/ScalableStaticAnalysisFramework/Core/SummaryData/SummaryDataStore.h
index f817dda9745c1..2d20696c65cf3 100644
--- a/clang/include/clang/ScalableStaticAnalysisFramework/Core/SummaryData/SummaryDataStore.h
+++ b/clang/include/clang/ScalableStaticAnalysisFramework/Core/SummaryData/SummaryDataStore.h
@@ -106,7 +106,7 @@ class SummaryDataStore {
}
auto Ptr = std::move(It->second);
Data.erase(It);
- return Ptr;
+ return std::move(Ptr);
}
};
diff --git a/clang/include/clang/ScalableStaticAnalysisFramework/Core/TUSummary/ExtractorRegistry.h b/clang/include/clang/ScalableStaticAnalysisFramework/Core/TUSummary/ExtractorRegistry.h
index a49d0e5faeeb1..4f8301dcbfed4 100644
--- a/clang/include/clang/ScalableStaticAnalysisFramework/Core/TUSummary/ExtractorRegistry.h
+++ b/clang/include/clang/ScalableStaticAnalysisFramework/Core/TUSummary/ExtractorRegistry.h
@@ -9,9 +9,18 @@
// Registry for TUSummaryExtractors, and some helper functions.
// To register some custom extractor, insert this code:
//
+// // NOLINTNEXTLINE(misc-use-internal-linkage)
+// volatile int SSAFMyExtractorAnchorSource = 0;
// static TUSummaryExtractorRegistry::Add<MyExtractor>
// X("MyExtractor", "My awesome extractor");
//
+// Finally, insert a use of the new anchor symbol into the force-linker header:
+// clang/include/clang/ScalableStaticAnalysisFramework/SSAFBuiltinForceLinker.h:
+//
+// extern volatile int SSAFMyExtractorAnchorSource;
+// [[maybe_unused]] static int SSAFMyExtractorAnchorDestination =
+// SSAFMyExtractorAnchorSource;
+//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_SCALABLESTATICANALYSISFRAMEWORK_CORE_TUSUMMARY_EXTRACTORREGISTRY_H
@@ -21,6 +30,7 @@
#include "clang/Support/Compiler.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Registry.h"
+#include "llvm/Support/raw_ostream.h"
namespace clang::ssaf {
@@ -34,6 +44,9 @@ bool isTUSummaryExtractorRegistered(llvm::StringRef SummaryName);
std::unique_ptr<ASTConsumer> makeTUSummaryExtractor(llvm::StringRef SummaryName,
TUSummaryBuilder &Builder);
+/// Print the list of available TUSummaryExtractors.
+void printAvailableTUSummaryExtractors(llvm::raw_ostream &OS);
+
// Registry for adding new TUSummaryExtractor implementations.
using TUSummaryExtractorRegistry =
llvm::Registry<TUSummaryExtractor, TUSummaryBuilder &>;
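Along the same lines, a minimal sketch of how the extractor registry above might be driven when assembling the frontend pipeline. Only the clang::ssaf entry points come from the header; the surrounding helper and its name are hypothetical.

    // Sketch only: turning requested extractor names into the ASTConsumers
    // that get injected into the pipeline (hypothetical driver helper).
    #include "clang/AST/ASTConsumer.h"
    #include "clang/ScalableStaticAnalysisFramework/Core/TUSummary/ExtractorRegistry.h"
    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/Support/raw_ostream.h"
    #include <memory>
    #include <string>
    #include <vector>

    std::vector<std::unique_ptr<clang::ASTConsumer>>
    makeRequestedExtractors(llvm::ArrayRef<std::string> Names,
                            clang::ssaf::TUSummaryBuilder &Builder) {
      std::vector<std::unique_ptr<clang::ASTConsumer>> Consumers;
      for (const std::string &Name : Names) {
        if (!clang::ssaf::isTUSummaryExtractorRegistered(Name)) {
          // This is the query behind the new --ssaf-list-extractors flag.
          clang::ssaf::printAvailableTUSummaryExtractors(llvm::errs());
          continue;
        }
        Consumers.push_back(clang::ssaf::makeTUSummaryExtractor(Name, Builder));
      }
      return Consumers;
    }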
diff --git a/clang/include/clang/ScalableStaticAnalysisFramework/Frontend/TUSummaryExtractorFrontendAction.h b/clang/include/clang/ScalableStaticAnalysisFramework/Frontend/TUSummaryExtractorFrontendAction.h
new file mode 100644
index 0000000000000..fe5d75149914e
--- /dev/null
+++ b/clang/include/clang/ScalableStaticAnalysisFramework/Frontend/TUSummaryExtractorFrontendAction.h
@@ -0,0 +1,33 @@
+//===- TUSummaryExtractorFrontendAction.h -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_SCALABLESTATICANALYSISFRAMEWORK_FRONTEND_TUSUMMARYEXTRACTORFRONTENDACTION_H
+#define LLVM_CLANG_SCALABLESTATICANALYSISFRAMEWORK_FRONTEND_TUSUMMARYEXTRACTORFRONTENDACTION_H
+
+#include "clang/Frontend/FrontendAction.h"
+#include <memory>
+
+namespace clang::ssaf {
+
+/// Wraps the existing \c FrontendAction and injects the extractor
+/// \c ASTConsumers into the pipeline after the ASTConsumers of the wrapped
+/// action.
+class TUSummaryExtractorFrontendAction final : public WrapperFrontendAction {
+public:
+ explicit TUSummaryExtractorFrontendAction(
+ std::unique_ptr<FrontendAction> WrappedAction);
+ ~TUSummaryExtractorFrontendAction();
+
+protected:
+ std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &CI,
+ StringRef InFile) override;
+};
+
+} // namespace clang::ssaf
+
+#endif // LLVM_CLANG_SCALABLESTATICANALYSISFRAMEWORK_FRONTEND_TUSUMMARYEXTRACTORFRONTENDACTION_H
diff --git a/clang/include/clang/ScalableStaticAnalysisFramework/SSAFBuiltinForceLinker.h b/clang/include/clang/ScalableStaticAnalysisFramework/SSAFBuiltinForceLinker.h
new file mode 100644
index 0000000000000..5f201487ca1fe
--- /dev/null
+++ b/clang/include/clang/ScalableStaticAnalysisFramework/SSAFBuiltinForceLinker.h
@@ -0,0 +1,28 @@
+//===- SSAFBuiltinForceLinker.h ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file pulls in all built-in SSAF extractor and format registrations
+/// by referencing their anchor symbols, preventing the static linker from
+/// discarding the containing object files.
+///
+/// Include this header (with IWYU pragma: keep) in any translation unit that
+/// must guarantee these registrations are active — typically the entry point
+/// of a binary that uses clangScalableStaticAnalysisFrameworkCore.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_SCALABLESTATICANALYSISFRAMEWORK_SSAFBUILTINFORCELINKER_H
+#define LLVM_CLANG_SCALABLESTATICANALYSISFRAMEWORK_SSAFBUILTINFORCELINKER_H
+
+// This anchor is used to force the linker to link the JSONFormat registration.
+extern volatile int SSAFJSONFormatAnchorSource;
+[[maybe_unused]] static int SSAFJSONFormatAnchorDestination =
+ SSAFJSONFormatAnchorSource;
+
+#endif // LLVM_CLANG_SCALABLESTATICANALYSISFRAMEWORK_SSAFBUILTINFORCELINKER_H
diff --git a/clang/include/clang/ScalableStaticAnalysisFramework/SSAFForceLinker.h b/clang/include/clang/ScalableStaticAnalysisFramework/SSAFForceLinker.h
new file mode 100644
index 0000000000000..204a504c36435
--- /dev/null
+++ b/clang/include/clang/ScalableStaticAnalysisFramework/SSAFForceLinker.h
@@ -0,0 +1,25 @@
+//===- SSAFForceLinker.h ----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file pulls in all built-in SSAF extractor and format registrations
+/// by referencing their anchor symbols, preventing the static linker from
+/// discarding the containing object files.
+///
+/// Include this header (with IWYU pragma: keep) in any translation unit that
+/// must guarantee these registrations are active — typically the entry point
+/// of a binary that uses clangScalableStaticAnalysisFrameworkCore.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_SCALABLESTATICANALYSISFRAMEWORK_SSAFFORCELINKER_H
+#define LLVM_CLANG_SCALABLESTATICANALYSISFRAMEWORK_SSAFFORCELINKER_H
+
+#include "SSAFBuiltinForceLinker.h" // IWYU pragma: keep
+
+#endif // LLVM_CLANG_SCALABLESTATICANALYSISFRAMEWORK_SSAFFORCELINKER_H
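The force-linker headers above exist because static registration objects have no direct references, so a static linker may drop the object file that defines them. A self-contained toy example of the anchor trick, generic C++ and not tied to SSAF, with the intended file split marked by comments:

    // --- plugin.cpp: sits in a static library with no direct users ---
    #include <map>
    #include <string>

    std::map<std::string, bool> &formatRegistry() {
      static std::map<std::string, bool> Registry;
      return Registry;
    }

    namespace {
    struct RegisterFormat {
      explicit RegisterFormat(const char *Name) { formatRegistry()[Name] = true; }
    };
    } // namespace

    // Static registration object: has a side effect but no direct references,
    // so the linker is free to discard the whole object file.
    static RegisterFormat RegisterJSON("json");

    // Anchor with external linkage; any TU that references it forces the
    // linker to keep this object file, and with it the registration above.
    volatile int JSONAnchorSource = 0;

    // --- main.cpp: the binary that relies on the registration ---
    extern volatile int JSONAnchorSource;
    [[maybe_unused]] static int JSONAnchorDestination = JSONAnchorSource;

    std::map<std::string, bool> &formatRegistry(); // defined in plugin.cpp

    int main() { return formatRegistry().count("json") ? 0 : 1; }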
diff --git a/clang/include/clang/Sema/Template.h b/clang/include/clang/Sema/Template.h
index 0be46e69f1b6f..b0170c21feb1a 100644
--- a/clang/include/clang/Sema/Template.h
+++ b/clang/include/clang/Sema/Template.h
@@ -185,13 +185,12 @@ enum class TemplateSubstitutionKind : char {
return !(*this)(Depth, Index).isNull();
}
- bool isAnyArgInstantiationDependent(const ASTContext &C) const {
+ bool isAnyArgInstantiationDependent() const {
for (ArgumentListLevel ListLevel : TemplateArgumentLists)
for (const TemplateArgument &TA : ListLevel.Args)
// There might be null template arguments representing unused template
// parameter mappings in an MLTAL during concept checking.
- if (!TA.isNull() &&
- C.getCanonicalTemplateArgument(TA).isInstantiationDependent())
+ if (!TA.isNull() && TA.isInstantiationDependent())
return true;
return false;
}
diff --git a/clang/include/clang/StaticAnalyzer/Core/CheckerManager.h b/clang/include/clang/StaticAnalyzer/Core/CheckerManager.h
index 348c7eff09161..8783ed74ee1ad 100644
--- a/clang/include/clang/StaticAnalyzer/Core/CheckerManager.h
+++ b/clang/include/clang/StaticAnalyzer/Core/CheckerManager.h
@@ -308,13 +308,15 @@ class CheckerManager {
const ObjCMethodCall &msg, ExprEngine &Eng,
bool wasInlined = false);
- /// Run checkers for pre-visiting obj-c messages.
+ /// Run checkers for pre-visiting function calls (including methods,
+ /// constructors, destructors, etc., but excluding obj-c messages).
void runCheckersForPreCall(ExplodedNodeSet &Dst, const ExplodedNodeSet &Src,
const CallEvent &Call, ExprEngine &Eng) {
runCheckersForCallEvent(/*isPreVisit=*/true, Dst, Src, Call, Eng);
}
- /// Run checkers for post-visiting obj-c messages.
+ /// Run checkers for post-visiting function calls (including methods,
+ /// constructors, destructors, etc., but excluding obj-c messages).
void runCheckersForPostCall(ExplodedNodeSet &Dst, const ExplodedNodeSet &Src,
const CallEvent &Call, ExprEngine &Eng,
bool wasInlined = false) {
@@ -322,7 +324,8 @@ class CheckerManager {
wasInlined);
}
- /// Run checkers for visiting obj-c messages.
+ /// Run checkers for visiting function calls (including methods,
+ /// constructors, destructors, etc., but excluding obj-c messages).
void runCheckersForCallEvent(bool isPreVisit, ExplodedNodeSet &Dst,
const ExplodedNodeSet &Src,
const CallEvent &Call, ExprEngine &Eng,
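For readers less familiar with the analyzer side, the corrected comments above describe the dispatch points for the ordinary call callbacks; a checker subscribing to them uses the existing static analyzer API and looks roughly like this (sketch only, unrelated to the rest of this patch):

    // Sketch: a checker receiving the visits that runCheckersForPreCall and
    // runCheckersForPostCall dispatch.
    #include "clang/StaticAnalyzer/Core/Checker.h"
    #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
    #include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"

    using namespace clang;
    using namespace ento;

    class DemoCallChecker : public Checker<check::PreCall, check::PostCall> {
    public:
      void checkPreCall(const CallEvent &Call, CheckerContext &C) const {
        // Called for functions, methods, constructors and destructors;
        // obj-c messages go through the separate ObjCMessage callbacks.
      }
      void checkPostCall(const CallEvent &Call, CheckerContext &C) const {
        // Called after the call has been modeled.
      }
    };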
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h
index cf035a999af8a..818a54eec48e5 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h
@@ -89,9 +89,7 @@ class CheckerContext {
/// Returns the number of times the current block has been visited
/// along the analyzed path.
- unsigned blockCount() const {
- return NB.getContext().blockCount();
- }
+ unsigned blockCount() const { return Eng.getNumVisitedCurrent(); }
ASTContext &getASTContext() {
return Eng.getContext();
@@ -155,9 +153,7 @@ class CheckerContext {
}
/// Get the blockID.
- unsigned getBlockID() const {
- return NB.getContext().getBlock()->getBlockID();
- }
+ unsigned getBlockID() const { return Eng.getCurrBlock()->getBlockID(); }
/// If the given node corresponds to a PostStore program point,
/// retrieve the location region as it was uttered in the code.
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h
index 6b70fda42819c..c2135ac6f7225 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CoreEngine.h
@@ -306,7 +306,6 @@ class NodeBuilder {
const ExplodedNodeSet &getResults() const { return Frontier; }
- const NodeBuilderContext &getContext() const { return C; }
bool hasGeneratedNodes() const { return HasGeneratedNodes; }
void takeNodes(const ExplodedNodeSet &S) {
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
index 2023a7a5b1ac8..40ce9084e7f78 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h
@@ -246,10 +246,15 @@ class ExprEngine {
// This implementation is a temporary measure to allow a gradual transition.
void setCurrLocationContextAndBlock(const LocationContext *LC,
const CFGBlock *B) {
- // Note that there is a call to resetCurrLocationContextAndBlock at the
- // beginning of dispatchWorkItem.
+ // The current LocationContext and Block are reset at the beginning of
+ // dispatchWorkItem. Ideally, this method should be called only once per
+ // dispatchWorkItem call (= elementary analysis step); so the following
+ // assertion is there to catch accidental repeated calls. If the current
+ // LocationContext and Block need to change in the middle of a single step
+ // (which currently happens only once, in processCallExit), use an explicit
+ // call to resetCurrLocationContextAndBlock.
assert(!currBldrCtx && !OwnedCurrBldrCtx &&
- "This should be called at most once per call to dispatchWorkItem");
+ "The current LocationContext and Block is already set");
OwnedCurrBldrCtx.emplace(Engine, B, LC);
currBldrCtx = &*OwnedCurrBldrCtx;
}
@@ -796,7 +801,7 @@ class ExprEngine {
/// A multi-dimensional array is also a continuous memory location in a
/// row major order, so for arr[0][0] Idx is 0 and for arr[3][3] Idx is 8.
SVal computeObjectUnderConstruction(const Expr *E, ProgramStateRef State,
- const NodeBuilderContext *BldrCtx,
+ unsigned NumVisitedCaller,
const LocationContext *LCtx,
const ConstructionContext *CC,
EvalCallOptions &CallOpts,
@@ -818,8 +823,8 @@ class ExprEngine {
const LocationContext *LCtx, const ConstructionContext *CC,
EvalCallOptions &CallOpts, unsigned Idx = 0) {
- SVal V = computeObjectUnderConstruction(E, State, BldrCtx, LCtx, CC,
- CallOpts, Idx);
+ SVal V = computeObjectUnderConstruction(E, State, BldrCtx->blockCount(),
+ LCtx, CC, CallOpts, Idx);
State = updateObjectsUnderConstruction(V, E, State, LCtx, CC, CallOpts);
return std::make_pair(State, V);
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 4c672a03eb855..19e5609160a99 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -13639,24 +13639,20 @@ ASTContext::getOperatorDeleteForVDtor(const CXXDestructorDecl *Dtor,
return nullptr;
}
-bool ASTContext::classNeedsVectorDeletingDestructor(const CXXRecordDecl *RD) {
+bool ASTContext::classMaybeNeedsVectorDeletingDestructor(
+ const CXXRecordDecl *RD) {
if (!getTargetInfo().emitVectorDeletingDtors(getLangOpts()))
return false;
- CXXDestructorDecl *Dtor = RD->getDestructor();
- // The compiler can't know if new[]/delete[] will be used outside of the DLL,
- // so just force vector deleting destructor emission if dllexport is present.
- // This matches MSVC behavior.
- if (Dtor && Dtor->isVirtual() && Dtor->hasAttr<DLLExportAttr>())
- return true;
- return RequireVectorDeletingDtor.count(RD);
+ return MaybeRequireVectorDeletingDtor.count(RD);
}
-void ASTContext::setClassNeedsVectorDeletingDestructor(
+void ASTContext::setClassMaybeNeedsVectorDeletingDestructor(
const CXXRecordDecl *RD) {
if (!getTargetInfo().emitVectorDeletingDtors(getLangOpts()))
return;
- RequireVectorDeletingDtor.insert(RD);
+
+ MaybeRequireVectorDeletingDtor.insert(RD);
}
MangleNumberingContext &
diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp
index 13d6dcc92356d..fc2da6362d0bb 100644
--- a/clang/lib/AST/ByteCode/Compiler.cpp
+++ b/clang/lib/AST/ByteCode/Compiler.cpp
@@ -5776,10 +5776,12 @@ bool Compiler<Emitter>::VisitCXXThisExpr(const CXXThisExpr *E) {
if (DiscardResult)
return true;
- if (this->LambdaThisCapture.Offset > 0) {
- if (this->LambdaThisCapture.IsPtr)
- return this->emitGetThisFieldPtr(this->LambdaThisCapture.Offset, E);
- return this->emitGetPtrThisField(this->LambdaThisCapture.Offset, E);
+ if constexpr (!std::is_same_v<Emitter, EvalEmitter>) {
+ if (this->LambdaThisCapture.Offset > 0) {
+ if (this->LambdaThisCapture.IsPtr)
+ return this->emitGetThisFieldPtr(this->LambdaThisCapture.Offset, E);
+ return this->emitGetPtrThisField(this->LambdaThisCapture.Offset, E);
+ }
}
// In some circumstances, the 'this' pointer does not actually refer to the
@@ -6113,11 +6115,13 @@ bool Compiler<Emitter>::visitWhileStmt(const WhileStmt *S) {
this->fallthrough(CondLabel);
this->emitLabel(CondLabel);
- {
- LocalScope<Emitter> CondScope(this);
- if (const DeclStmt *CondDecl = S->getConditionVariableDeclStmt())
- if (!visitDeclStmt(CondDecl))
- return false;
+ // Start of the loop body {
+ LocalScope<Emitter> CondScope(this);
+
+ if (const DeclStmt *CondDecl = S->getConditionVariableDeclStmt()) {
+ if (!visitDeclStmt(CondDecl))
+ return false;
+ }
if (!this->visitBool(Cond))
return false;
@@ -6133,12 +6137,14 @@ bool Compiler<Emitter>::visitWhileStmt(const WhileStmt *S) {
if (!CondScope.destroyLocals())
return false;
- }
- if (!this->jump(CondLabel))
- return false;
- this->fallthrough(EndLabel);
- this->emitLabel(EndLabel);
- return WholeLoopScope.destroyLocals();
+ // } End of loop body.
+
+ if (!this->jump(CondLabel))
+ return false;
+ this->fallthrough(EndLabel);
+ this->emitLabel(EndLabel);
+
+ return CondScope.destroyLocals() && WholeLoopScope.destroyLocals();
}
template <class Emitter> bool Compiler<Emitter>::visitDoStmt(const DoStmt *S) {
@@ -7491,14 +7497,16 @@ bool Compiler<Emitter>::visitDeclRef(const ValueDecl *D, const Expr *E) {
return this->visitDeclRef(D, E);
};
- // Lambda captures.
- if (auto It = this->LambdaCaptures.find(D);
- It != this->LambdaCaptures.end()) {
- auto [Offset, IsPtr] = It->second;
+ if constexpr (!std::is_same_v<Emitter, EvalEmitter>) {
+ // Lambda captures.
+ if (auto It = this->LambdaCaptures.find(D);
+ It != this->LambdaCaptures.end()) {
+ auto [Offset, IsPtr] = It->second;
- if (IsPtr)
- return this->emitGetThisFieldPtr(Offset, E);
- return this->emitGetPtrThisField(Offset, E);
+ if (IsPtr)
+ return this->emitGetThisFieldPtr(Offset, E);
+ return this->emitGetPtrThisField(Offset, E);
+ }
}
if (const auto *DRE = dyn_cast<DeclRefExpr>(E);
diff --git a/clang/lib/AST/ByteCode/Disasm.cpp b/clang/lib/AST/ByteCode/Disasm.cpp
index 35937e3483e38..5fd15f8a2f0d1 100644
--- a/clang/lib/AST/ByteCode/Disasm.cpp
+++ b/clang/lib/AST/ByteCode/Disasm.cpp
@@ -16,7 +16,6 @@
#include "FixedPoint.h"
#include "Floating.h"
#include "Function.h"
-#include "FunctionPointer.h"
#include "Integral.h"
#include "IntegralAP.h"
#include "InterpFrame.h"
diff --git a/clang/lib/AST/ByteCode/EvalEmitter.h b/clang/lib/AST/ByteCode/EvalEmitter.h
index a9f87db5d7f8d..f5c51c5f3dfa0 100644
--- a/clang/lib/AST/ByteCode/EvalEmitter.h
+++ b/clang/lib/AST/ByteCode/EvalEmitter.h
@@ -92,10 +92,6 @@ class EvalEmitter : public SourceMapper {
/// Parameter indices.
llvm::DenseMap<const ParmVarDecl *, ParamOffset> Params;
- /// Lambda captures.
- llvm::DenseMap<const ValueDecl *, ParamOffset> LambdaCaptures;
- /// Offset of the This parameter in a lambda record.
- ParamOffset LambdaThisCapture{0, false};
/// Local descriptors.
llvm::SmallVector<SmallVector<Local, 8>, 2> Descriptors;
std::optional<SourceInfo> LocOverride = std::nullopt;
diff --git a/clang/lib/AST/ByteCode/Function.h b/clang/lib/AST/ByteCode/Function.h
index 544172b7e0c26..f67eabf1f27b9 100644
--- a/clang/lib/AST/ByteCode/Function.h
+++ b/clang/lib/AST/ByteCode/Function.h
@@ -327,7 +327,8 @@ class Function final {
public:
/// Dumps the disassembled bytecode to \c llvm::errs().
- void dump(CodePtr PC = {}) const;
+ void dump() const { dump({}); }
+ void dump(CodePtr PC) const;
void dump(llvm::raw_ostream &OS, CodePtr PC = {}) const;
};
diff --git a/clang/lib/AST/ByteCode/FunctionPointer.cpp b/clang/lib/AST/ByteCode/FunctionPointer.cpp
deleted file mode 100644
index 2488626697284..0000000000000
--- a/clang/lib/AST/ByteCode/FunctionPointer.cpp
+++ /dev/null
@@ -1,36 +0,0 @@
-//===----------------------- FunctionPointer.cpp ----------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "FunctionPointer.h"
-
-namespace clang {
-namespace interp {
-
-APValue FunctionPointer::toAPValue(const ASTContext &) const {
- if (!Func)
- return APValue(static_cast<Expr *>(nullptr), CharUnits::Zero(), {},
- /*OnePastTheEnd=*/false, /*IsNull=*/true);
-
- if (Func->getDecl())
- return APValue(Func->getDecl(), CharUnits::fromQuantity(0), {},
- /*OnePastTheEnd=*/false, /*IsNull=*/false);
- return APValue(Func->getExpr(), CharUnits::fromQuantity(0), {},
- /*OnePastTheEnd=*/false, /*IsNull=*/false);
-}
-
-void FunctionPointer::print(llvm::raw_ostream &OS) const {
- OS << "FnPtr(";
- if (Func)
- OS << Func->getName();
- else
- OS << "nullptr";
- OS << ")";
-}
-
-} // namespace interp
-} // namespace clang
diff --git a/clang/lib/AST/ByteCode/FunctionPointer.h b/clang/lib/AST/ByteCode/FunctionPointer.h
deleted file mode 100644
index 9e8ea2f1af5f8..0000000000000
--- a/clang/lib/AST/ByteCode/FunctionPointer.h
+++ /dev/null
@@ -1,55 +0,0 @@
-//===--- FunctionPointer.h - Types for the constexpr VM ---------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_AST_INTERP_FUNCTION_POINTER_H
-#define LLVM_CLANG_AST_INTERP_FUNCTION_POINTER_H
-
-#include "Function.h"
-#include "Primitives.h"
-
-namespace clang {
-class ASTContext;
-class APValue;
-namespace interp {
-
-class FunctionPointer final {
-private:
- const Function *Func;
-
-public:
- FunctionPointer() = default;
- FunctionPointer(const Function *Func) : Func(Func) {}
-
- const Function *getFunction() const { return Func; }
- bool isZero() const { return !Func; }
- bool isWeak() const {
- if (!Func || !Func->getDecl())
- return false;
-
- return Func->getDecl()->isWeak();
- }
-
- APValue toAPValue(const ASTContext &) const;
- void print(llvm::raw_ostream &OS) const;
-
- std::string toDiagnosticString(const ASTContext &Ctx) const {
- if (!Func)
- return "nullptr";
-
- return toAPValue(Ctx).getAsString(Ctx, Func->getDecl()->getType());
- }
-
- uint64_t getIntegerRepresentation() const {
- return static_cast<uint64_t>(reinterpret_cast<uintptr_t>(Func));
- }
-};
-
-} // namespace interp
-} // namespace clang
-
-#endif
diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp
index ebc7220aa5671..3b92c1858126a 100644
--- a/clang/lib/AST/ByteCode/Interp.cpp
+++ b/clang/lib/AST/ByteCode/Interp.cpp
@@ -1903,8 +1903,7 @@ bool CallPtr(InterpState &S, CodePtr OpPC, uint32_t ArgSize,
if (!Ptr.isFunctionPointer())
return Invalid(S, OpPC);
- const FunctionPointer &FuncPtr = Ptr.asFunctionPointer();
- const Function *F = FuncPtr.getFunction();
+ const Function *F = Ptr.asFunctionPointer().Func;
assert(F);
// Don't allow calling block pointers.
if (!F->getDecl())
diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h
index 7b8a7c80c5423..01ea334f65cd3 100644
--- a/clang/lib/AST/ByteCode/Interp.h
+++ b/clang/lib/AST/ByteCode/Interp.h
@@ -2310,7 +2310,7 @@ std::optional<Pointer> OffsetHelper(InterpState &S, CodePtr OpPC,
if (N > 1)
S.CCEDiag(S.Current->getSource(OpPC), diag::note_constexpr_array_index)
<< N << /*non-array*/ true << 0;
- return Pointer(Ptr.asFunctionPointer().getFunction(), N);
+ return Pointer(Ptr.asFunctionPointer().Func, N);
} else if (!Ptr.isBlockPointer()) {
return std::nullopt;
}
diff --git a/clang/lib/AST/ByteCode/MemberPointer.cpp b/clang/lib/AST/ByteCode/MemberPointer.cpp
index c821b0fc689fd..f0c1fe5930261 100644
--- a/clang/lib/AST/ByteCode/MemberPointer.cpp
+++ b/clang/lib/AST/ByteCode/MemberPointer.cpp
@@ -8,7 +8,6 @@
#include "MemberPointer.h"
#include "Context.h"
-#include "FunctionPointer.h"
#include "Program.h"
#include "Record.h"
@@ -76,11 +75,6 @@ std::optional<Pointer> MemberPointer::toPointer(const Context &Ctx) const {
return Pointer(const_cast<Block *>(Base.block()), Offset, Offset);
}
-FunctionPointer MemberPointer::toFunctionPointer(const Context &Ctx) const {
- return FunctionPointer(
- Ctx.getProgram().getFunction(cast<FunctionDecl>(getDecl())));
-}
-
APValue MemberPointer::toAPValue(const ASTContext &ASTCtx) const {
if (isZero())
return APValue(static_cast<ValueDecl *>(nullptr), /*IsDerivedMember=*/false,
diff --git a/clang/lib/AST/ByteCode/MemberPointer.h b/clang/lib/AST/ByteCode/MemberPointer.h
index a9b95471038e0..7bef2accd39e5 100644
--- a/clang/lib/AST/ByteCode/MemberPointer.h
+++ b/clang/lib/AST/ByteCode/MemberPointer.h
@@ -19,7 +19,6 @@ class CXXRecordDecl;
namespace interp {
class Context;
-class FunctionPointer;
class MemberPointer final {
private:
@@ -100,7 +99,6 @@ class MemberPointer final {
ComparisonCategoryResult compare(const MemberPointer &RHS) const;
std::optional<Pointer> toPointer(const Context &Ctx) const;
- FunctionPointer toFunctionPointer(const Context &Ctx) const;
bool isBaseCastPossible() const {
if (PtrOffset < 0)
diff --git a/clang/lib/AST/ByteCode/Pointer.cpp b/clang/lib/AST/ByteCode/Pointer.cpp
index 740a72ca241a8..689407e60c542 100644
--- a/clang/lib/AST/ByteCode/Pointer.cpp
+++ b/clang/lib/AST/ByteCode/Pointer.cpp
@@ -182,11 +182,10 @@ APValue Pointer::toAPValue(const ASTContext &ASTCtx) const {
/*IsOnePastEnd=*/false, /*IsNullPtr=*/false);
if (isFunctionPointer()) {
const FunctionPointer &FP = asFunctionPointer();
- if (const FunctionDecl *FD = FP.getFunction()->getDecl())
+ if (const FunctionDecl *FD = FP.Func->getDecl())
return APValue(FD, CharUnits::fromQuantity(Offset), {},
/*OnePastTheEnd=*/false, /*IsNull=*/false);
- return APValue(FP.getFunction()->getExpr(), CharUnits::fromQuantity(Offset),
- {},
+ return APValue(FP.Func->getExpr(), CharUnits::fromQuantity(Offset), {},
/*OnePastTheEnd=*/false, /*IsNull=*/false);
}
@@ -352,8 +351,7 @@ void Pointer::print(llvm::raw_ostream &OS) const {
OS << "}";
break;
case Storage::Fn:
- OS << "(Fn) { " << asFunctionPointer().getFunction() << " + " << Offset
- << " }";
+ OS << "(Fn) { " << Fn.Func << " + " << Offset << " }";
break;
case Storage::Typeid:
OS << "(Typeid) { " << (const void *)asTypeidPointer().TypePtr << ", "
@@ -376,7 +374,7 @@ size_t Pointer::computeOffsetForComparison(const ASTContext &ASTCtx) const {
// See below.
break;
case Storage::Fn:
- return Fn.getIntegerRepresentation() + Offset;
+ return getIntegerRepresentation();
case Storage::Typeid:
return reinterpret_cast<uintptr_t>(asTypeidPointer().TypePtr) + Offset;
}
@@ -438,9 +436,6 @@ std::string Pointer::toDiagnosticString(const ASTContext &Ctx) const {
if (isIntegralPointer())
return (Twine("&(") + Twine(asIntPointer().Value + Offset) + ")").str();
- if (isFunctionPointer())
- return asFunctionPointer().toDiagnosticString(Ctx);
-
return toAPValue(Ctx).getAsString(Ctx, getType());
}
diff --git a/clang/lib/AST/ByteCode/Pointer.h b/clang/lib/AST/ByteCode/Pointer.h
index 010e917de81b2..ea9c7d4cb04db 100644
--- a/clang/lib/AST/ByteCode/Pointer.h
+++ b/clang/lib/AST/ByteCode/Pointer.h
@@ -14,7 +14,7 @@
#define LLVM_CLANG_AST_INTERP_POINTER_H
#include "Descriptor.h"
-#include "FunctionPointer.h"
+#include "Function.h"
#include "InitMap.h"
#include "InterpBlock.h"
#include "clang/AST/ComparisonCategories.h"
@@ -53,6 +53,10 @@ struct IntPointer {
IntPointer baseCast(const ASTContext &ASTCtx, unsigned BaseOffset) const;
};
+struct FunctionPointer {
+ const Function *Func;
+};
+
struct TypeidPointer {
const Type *TypePtr;
const Type *TypeInfoType;
@@ -106,7 +110,7 @@ class Pointer {
Pointer(uint64_t Address, const Descriptor *Desc, uint64_t Offset = 0)
: Offset(Offset), StorageKind(Storage::Int), Int{Desc, Address} {}
Pointer(const Function *F, uint64_t Offset = 0)
- : Offset(Offset), StorageKind(Storage::Fn), Fn(F) {}
+ : Offset(Offset), StorageKind(Storage::Fn), Fn{F} {}
Pointer(const Type *TypePtr, const Type *TypeInfoType, uint64_t Offset = 0)
: Offset(Offset), StorageKind(Storage::Typeid) {
Typeid.TypePtr = TypePtr;
@@ -127,7 +131,7 @@ class Pointer {
P.Offset == Offset;
if (isFunctionPointer())
- return P.Fn.getFunction() == Fn.getFunction() && P.Offset == Offset;
+ return P.Fn.Func == Fn.Func && P.Offset == Offset;
assert(isBlockPointer());
return P.BS.Pointee == BS.Pointee && P.BS.Base == BS.Base &&
@@ -146,7 +150,7 @@ class Pointer {
if (isIntegralPointer())
return Int.Value + (Offset * elemSize());
if (isFunctionPointer())
- return Fn.getIntegerRepresentation() + Offset;
+ return reinterpret_cast<uint64_t>(Fn.Func) + Offset;
return reinterpret_cast<uint64_t>(BS.Pointee) + Offset;
}
@@ -159,7 +163,7 @@ class Pointer {
if (isIntegralPointer())
return Pointer(Int.Value, Int.Desc, Idx);
if (isFunctionPointer())
- return Pointer(Fn.getFunction(), Idx);
+ return Pointer(Fn.Func, Idx);
if (BS.Base == RootPtrMark)
return Pointer(BS.Pointee, RootPtrMark, getDeclDesc()->getSize());
@@ -264,7 +268,7 @@ class Pointer {
case Storage::Block:
return BS.Pointee == nullptr;
case Storage::Fn:
- return Fn.isZero();
+ return !Fn.Func;
case Storage::Typeid:
return false;
}
@@ -302,7 +306,7 @@ class Pointer {
if (isBlockPointer())
return getDeclDesc()->getSource();
if (isFunctionPointer()) {
- const Function *F = Fn.getFunction();
+ const Function *F = Fn.Func;
return F ? F->getDecl() : DeclTy();
}
assert(isIntegralPointer());
@@ -343,7 +347,7 @@ class Pointer {
if (isTypeidPointer())
return QualType(Typeid.TypeInfoType, 0);
if (isFunctionPointer())
- return Fn.getFunction()->getDecl()->getType();
+ return Fn.Func->getDecl()->getType();
if (inPrimitiveArray() && Offset != BS.Base) {
// Unfortunately, complex and vector types are not array types in clang,
@@ -531,8 +535,12 @@ class Pointer {
}
bool isWeak() const {
- if (isFunctionPointer())
- return Fn.isWeak();
+ if (isFunctionPointer()) {
+ if (!Fn.Func || !Fn.Func->getDecl())
+ return false;
+
+ return Fn.Func->getDecl()->isWeak();
+ }
if (!isBlockPointer())
return false;
diff --git a/clang/lib/AST/ByteCode/PrimType.h b/clang/lib/AST/ByteCode/PrimType.h
index e29d6650a1495..2fa553b7b4a47 100644
--- a/clang/lib/AST/ByteCode/PrimType.h
+++ b/clang/lib/AST/ByteCode/PrimType.h
@@ -24,7 +24,6 @@ namespace interp {
class Pointer;
class Boolean;
class Floating;
-class FunctionPointer;
class MemberPointer;
class FixedPoint;
template <bool Signed> class IntegralAP;
diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt
index 0e0a0c94d2ac5..a2eac60844ef8 100644
--- a/clang/lib/AST/CMakeLists.txt
+++ b/clang/lib/AST/CMakeLists.txt
@@ -76,7 +76,6 @@ add_clang_library(clangAST
ByteCode/Disasm.cpp
ByteCode/EvalEmitter.cpp
ByteCode/Function.cpp
- ByteCode/FunctionPointer.cpp
ByteCode/InterpBuiltin.cpp
ByteCode/InterpBuiltinBitCast.cpp
ByteCode/Floating.cpp
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 6f88a428b1230..cb941c94c84a7 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -451,6 +451,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasCF = true;
} else if (Feature == "+zu") {
HasZU = true;
+ } else if (Feature == "+jmpabs") {
+ HasJMPABS = true;
} else if (Feature == "+branch-hint") {
HasBranchHint = true;
}
@@ -975,7 +977,9 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__CF__");
if (HasZU)
Builder.defineMacro("__ZU__");
- if (HasEGPR && HasNDD && HasCCMP && HasNF && HasZU)
+ if (HasJMPABS)
+ Builder.defineMacro("__JMPABS__");
+ if (HasEGPR && HasNDD && HasCCMP && HasNF && HasZU && HasJMPABS)
if (getTriple().isOSWindows() || (HasPush2Pop2 && HasPPX))
Builder.defineMacro("__APX_F__");
if (HasEGPR && HasInlineAsmUseGPR32)
@@ -1177,6 +1181,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("nf", true)
.Case("cf", true)
.Case("zu", true)
+ .Case("jmpabs", true)
.Default(false);
}
@@ -1300,6 +1305,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("nf", HasNF)
.Case("cf", HasCF)
.Case("zu", HasZU)
+ .Case("jmpabs", HasJMPABS)
.Case("branch-hint", HasBranchHint)
.Default(false);
}
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index f99bbf363458f..20090757c10c7 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -177,6 +177,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasNF = false;
bool HasCF = false;
bool HasZU = false;
+ bool HasJMPABS = false;
bool HasInlineAsmUseGPR32 = false;
bool HasBranchHint = false;
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index 7cd1bdcf491be..8bfdbebb2c51f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -441,8 +441,7 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
// Source is a unsigned integer: first cast it to signed.
if (intTy.isUnsigned())
value = createIntCast(value, getSIntNTy(intTy.getWidth()));
- return cir::UnaryOp::create(*this, value.getLoc(), value.getType(),
- cir::UnaryOpKind::Minus, value);
+ return createMinus(value.getLoc(), value);
}
llvm_unreachable("negation for the given type is NYI");
@@ -456,8 +455,7 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
assert(!cir::MissingFeatures::fpConstraints());
assert(!cir::MissingFeatures::fastMathFlags());
- return cir::UnaryOp::create(*this, value.getLoc(), value.getType(),
- cir::UnaryOpKind::Minus, value);
+ return createMinus(value.getLoc(), value);
}
//===--------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 5d9ddbe5c3f22..f155365ab5de2 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -191,8 +191,7 @@ static RValue emitBinaryAtomicPost(CIRGenFunction &cgf,
result = BinOp::create(builder, result.getLoc(), result, emittedArgValue);
if (invert)
- result = cir::UnaryOp::create(builder, result.getLoc(),
- cir::UnaryOpKind::Not, result);
+ result = builder.createNot(result);
result = emitFromInt(cgf, result, typ, originalArgType);
return RValue::get(result);
@@ -1094,8 +1093,7 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
case Builtin::BIconjf:
case Builtin::BIconjl: {
mlir::Value complex = emitComplexExpr(e->getArg(0));
- mlir::Value conj = builder.createUnaryOp(getLoc(e->getExprLoc()),
- cir::UnaryOpKind::Not, complex);
+ mlir::Value conj = builder.createNot(complex);
return RValue::getComplex(conj);
}
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
index 5534e69b5f8bc..5d7b8d839fa84 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinAArch64.cpp
@@ -1390,6 +1390,8 @@ static mlir::Value emitCommonNeonSISDBuiltinExpr(
break;
case NEON::BI__builtin_neon_vabdd_f64:
case NEON::BI__builtin_neon_vabds_f32:
+ case NEON::BI__builtin_neon_vshld_s64:
+ case NEON::BI__builtin_neon_vshld_u64:
return emitNeonCall(cgf.cgm, cgf.getBuilder(),
{cgf.convertType(expr->getArg(0)->getType())}, ops,
llvmIntrName, cgf.convertType(expr->getType()), loc);
@@ -2785,8 +2787,18 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
case NEON::BI__builtin_neon_vrshrd_n_s64:
case NEON::BI__builtin_neon_vrsrad_n_u64:
case NEON::BI__builtin_neon_vrsrad_n_s64:
+ cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented AArch64 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ return mlir::Value{};
case NEON::BI__builtin_neon_vshld_n_s64:
- case NEON::BI__builtin_neon_vshld_n_u64:
+ case NEON::BI__builtin_neon_vshld_n_u64: {
+ auto loc = getLoc(expr->getExprLoc());
+ std::optional<llvm::APSInt> amt =
+ expr->getArg(1)->getIntegerConstantExpr(getContext());
+ assert(amt && "Expected argument to be a constant");
+ return builder.createShiftLeft(loc, ops[0], amt->getZExtValue());
+ }
case NEON::BI__builtin_neon_vshrd_n_s64:
case NEON::BI__builtin_neon_vshrd_n_u64:
case NEON::BI__builtin_neon_vsrad_n_s64:
@@ -2802,10 +2814,22 @@ CIRGenFunction::emitAArch64BuiltinExpr(unsigned builtinID, const CallExpr *expr,
case NEON::BI__builtin_neon_vqdmlsls_lane_s32:
case NEON::BI__builtin_neon_vqdmlsls_laneq_s32:
case NEON::BI__builtin_neon_vget_lane_bf16:
- case NEON::BI__builtin_neon_vduph_lane_bf16:
+ cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented AArch64 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ return mlir::Value{};
+ case NEON::BI__builtin_neon_vduph_lane_bf16: {
+ return cir::VecExtractOp::create(builder, loc, ops[0], ops[1]);
+ }
case NEON::BI__builtin_neon_vduph_lane_f16:
case NEON::BI__builtin_neon_vgetq_lane_bf16:
- case NEON::BI__builtin_neon_vduph_laneq_bf16:
+ cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented AArch64 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ return mlir::Value{};
+ case NEON::BI__builtin_neon_vduph_laneq_bf16: {
+ return cir::VecExtractOp::create(builder, loc, ops[0], ops[1]);
+ }
case NEON::BI__builtin_neon_vduph_laneq_f16:
case NEON::BI__builtin_neon_vcvt_bf16_f32:
case NEON::BI__builtin_neon_vcvtq_low_bf16_f32:
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 1f08ba773dfb5..5328bb0a812a5 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -218,7 +218,7 @@ Address CIRGenFunction::emitPointerWithAlignment(const Expr *expr,
// Unary &
if (const UnaryOperator *uo = dyn_cast<UnaryOperator>(expr)) {
- // TODO(cir): maybe we should use cir.unary for pointers here instead.
+ // TODO(cir): maybe we should use a CIR unary op for pointers here instead.
if (uo->getOpcode() == UO_AddrOf) {
LValue lv = emitLValue(uo->getSubExpr());
if (baseInfo)
@@ -1063,17 +1063,14 @@ LValue CIRGenFunction::emitUnaryOpLValue(const UnaryOperator *e) {
}
case UO_PreInc:
case UO_PreDec: {
- cir::UnaryOpKind kind =
- e->isIncrementOp() ? cir::UnaryOpKind::Inc : cir::UnaryOpKind::Dec;
LValue lv = emitLValue(e->getSubExpr());
assert(e->isPrefix() && "Prefix operator in unexpected state!");
- if (e->getType()->isAnyComplexType()) {
- emitComplexPrePostIncDec(e, lv, kind, /*isPre=*/true);
- } else {
- emitScalarPrePostIncDec(e, lv, kind, /*isPre=*/true);
- }
+ if (e->getType()->isAnyComplexType())
+ emitComplexPrePostIncDec(e, lv);
+ else
+ emitScalarPrePostIncDec(e, lv);
return lv;
}
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
index 00d69cdf908f9..0829aa56a5cd4 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp
@@ -173,29 +173,28 @@ class ComplexExprEmitter : public StmtVisitor<ComplexExprEmitter, mlir::Value> {
mlir::Value VisitStmtExpr(const StmtExpr *e);
// Operators.
- mlir::Value VisitPrePostIncDec(const UnaryOperator *e, cir::UnaryOpKind op,
- bool isPre) {
+ mlir::Value VisitPrePostIncDec(const UnaryOperator *e) {
LValue lv = cgf.emitLValue(e->getSubExpr());
- return cgf.emitComplexPrePostIncDec(e, lv, op, isPre);
+ return cgf.emitComplexPrePostIncDec(e, lv);
}
mlir::Value VisitUnaryPostDec(const UnaryOperator *e) {
- return VisitPrePostIncDec(e, cir::UnaryOpKind::Dec, false);
+ return VisitPrePostIncDec(e);
}
mlir::Value VisitUnaryPostInc(const UnaryOperator *e) {
- return VisitPrePostIncDec(e, cir::UnaryOpKind::Inc, false);
+ return VisitPrePostIncDec(e);
}
mlir::Value VisitUnaryPreDec(const UnaryOperator *e) {
- return VisitPrePostIncDec(e, cir::UnaryOpKind::Dec, true);
+ return VisitPrePostIncDec(e);
}
mlir::Value VisitUnaryPreInc(const UnaryOperator *e) {
- return VisitPrePostIncDec(e, cir::UnaryOpKind::Inc, true);
+ return VisitPrePostIncDec(e);
}
mlir::Value VisitUnaryDeref(const Expr *e) { return emitLoadOfLValue(e); }
mlir::Value VisitUnaryPlus(const UnaryOperator *e);
+ mlir::Value VisitUnaryPlus(const UnaryOperator *e, QualType promotionType);
mlir::Value VisitUnaryMinus(const UnaryOperator *e);
- mlir::Value VisitPlusMinus(const UnaryOperator *e, cir::UnaryOpKind kind,
- QualType promotionType);
+ mlir::Value VisitUnaryMinus(const UnaryOperator *e, QualType promotionType);
mlir::Value VisitUnaryNot(const UnaryOperator *e);
// LNot,Real,Imag never return complex.
mlir::Value VisitUnaryExtension(const UnaryOperator *e) {
@@ -573,32 +572,35 @@ mlir::Value ComplexExprEmitter::emitCast(CastKind ck, Expr *op,
mlir::Value ComplexExprEmitter::VisitUnaryPlus(const UnaryOperator *e) {
QualType promotionTy = getPromotionType(e->getSubExpr()->getType());
- mlir::Value result = VisitPlusMinus(e, cir::UnaryOpKind::Plus, promotionTy);
+ mlir::Value result = VisitUnaryPlus(e, promotionTy);
if (!promotionTy.isNull())
return cgf.emitUnPromotedValue(result, e->getSubExpr()->getType());
return result;
}
+mlir::Value ComplexExprEmitter::VisitUnaryPlus(const UnaryOperator *e,
+ QualType promotionType) {
+ if (!promotionType.isNull())
+ return cgf.emitPromotedComplexExpr(e->getSubExpr(), promotionType);
+ return Visit(e->getSubExpr());
+}
+
mlir::Value ComplexExprEmitter::VisitUnaryMinus(const UnaryOperator *e) {
QualType promotionTy = getPromotionType(e->getSubExpr()->getType());
- mlir::Value result = VisitPlusMinus(e, cir::UnaryOpKind::Minus, promotionTy);
+ mlir::Value result = VisitUnaryMinus(e, promotionTy);
if (!promotionTy.isNull())
return cgf.emitUnPromotedValue(result, e->getSubExpr()->getType());
return result;
}
-mlir::Value ComplexExprEmitter::VisitPlusMinus(const UnaryOperator *e,
- cir::UnaryOpKind kind,
- QualType promotionType) {
- assert((kind == cir::UnaryOpKind::Plus || kind == cir::UnaryOpKind::Minus) &&
- "Invalid UnaryOp kind for ComplexType Plus or Minus");
-
+mlir::Value ComplexExprEmitter::VisitUnaryMinus(const UnaryOperator *e,
+ QualType promotionType) {
mlir::Value op;
if (!promotionType.isNull())
op = cgf.emitPromotedComplexExpr(e->getSubExpr(), promotionType);
else
op = Visit(e->getSubExpr());
- return builder.createUnaryOp(cgf.getLoc(e->getExprLoc()), kind, op);
+ return builder.createMinus(cgf.getLoc(e->getExprLoc()), op);
}
mlir::Value ComplexExprEmitter::VisitUnaryNot(const UnaryOperator *e) {
@@ -765,12 +767,10 @@ mlir::Value ComplexExprEmitter::emitPromoted(const Expr *e,
}
} else if (const auto *unaryOp = dyn_cast<UnaryOperator>(e)) {
switch (unaryOp->getOpcode()) {
+ case UO_Plus:
+ return VisitUnaryPlus(unaryOp, promotionTy);
case UO_Minus:
- case UO_Plus: {
- auto kind = unaryOp->getOpcode() == UO_Plus ? cir::UnaryOpKind::Plus
- : cir::UnaryOpKind::Minus;
- return VisitPlusMinus(unaryOp, kind, promotionTy);
- }
+ return VisitUnaryMinus(unaryOp, promotionTy);
default:
break;
}
@@ -1092,15 +1092,11 @@ LValue CIRGenFunction::emitComplexCompoundAssignmentLValue(
}
mlir::Value CIRGenFunction::emitComplexPrePostIncDec(const UnaryOperator *e,
- LValue lv,
- cir::UnaryOpKind op,
- bool isPre) {
- assert((op == cir::UnaryOpKind::Inc || op == cir::UnaryOpKind::Dec) &&
- "Invalid UnaryOp kind for ComplexType");
-
+ LValue lv) {
mlir::Value inVal = emitLoadOfComplex(lv, e->getExprLoc());
mlir::Location loc = getLoc(e->getExprLoc());
- mlir::Value incVal = builder.createUnaryOp(loc, op, inVal);
+ mlir::Value incVal = e->isIncrementOp() ? builder.createInc(loc, inVal)
+ : builder.createDec(loc, inVal);
// Store the updated result through the lvalue.
emitStoreOfComplex(loc, incVal, lv, /*isInit=*/false);
@@ -1110,7 +1106,7 @@ mlir::Value CIRGenFunction::emitComplexPrePostIncDec(const UnaryOperator *e,
// If this is a postinc, return the value read from memory, otherwise use the
// updated value.
- return isPre ? incVal : inVal;
+ return e->isPrefix() ? incVal : inVal;
}
LValue CIRGenFunction::emitScalarCompoundAssignWithComplex(
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 4a2973d3824ee..bef487d95156f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -586,24 +586,23 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
VisitAbstractConditionalOperator(const AbstractConditionalOperator *e);
// Unary Operators.
- mlir::Value VisitUnaryPostDec(const UnaryOperator *e) {
+ mlir::Value VisitUnaryPrePostIncDec(const UnaryOperator *e) {
LValue lv = cgf.emitLValue(e->getSubExpr());
- return emitScalarPrePostIncDec(e, lv, cir::UnaryOpKind::Dec, false);
+ return emitScalarPrePostIncDec(e, lv);
+ }
+ mlir::Value VisitUnaryPostDec(const UnaryOperator *e) {
+ return VisitUnaryPrePostIncDec(e);
}
mlir::Value VisitUnaryPostInc(const UnaryOperator *e) {
- LValue lv = cgf.emitLValue(e->getSubExpr());
- return emitScalarPrePostIncDec(e, lv, cir::UnaryOpKind::Inc, false);
+ return VisitUnaryPrePostIncDec(e);
}
mlir::Value VisitUnaryPreDec(const UnaryOperator *e) {
- LValue lv = cgf.emitLValue(e->getSubExpr());
- return emitScalarPrePostIncDec(e, lv, cir::UnaryOpKind::Dec, true);
+ return VisitUnaryPrePostIncDec(e);
}
mlir::Value VisitUnaryPreInc(const UnaryOperator *e) {
- LValue lv = cgf.emitLValue(e->getSubExpr());
- return emitScalarPrePostIncDec(e, lv, cir::UnaryOpKind::Inc, true);
+ return VisitUnaryPrePostIncDec(e);
}
- mlir::Value emitScalarPrePostIncDec(const UnaryOperator *e, LValue lv,
- cir::UnaryOpKind kind, bool isPre) {
+ mlir::Value emitScalarPrePostIncDec(const UnaryOperator *e, LValue lv) {
if (cgf.getLangOpts().OpenMP)
cgf.cgm.errorNYI(e->getSourceRange(), "inc/dec OpenMP");
@@ -632,7 +631,7 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
// -> bool = ((int)bool + 1 != 0)
// An interesting aspect of this is that increment is always true.
// Decrement does not have this property.
- if (kind == cir::UnaryOpKind::Inc && type->isBooleanType()) {
+ if (e->isIncrementOp() && type->isBooleanType()) {
value = builder.getTrue(cgf.getLoc(e->getExprLoc()));
} else if (type->isIntegerType()) {
QualType promotedType;
@@ -663,12 +662,10 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
assert(!cir::MissingFeatures::sanitizers());
if (e->canOverflow() && type->isSignedIntegerOrEnumerationType()) {
- value = emitIncDecConsiderOverflowBehavior(e, value, kind);
+ value = emitIncDecConsiderOverflowBehavior(e, value);
} else {
- cir::UnaryOpKind kind =
- e->isIncrementOp() ? cir::UnaryOpKind::Inc : cir::UnaryOpKind::Dec;
// NOTE(CIR): clang calls CreateAdd but folds this to a unary op
- value = emitUnaryOp(e, kind, input, /*nsw=*/false);
+ value = emitIncOrDec(e, input, /*nsw=*/false);
}
} else if (const PointerType *ptr = type->getAs<PointerType>()) {
QualType type = ptr->getPointeeType();
@@ -680,7 +677,7 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
// For everything else, we can just do a simple increment.
mlir::Location loc = cgf.getLoc(e->getSourceRange());
CIRGenBuilderTy &builder = cgf.getBuilder();
- int amount = kind == cir::UnaryOpKind::Inc ? 1 : -1;
+ int amount = e->isIncrementOp() ? 1 : -1;
mlir::Value amt = builder.getSInt32(amount, loc);
assert(!cir::MissingFeatures::sanitizers());
value = builder.createPtrStride(loc, value, amt);
@@ -700,10 +697,7 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
if (mlir::isa<cir::SingleType, cir::DoubleType>(value.getType())) {
// Create the inc/dec operation.
// NOTE(CIR): clang calls CreateAdd but folds this to a unary op
- assert(
- (kind == cir::UnaryOpKind::Inc || kind == cir::UnaryOpKind::Dec) &&
- "Invalid UnaryOp kind");
- value = emitUnaryOp(e, kind, value);
+ value = emitIncOrDec(e, value);
} else {
cgf.cgm.errorNYI(e->getSourceRange(), "Unary inc/dec other fp type");
return {};
@@ -728,23 +722,20 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
// If this is a postinc, return the value read from memory, otherwise use
// the updated value.
- return isPre ? value : input;
+ return e->isPrefix() ? value : input;
}
mlir::Value emitIncDecConsiderOverflowBehavior(const UnaryOperator *e,
- mlir::Value inVal,
- cir::UnaryOpKind kind) {
- assert((kind == cir::UnaryOpKind::Inc || kind == cir::UnaryOpKind::Dec) &&
- "Invalid UnaryOp kind");
+ mlir::Value inVal) {
switch (cgf.getLangOpts().getSignedOverflowBehavior()) {
case LangOptions::SOB_Defined:
- return emitUnaryOp(e, kind, inVal, /*nsw=*/false);
+ return emitIncOrDec(e, inVal, /*nsw=*/false);
case LangOptions::SOB_Undefined:
assert(!cir::MissingFeatures::sanitizers());
- return emitUnaryOp(e, kind, inVal, /*nsw=*/true);
+ return emitIncOrDec(e, inVal, /*nsw=*/true);
case LangOptions::SOB_Trapping:
if (!e->canOverflow())
- return emitUnaryOp(e, kind, inVal, /*nsw=*/true);
+ return emitIncOrDec(e, inVal, /*nsw=*/true);
cgf.cgm.errorNYI(e->getSourceRange(), "inc/def overflow SOB_Trapping");
return {};
}
@@ -766,25 +757,28 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
mlir::Value VisitUnaryPlus(const UnaryOperator *e) {
QualType promotionType = getPromotionType(e->getSubExpr()->getType());
- mlir::Value result =
- emitUnaryPlusOrMinus(e, cir::UnaryOpKind::Plus, promotionType);
+ mlir::Value result = VisitUnaryPlus(e, promotionType);
if (result && !promotionType.isNull())
return emitUnPromotedValue(result, e->getType());
return result;
}
+ mlir::Value VisitUnaryPlus(const UnaryOperator *e, QualType promotionType) {
+ ignoreResultAssign = false;
+ if (!promotionType.isNull())
+ return cgf.emitPromotedScalarExpr(e->getSubExpr(), promotionType);
+ return Visit(e->getSubExpr());
+ }
+
mlir::Value VisitUnaryMinus(const UnaryOperator *e) {
QualType promotionType = getPromotionType(e->getSubExpr()->getType());
- mlir::Value result =
- emitUnaryPlusOrMinus(e, cir::UnaryOpKind::Minus, promotionType);
+ mlir::Value result = VisitUnaryMinus(e, promotionType);
if (result && !promotionType.isNull())
return emitUnPromotedValue(result, e->getType());
return result;
}
- mlir::Value emitUnaryPlusOrMinus(const UnaryOperator *e,
- cir::UnaryOpKind kind,
- QualType promotionType) {
+ mlir::Value VisitUnaryMinus(const UnaryOperator *e, QualType promotionType) {
ignoreResultAssign = false;
mlir::Value operand;
if (!promotionType.isNull())
@@ -795,27 +789,29 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
// TODO(cir): We might have to change this to support overflow trapping.
// Classic codegen routes unary minus through emitSub to ensure
// that the overflow behavior is handled correctly.
- bool nsw = kind == cir::UnaryOpKind::Minus &&
- e->getType()->isSignedIntegerType() &&
+ bool nsw = e->getType()->isSignedIntegerType() &&
cgf.getLangOpts().getSignedOverflowBehavior() !=
LangOptions::SOB_Defined;
// NOTE: LLVM codegen will lower this directly to either a FNeg
// or a Sub instruction. In CIR this will be handled later in LowerToLLVM.
- return emitUnaryOp(e, kind, operand, nsw);
+ return builder.createOrFold<cir::MinusOp>(
+ cgf.getLoc(e->getSourceRange().getBegin()), operand, nsw);
}
- mlir::Value emitUnaryOp(const UnaryOperator *e, cir::UnaryOpKind kind,
- mlir::Value input, bool nsw = false) {
- return builder.createOrFold<cir::UnaryOp>(
- cgf.getLoc(e->getSourceRange().getBegin()), input.getType(), kind,
- input, nsw);
+ mlir::Value emitIncOrDec(const UnaryOperator *e, mlir::Value input,
+ bool nsw = false) {
+ mlir::Location loc = cgf.getLoc(e->getSourceRange().getBegin());
+ return e->isIncrementOp()
+ ? builder.createOrFold<cir::IncOp>(loc, input, nsw)
+ : builder.createOrFold<cir::DecOp>(loc, input, nsw);
}
mlir::Value VisitUnaryNot(const UnaryOperator *e) {
ignoreResultAssign = false;
mlir::Value op = Visit(e->getSubExpr());
- return emitUnaryOp(e, cir::UnaryOpKind::Not, op);
+ return builder.createOrFold<cir::NotOp>(
+ cgf.getLoc(e->getSourceRange().getBegin()), op);
}
mlir::Value VisitUnaryLNot(const UnaryOperator *e);
@@ -1566,9 +1562,9 @@ mlir::Value ScalarExprEmitter::emitPromoted(const Expr *e,
case UO_Real:
return VisitRealImag(uo, promotionType);
case UO_Minus:
- return emitUnaryPlusOrMinus(uo, cir::UnaryOpKind::Minus, promotionType);
+ return VisitUnaryMinus(uo, promotionType);
case UO_Plus:
- return emitUnaryPlusOrMinus(uo, cir::UnaryOpKind::Plus, promotionType);
+ return VisitUnaryPlus(uo, promotionType);
default:
break;
}
@@ -2819,9 +2815,6 @@ mlir::Value ScalarExprEmitter::VisitAbstractConditionalOperator(
}
mlir::Value CIRGenFunction::emitScalarPrePostIncDec(const UnaryOperator *e,
- LValue lv,
- cir::UnaryOpKind kind,
- bool isPre) {
- return ScalarExprEmitter(*this, builder)
- .emitScalarPrePostIncDec(e, lv, kind, isPre);
+ LValue lv) {
+ return ScalarExprEmitter(*this, builder).emitScalarPrePostIncDec(e, lv);
}
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index 1b3518616d6b7..a3eb000871013 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -1667,8 +1667,7 @@ class CIRGenFunction : public CIRGenTypeCache {
mlir::Value emitScalarExpr(const clang::Expr *e,
bool ignoreResultAssign = false);
- mlir::Value emitScalarPrePostIncDec(const UnaryOperator *e, LValue lv,
- cir::UnaryOpKind kind, bool isPre);
+ mlir::Value emitScalarPrePostIncDec(const UnaryOperator *e, LValue lv);
/// Build a debug stoppoint if we are emitting debug info.
void emitStopPoint(const Stmt *s);
@@ -1699,8 +1698,7 @@ class CIRGenFunction : public CIRGenTypeCache {
void emitComplexExprIntoLValue(const Expr *e, LValue dest, bool isInit);
- mlir::Value emitComplexPrePostIncDec(const UnaryOperator *e, LValue lv,
- cir::UnaryOpKind op, bool isPre);
+ mlir::Value emitComplexPrePostIncDec(const UnaryOperator *e, LValue lv);
LValue emitComplexAssignmentLValue(const BinaryOperator *e);
LValue emitComplexCompoundAssignmentLValue(const CompoundAssignOperator *e);
diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp
index 44f6bc845ec8f..eab045e186699 100644
--- a/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCRecipe.cpp
@@ -92,8 +92,7 @@ void OpenACCRecipeBuilderBase::makeAllocaCopy(mlir::Location loc,
[&](mlir::OpBuilder &b, mlir::Location loc) {
// Simple increment of the iterator.
auto load = cir::LoadOp::create(builder, loc, {itr});
- auto inc = cir::UnaryOp::create(builder, loc, load.getType(),
- cir::UnaryOpKind::Inc, load);
+ auto inc = builder.createInc(loc, load);
builder.CIRBaseBuilderTy::createStore(loc, inc, itr);
builder.createYield(loc);
});
@@ -308,9 +307,8 @@ std::pair<mlir::Value, mlir::Value> OpenACCRecipeBuilderBase::createBoundsLoop(
/*stepBuilder=*/
[&](mlir::OpBuilder &b, mlir::Location loc) {
auto load = cir::LoadOp::create(builder, loc, {itr});
- auto unary = cir::UnaryOp::create(
- builder, loc, load.getType(),
- inverse ? cir::UnaryOpKind::Dec : cir::UnaryOpKind::Inc, load);
+ auto unary = inverse ? builder.createDec(loc, load)
+ : builder.createInc(loc, load);
builder.CIRBaseBuilderTy::createStore(loc, unary, itr);
builder.createYield(loc);
});
@@ -654,8 +652,7 @@ void OpenACCRecipeBuilderBase::createReductionRecipeCombiner(
/*stepBuilder=*/
[&](mlir::OpBuilder &b, mlir::Location loc) {
auto loadItr = cir::LoadOp::create(builder, loc, {itr});
- auto inc = cir::UnaryOp::create(builder, loc, loadItr.getType(),
- cir::UnaryOpKind::Inc, loadItr);
+ auto inc = builder.createInc(loc, loadItr);
builder.CIRBaseBuilderTy::createStore(loc, inc, itr);
builder.createYield(loc);
}));
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index 8d2990af5de8c..0b63c2b7450fb 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -2664,113 +2664,74 @@ LogicalResult cir::LabelOp::verify() {
}
//===----------------------------------------------------------------------===//
-// UnaryOp
+// IncOp
//===----------------------------------------------------------------------===//
-LogicalResult cir::UnaryOp::verify() {
- switch (getKind()) {
- case cir::UnaryOpKind::Inc:
- case cir::UnaryOpKind::Dec:
- case cir::UnaryOpKind::Plus:
- case cir::UnaryOpKind::Minus:
- case cir::UnaryOpKind::Not:
- // Nothing to verify.
- return success();
- }
-
- llvm_unreachable("Unknown UnaryOp kind?");
+OpFoldResult cir::IncOp::fold(FoldAdaptor adaptor) {
+ if (mlir::isa_and_present<cir::PoisonAttr>(adaptor.getInput()))
+ return adaptor.getInput();
+ return {};
}
-static bool isBoolNot(cir::UnaryOp op) {
- return isa<cir::BoolType>(op.getInput().getType()) &&
- op.getKind() == cir::UnaryOpKind::Not;
+//===----------------------------------------------------------------------===//
+// DecOp
+//===----------------------------------------------------------------------===//
+
+OpFoldResult cir::DecOp::fold(FoldAdaptor adaptor) {
+ if (mlir::isa_and_present<cir::PoisonAttr>(adaptor.getInput()))
+ return adaptor.getInput();
+ return {};
}
-// This folder simplifies the sequential boolean not operations.
-// For instance, the next two unary operations will be eliminated:
-//
-// ```mlir
-// %1 = cir.unary(not, %0) : !cir.bool, !cir.bool
-// %2 = cir.unary(not, %1) : !cir.bool, !cir.bool
-// ```
-//
-// and the argument of the first one (%0) will be used instead.
-OpFoldResult cir::UnaryOp::fold(FoldAdaptor adaptor) {
- if (auto poison =
- mlir::dyn_cast_if_present<cir::PoisonAttr>(adaptor.getInput())) {
- // Propagate poison values
- return poison;
- }
-
- if (isBoolNot(*this))
- if (auto previous = getInput().getDefiningOp<cir::UnaryOp>())
- if (isBoolNot(previous))
- return previous.getInput();
-
- // Avoid introducing unnecessary duplicate constants in cases where we are
- // just folding the operation to its input value. If we return the
- // input attribute from the adapter, a new constant is materialized, but
- // if we return the input value directly, it avoids that.
- if (auto srcConst = getInput().getDefiningOp<cir::ConstantOp>()) {
- if (getKind() == cir::UnaryOpKind::Plus ||
- (mlir::isa<cir::BoolType>(srcConst.getType()) &&
- getKind() == cir::UnaryOpKind::Minus))
+//===----------------------------------------------------------------------===//
+// MinusOp
+//===----------------------------------------------------------------------===//
+
+OpFoldResult cir::MinusOp::fold(FoldAdaptor adaptor) {
+ if (mlir::isa_and_present<cir::PoisonAttr>(adaptor.getInput()))
+ return adaptor.getInput();
+
+ // Avoid materializing a duplicate constant for bool minus (identity).
+ if (auto srcConst = getInput().getDefiningOp<cir::ConstantOp>())
+ if (mlir::isa<cir::BoolType>(srcConst.getType()))
return srcConst.getResult();
+
+ // Fold with constant inputs.
+ if (mlir::Attribute attr = adaptor.getInput()) {
+ if (auto intAttr = mlir::dyn_cast<cir::IntAttr>(attr)) {
+ APInt val = intAttr.getValue();
+ val.negate();
+ return cir::IntAttr::get(getType(), val);
+ }
+ if (auto fpAttr = mlir::dyn_cast<cir::FPAttr>(attr)) {
+ APFloat val = fpAttr.getValue();
+ val.changeSign();
+ return cir::FPAttr::get(getType(), val);
+ }
}
- // Fold unary operations with constant inputs. If the input is a ConstantOp,
- // it "folds" to its value attribute. If it was some other operation that
- // was folded, it will be an mlir::Attribute that hasn't yet been
- // materialized. If it was a value that couldn't be folded, it will be null.
+ return {};
+}
+
+//===----------------------------------------------------------------------===//
+// NotOp
+//===----------------------------------------------------------------------===//
+
+OpFoldResult cir::NotOp::fold(FoldAdaptor adaptor) {
+ if (mlir::isa_and_present<cir::PoisonAttr>(adaptor.getInput()))
+ return adaptor.getInput();
+
+ // not(not(x)) -> x is handled by the Involution trait.
+
+ // Fold with constant inputs.
if (mlir::Attribute attr = adaptor.getInput()) {
- // For now, we only attempt to fold simple scalar values.
- OpFoldResult result =
- llvm::TypeSwitch<mlir::Attribute, OpFoldResult>(attr)
- .Case<cir::IntAttr>([&](cir::IntAttr attrT) {
- switch (getKind()) {
- case cir::UnaryOpKind::Not: {
- APInt val = attrT.getValue();
- val.flipAllBits();
- return cir::IntAttr::get(getType(), val);
- }
- case cir::UnaryOpKind::Plus:
- return attrT;
- case cir::UnaryOpKind::Minus: {
- APInt val = attrT.getValue();
- val.negate();
- return cir::IntAttr::get(getType(), val);
- }
- default:
- return cir::IntAttr{};
- }
- })
- .Case<cir::FPAttr>([&](cir::FPAttr attrT) {
- switch (getKind()) {
- case cir::UnaryOpKind::Plus:
- return attrT;
- case cir::UnaryOpKind::Minus: {
- APFloat val = attrT.getValue();
- val.changeSign();
- return cir::FPAttr::get(getType(), val);
- }
- default:
- return cir::FPAttr{};
- }
- })
- .Case<cir::BoolAttr>([&](cir::BoolAttr attrT) {
- switch (getKind()) {
- case cir::UnaryOpKind::Not:
- return cir::BoolAttr::get(getContext(), !attrT.getValue());
- case cir::UnaryOpKind::Plus:
- case cir::UnaryOpKind::Minus:
- return attrT;
- default:
- return cir::BoolAttr{};
- }
- })
- .Default([&](auto attrT) { return mlir::Attribute{}; });
- if (result)
- return result;
+ if (auto intAttr = mlir::dyn_cast<cir::IntAttr>(attr)) {
+ APInt val = intAttr.getValue();
+ val.flipAllBits();
+ return cir::IntAttr::get(getType(), val);
+ }
+ if (auto boolAttr = mlir::dyn_cast<cir::BoolAttr>(attr))
+ return cir::BoolAttr::get(getContext(), !boolAttr.getValue());
}
return {};
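
The per-op folders above preserve what the old UnaryOp::fold did: poison propagates
through all four ops, integer and floating-point constants are negated or bit-flipped
in place, and the double-negation simplification now comes from the Involution trait
on NotOp rather than the hand-written isBoolNot check. A rough sketch of the intended
behaviour (attribute and assembly syntax assumed for illustration, not taken from
this patch's tests):

    %1 = cir.not %0 : !cir.bool
    %2 = cir.not %1 : !cir.bool    // folds: uses of %2 are replaced by %0

    cir.minus on #cir.int<5>      -> #cir.int<-5>
    cir.not   on an integer const -> its bitwise complement
    cir.not   on a bool const     -> the inverted bool constant
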
diff --git a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
index e9b825606a04e..333517956fe9c 100644
--- a/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/CIRCanonicalize.cpp
@@ -70,7 +70,8 @@ void CIRCanonicalizePass::runOnOperation() {
// Many operations are here to perform a manual `fold` in
// applyOpPatternsGreedily.
- if (isa<BrOp, BrCondOp, CastOp, ScopeOp, SwitchOp, SelectOp, UnaryOp,
+ if (isa<BrOp, BrCondOp, CastOp, ScopeOp, SwitchOp, SelectOp, IncOp, DecOp,
+ MinusOp, NotOp, AddOp, MulOp, AndOp, OrOp, XorOp, MaxOp, MinOp,
ComplexCreateOp, ComplexImagOp, ComplexRealOp, VecCmpOp,
VecCreateOp, VecExtractOp, VecShuffleOp, VecShuffleDynamicOp,
VecTernaryOp, BitClrsbOp, BitClzOp, BitCtzOp, BitFfsOp, BitParityOp,
diff --git a/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp b/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp
index dcef9ddee1bb4..45cf41052089c 100644
--- a/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/CIRSimplify.cpp
@@ -125,7 +125,7 @@ struct SimplifyTernary final : public OpRewritePattern<TernaryOp> {
///
/// %0 = cir.select if %condition then false else true
/// ->
-/// %0 = cir.unary not %condition
+/// %0 = cir.not %condition
struct SimplifySelect : public OpRewritePattern<SelectOp> {
using OpRewritePattern<SelectOp>::OpRewritePattern;
@@ -148,10 +148,9 @@ struct SimplifySelect : public OpRewritePattern<SelectOp> {
return mlir::success();
}
- // cir.select if %0 then #false else #true -> cir.unary not %0
+ // cir.select if %0 then #false else #true -> cir.not %0
if (!trueValue.getValue() && falseValue.getValue()) {
- rewriter.replaceOpWithNewOp<cir::UnaryOp>(op, cir::UnaryOpKind::Not,
- op.getCondition());
+ rewriter.replaceOpWithNewOp<cir::NotOp>(op, op.getCondition());
return mlir::success();
}
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index 82bf8dbccba97..dfab0cd7f89c8 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -22,6 +22,7 @@
#include "clang/CIR/Dialect/Passes.h"
#include "clang/CIR/Interfaces/ASTAttrInterfaces.h"
#include "clang/CIR/MissingFeatures.h"
+#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Path.h"
#include <memory>
@@ -73,7 +74,7 @@ struct LoweringPreparePass
void lowerCastOp(cir::CastOp op);
void lowerComplexDivOp(cir::ComplexDivOp op);
void lowerComplexMulOp(cir::ComplexMulOp op);
- void lowerUnaryOp(cir::UnaryOp op);
+ void lowerUnaryOp(cir::UnaryOpInterface op);
void lowerGlobalOp(cir::GlobalOp op);
void lowerArrayDtor(cir::ArrayDtor op);
void lowerArrayCtor(cir::ArrayCtor op);
@@ -797,14 +798,11 @@ void LoweringPreparePass::lowerComplexMulOp(cir::ComplexMulOp op) {
op.erase();
}
-void LoweringPreparePass::lowerUnaryOp(cir::UnaryOp op) {
- mlir::Type ty = op.getType();
- if (!mlir::isa<cir::ComplexType>(ty))
+void LoweringPreparePass::lowerUnaryOp(cir::UnaryOpInterface op) {
+ if (!mlir::isa<cir::ComplexType>(op.getResult().getType()))
return;
- mlir::Location loc = op.getLoc();
- cir::UnaryOpKind opKind = op.getKind();
-
+ mlir::Location loc = op->getLoc();
CIRBaseBuilderTy builder(getContext());
builder.setInsertionPointAfter(op);
@@ -812,32 +810,25 @@ void LoweringPreparePass::lowerUnaryOp(cir::UnaryOp op) {
mlir::Value operandReal = builder.createComplexReal(loc, operand);
mlir::Value operandImag = builder.createComplexImag(loc, operand);
- mlir::Value resultReal;
- mlir::Value resultImag;
-
- switch (opKind) {
- case cir::UnaryOpKind::Inc:
- case cir::UnaryOpKind::Dec:
- resultReal = builder.createUnaryOp(loc, opKind, operandReal);
- resultImag = operandImag;
- break;
-
- case cir::UnaryOpKind::Plus:
- case cir::UnaryOpKind::Minus:
- resultReal = builder.createUnaryOp(loc, opKind, operandReal);
- resultImag = builder.createUnaryOp(loc, opKind, operandImag);
- break;
-
- case cir::UnaryOpKind::Not:
- resultReal = operandReal;
- resultImag =
- builder.createUnaryOp(loc, cir::UnaryOpKind::Minus, operandImag);
- break;
- }
+ mlir::Value resultReal = operandReal;
+ mlir::Value resultImag = operandImag;
+
+ llvm::TypeSwitch<mlir::Operation *>(op)
+ .Case<cir::IncOp>(
+ [&](auto) { resultReal = builder.createInc(loc, operandReal); })
+ .Case<cir::DecOp>(
+ [&](auto) { resultReal = builder.createDec(loc, operandReal); })
+ .Case<cir::MinusOp>([&](auto) {
+ resultReal = builder.createMinus(loc, operandReal);
+ resultImag = builder.createMinus(loc, operandImag);
+ })
+ .Case<cir::NotOp>(
+ [&](auto) { resultImag = builder.createMinus(loc, operandImag); })
+ .Default([](auto) { llvm_unreachable("unhandled unary complex op"); });
mlir::Value result = builder.createComplexCreate(loc, resultReal, resultImag);
- op.replaceAllUsesWith(result);
- op.erase();
+ op->replaceAllUsesWith(mlir::ValueRange{result});
+ op->erase();
}
cir::FuncOp LoweringPreparePass::getOrCreateDtorFunc(CIRBaseBuilderTy &builder,
@@ -1585,8 +1576,8 @@ void LoweringPreparePass::runOnOp(mlir::Operation *op) {
!globalOp.getCtorRegion().empty())
handleStaticLocal(globalOp, getGlobal);
}
- } else if (auto unary = mlir::dyn_cast<cir::UnaryOp>(op)) {
- lowerUnaryOp(unary);
+ } else if (auto unaryOp = mlir::dyn_cast<cir::UnaryOpInterface>(op)) {
+ lowerUnaryOp(unaryOp);
} else if (auto callOp = dyn_cast<cir::CallOp>(op)) {
lowerTrivialCopyCall(callOp);
} else if (auto storeOp = dyn_cast<cir::StoreOp>(op)) {
@@ -1610,7 +1601,8 @@ void LoweringPreparePass::runOnOperation() {
if (mlir::isa<cir::ArrayCtor, cir::ArrayDtor, cir::CastOp,
cir::ComplexMulOp, cir::ComplexDivOp, cir::DynamicCastOp,
cir::FuncOp, cir::CallOp, cir::GetGlobalOp, cir::GlobalOp,
- cir::StoreOp, cir::UnaryOp>(op))
+ cir::StoreOp, cir::IncOp, cir::DecOp, cir::MinusOp,
+ cir::NotOp>(op))
opsToTransform.push_back(op);
});
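
For complex-typed operands, the TypeSwitch above expands each op componentwise before
any further lowering. In scalar terms, for z = a + bi this amounts to (summarizing the
code above, not adding new behaviour):

    ++z  -> (a + 1) + bi      // inc/dec only touch the real part
    -z   -> (-a) + (-b)i      // minus negates both parts
    ~z   ->   a  + (-b)i      // complex conjugate (GNU ~ extension on _Complex)
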
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 9c68248d5dede..4a1b4292b23dd 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -2731,140 +2731,106 @@ mlir::LogicalResult CIRToLLVMSwitchFlatOpLowering::matchAndRewrite(
return mlir::success();
}
-mlir::LogicalResult CIRToLLVMUnaryOpLowering::matchAndRewrite(
- cir::UnaryOp op, OpAdaptor adaptor,
- mlir::ConversionPatternRewriter &rewriter) const {
- assert(op.getType() == op.getInput().getType() &&
- "Unary operation's operand type and result type are different");
- mlir::Type type = op.getType();
- mlir::Type elementType = elementTypeIfVector(type);
- bool isVector = mlir::isa<cir::VectorType>(type);
- mlir::Type llvmType = getTypeConverter()->convertType(type);
+static mlir::LLVM::IntegerOverflowFlags nswFlag(bool nsw) {
+ return nsw ? mlir::LLVM::IntegerOverflowFlags::nsw
+ : mlir::LLVM::IntegerOverflowFlags::none;
+}
+
+template <typename CIROp, typename LLVMIntOp>
+static mlir::LogicalResult
+lowerIncDecOp(CIROp op, typename CIROp::Adaptor adaptor,
+ mlir::ConversionPatternRewriter &rewriter, double fpConstant) {
+ mlir::Type elementType = elementTypeIfVector(op.getType());
+ mlir::Type llvmType = adaptor.getInput().getType();
mlir::Location loc = op.getLoc();
- // Integer unary operations: + - ~ ++ --
if (mlir::isa<cir::IntType>(elementType)) {
- mlir::LLVM::IntegerOverflowFlags maybeNSW =
- op.getNoSignedWrap() ? mlir::LLVM::IntegerOverflowFlags::nsw
- : mlir::LLVM::IntegerOverflowFlags::none;
- switch (op.getKind()) {
- case cir::UnaryOpKind::Inc: {
- assert(!isVector && "++ not allowed on vector types");
- auto one = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, 1);
- rewriter.replaceOpWithNewOp<mlir::LLVM::AddOp>(
- op, llvmType, adaptor.getInput(), one, maybeNSW);
- return mlir::success();
- }
- case cir::UnaryOpKind::Dec: {
- assert(!isVector && "-- not allowed on vector types");
- auto one = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, 1);
- rewriter.replaceOpWithNewOp<mlir::LLVM::SubOp>(op, adaptor.getInput(),
- one, maybeNSW);
- return mlir::success();
- }
- case cir::UnaryOpKind::Plus:
- rewriter.replaceOp(op, adaptor.getInput());
- return mlir::success();
- case cir::UnaryOpKind::Minus: {
- mlir::Value zero;
- if (isVector)
- zero = mlir::LLVM::ZeroOp::create(rewriter, loc, llvmType);
- else
- zero = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, 0);
- rewriter.replaceOpWithNewOp<mlir::LLVM::SubOp>(
- op, zero, adaptor.getInput(), maybeNSW);
- return mlir::success();
- }
- case cir::UnaryOpKind::Not: {
- // bit-wise compliment operator, implemented as an XOR with -1.
- mlir::Value minusOne;
- if (isVector) {
- const uint64_t numElements =
- mlir::dyn_cast<cir::VectorType>(type).getSize();
- std::vector<int32_t> values(numElements, -1);
- mlir::DenseIntElementsAttr denseVec = rewriter.getI32VectorAttr(values);
- minusOne =
- mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, denseVec);
- } else {
- minusOne = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, -1);
- }
- rewriter.replaceOpWithNewOp<mlir::LLVM::XOrOp>(op, adaptor.getInput(),
- minusOne);
- return mlir::success();
- }
- }
- llvm_unreachable("Unexpected unary op for int");
+ auto maybeNSW = nswFlag(op.getNoSignedWrap());
+ auto one = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, 1);
+ rewriter.replaceOpWithNewOp<LLVMIntOp>(op, adaptor.getInput(), one,
+ maybeNSW);
+ return mlir::success();
}
-
- // Floating point unary operations: + - ++ --
if (mlir::isa<cir::FPTypeInterface>(elementType)) {
- switch (op.getKind()) {
- case cir::UnaryOpKind::Inc: {
- assert(!isVector && "++ not allowed on vector types");
- mlir::LLVM::ConstantOp one = mlir::LLVM::ConstantOp::create(
- rewriter, loc, llvmType, rewriter.getFloatAttr(llvmType, 1.0));
- rewriter.replaceOpWithNewOp<mlir::LLVM::FAddOp>(op, llvmType, one,
- adaptor.getInput());
- return mlir::success();
- }
- case cir::UnaryOpKind::Dec: {
- assert(!isVector && "-- not allowed on vector types");
- mlir::LLVM::ConstantOp minusOne = mlir::LLVM::ConstantOp::create(
- rewriter, loc, llvmType, rewriter.getFloatAttr(llvmType, -1.0));
- rewriter.replaceOpWithNewOp<mlir::LLVM::FAddOp>(op, llvmType, minusOne,
- adaptor.getInput());
- return mlir::success();
- }
- case cir::UnaryOpKind::Plus:
- rewriter.replaceOp(op, adaptor.getInput());
- return mlir::success();
- case cir::UnaryOpKind::Minus:
- rewriter.replaceOpWithNewOp<mlir::LLVM::FNegOp>(op, llvmType,
- adaptor.getInput());
- return mlir::success();
- case cir::UnaryOpKind::Not:
- return op.emitError() << "Unary not is invalid for floating-point types";
- }
- llvm_unreachable("Unexpected unary op for float");
+ auto fpConst = mlir::LLVM::ConstantOp::create(
+ rewriter, loc, rewriter.getFloatAttr(llvmType, fpConstant));
+ rewriter.replaceOpWithNewOp<mlir::LLVM::FAddOp>(op, fpConst,
+ adaptor.getInput());
+ return mlir::success();
}
+ return op.emitError() << "Unsupported type for IncOp/DecOp";
+}
- // Boolean unary operations: ! only. (For all others, the operand has
- // already been promoted to int.)
- if (mlir::isa<cir::BoolType>(elementType)) {
- switch (op.getKind()) {
- case cir::UnaryOpKind::Inc:
- case cir::UnaryOpKind::Dec:
- case cir::UnaryOpKind::Plus:
- case cir::UnaryOpKind::Minus:
- // Some of these are allowed in source code, but we shouldn't get here
- // with a boolean type.
- return op.emitError() << "Unsupported unary operation on boolean type";
- case cir::UnaryOpKind::Not: {
- assert(!isVector && "NYI: op! on vector mask");
- auto one = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, 1);
- rewriter.replaceOpWithNewOp<mlir::LLVM::XOrOp>(op, adaptor.getInput(),
- one);
- return mlir::success();
- }
- }
- llvm_unreachable("Unexpected unary op for bool");
+mlir::LogicalResult CIRToLLVMIncOpLowering::matchAndRewrite(
+ cir::IncOp op, OpAdaptor adaptor,
+ mlir::ConversionPatternRewriter &rewriter) const {
+ return lowerIncDecOp<cir::IncOp, mlir::LLVM::AddOp>(op, adaptor, rewriter,
+ 1.0);
+}
+
+mlir::LogicalResult CIRToLLVMDecOpLowering::matchAndRewrite(
+ cir::DecOp op, OpAdaptor adaptor,
+ mlir::ConversionPatternRewriter &rewriter) const {
+ return lowerIncDecOp<cir::DecOp, mlir::LLVM::SubOp>(op, adaptor, rewriter,
+ -1.0);
+}
+
+mlir::LogicalResult CIRToLLVMMinusOpLowering::matchAndRewrite(
+ cir::MinusOp op, OpAdaptor adaptor,
+ mlir::ConversionPatternRewriter &rewriter) const {
+ mlir::Type elementType = elementTypeIfVector(op.getType());
+ bool isVector = mlir::isa<cir::VectorType>(op.getType());
+ mlir::Type llvmType = adaptor.getInput().getType();
+ mlir::Location loc = op.getLoc();
+
+ if (mlir::isa<cir::IntType>(elementType)) {
+ auto maybeNSW = nswFlag(op.getNoSignedWrap());
+ mlir::Value zero;
+ if (isVector)
+ zero = mlir::LLVM::ZeroOp::create(rewriter, loc, llvmType);
+ else
+ zero = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, 0);
+ rewriter.replaceOpWithNewOp<mlir::LLVM::SubOp>(op, zero, adaptor.getInput(),
+ maybeNSW);
+ return mlir::success();
+ }
+ if (mlir::isa<cir::FPTypeInterface>(elementType)) {
+ rewriter.replaceOpWithNewOp<mlir::LLVM::FNegOp>(op, adaptor.getInput());
+ return mlir::success();
}
+ return op.emitError() << "Unsupported type for unary minus";
+}
- // Pointer unary operations: + only. (++ and -- of pointers are implemented
- // with cir.ptr_stride, not cir.unary.)
- if (mlir::isa<cir::PointerType>(elementType)) {
- switch (op.getKind()) {
- case cir::UnaryOpKind::Plus:
- rewriter.replaceOp(op, adaptor.getInput());
- return mlir::success();
- default:
- op.emitError() << "Unknown pointer unary operation during CIR lowering";
- return mlir::failure();
+mlir::LogicalResult CIRToLLVMNotOpLowering::matchAndRewrite(
+ cir::NotOp op, OpAdaptor adaptor,
+ mlir::ConversionPatternRewriter &rewriter) const {
+ mlir::Type elementType = elementTypeIfVector(op.getType());
+ bool isVector = mlir::isa<cir::VectorType>(op.getType());
+ mlir::Type llvmType = adaptor.getInput().getType();
+ mlir::Location loc = op.getLoc();
+
+ if (mlir::isa<cir::IntType>(elementType)) {
+ mlir::Value minusOne;
+ if (isVector) {
+ const uint64_t numElements =
+ mlir::dyn_cast<cir::VectorType>(op.getType()).getSize();
+ SmallVector<int32_t> values(numElements, -1);
+ mlir::DenseIntElementsAttr denseVec = rewriter.getI32VectorAttr(values);
+ minusOne =
+ mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, denseVec);
+ } else {
+ minusOne = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, -1);
}
+ rewriter.replaceOpWithNewOp<mlir::LLVM::XOrOp>(op, adaptor.getInput(),
+ minusOne);
+ return mlir::success();
}
-
- return op.emitError() << "Unary operation has unsupported type: "
- << elementType;
+ if (mlir::isa<cir::BoolType>(elementType)) {
+ auto one = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, 1);
+ rewriter.replaceOpWithNewOp<mlir::LLVM::XOrOp>(op, adaptor.getInput(), one);
+ return mlir::success();
+ }
+ return op.emitError() << "Unsupported type for bitwise NOT";
}
static bool isIntTypeUnsigned(mlir::Type type) {
@@ -2998,18 +2964,33 @@ mlir::LogicalResult CIRToLLVMXorOpLowering::matchAndRewrite(
return mlir::success();
}
-mlir::LogicalResult CIRToLLVMMaxOpLowering::matchAndRewrite(
- cir::MaxOp op, OpAdaptor adaptor,
- mlir::ConversionPatternRewriter &rewriter) const {
+template <typename CIROp, typename UIntOp, typename SIntOp>
+static mlir::LogicalResult
+lowerMinMaxOp(CIROp op, typename CIROp::Adaptor adaptor,
+ mlir::ConversionPatternRewriter &rewriter) {
const mlir::Value lhs = adaptor.getLhs();
const mlir::Value rhs = adaptor.getRhs();
if (isIntTypeUnsigned(elementTypeIfVector(op.getRhs().getType())))
- rewriter.replaceOpWithNewOp<mlir::LLVM::UMaxOp>(op, lhs, rhs);
+ rewriter.replaceOpWithNewOp<UIntOp>(op, lhs, rhs);
else
- rewriter.replaceOpWithNewOp<mlir::LLVM::SMaxOp>(op, lhs, rhs);
+ rewriter.replaceOpWithNewOp<SIntOp>(op, lhs, rhs);
return mlir::success();
}
+mlir::LogicalResult CIRToLLVMMaxOpLowering::matchAndRewrite(
+ cir::MaxOp op, OpAdaptor adaptor,
+ mlir::ConversionPatternRewriter &rewriter) const {
+ return lowerMinMaxOp<cir::MaxOp, mlir::LLVM::UMaxOp, mlir::LLVM::SMaxOp>(
+ op, adaptor, rewriter);
+}
+
+mlir::LogicalResult CIRToLLVMMinOpLowering::matchAndRewrite(
+ cir::MinOp op, OpAdaptor adaptor,
+ mlir::ConversionPatternRewriter &rewriter) const {
+ return lowerMinMaxOp<cir::MinOp, mlir::LLVM::UMinOp, mlir::LLVM::SMinOp>(
+ op, adaptor, rewriter);
+}
+
/// Convert from a CIR comparison kind to an LLVM IR integral comparison kind.
static mlir::LLVM::ICmpPredicate
convertCmpKindToICmpPredicate(cir::CmpOpKind kind, bool isSigned) {
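
The new per-op lowerings keep the instruction selection of the old
CIRToLLVMUnaryOpLowering. For an i32 operand the resulting LLVM IR is roughly as
follows (flags and types depend on the operand; shown as a sketch, not copied from
this patch's tests):

    %inc = add nsw i32 %x, 1    ; cir.inc (nsw only when no-signed-wrap is set)
    %dec = sub nsw i32 %x, 1    ; cir.dec
    %neg = sub nsw i32 0, %x    ; cir.minus on integers
    %not = xor i32 %x, -1       ; cir.not on integers
    ; cir.not on !cir.bool becomes an xor with 1 on the converted bool type;
    ; cir.minus on floats becomes fneg, and cir.inc/cir.dec on floats become
    ; fadd with +1.0 / -1.0.
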
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 4b4eaadcf1527..5b8b4083c2ac0 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -241,6 +241,7 @@ getSancovOptsFromCGOpts(const CodeGenOptions &CGOpts) {
Opts.TraceGep = CGOpts.SanitizeCoverageTraceGep;
Opts.Use8bitCounters = CGOpts.SanitizeCoverage8bitCounters;
Opts.TracePC = CGOpts.SanitizeCoverageTracePC;
+ Opts.TracePCEntryExit = CGOpts.SanitizeCoverageTracePCEntryExit;
Opts.TracePCGuard = CGOpts.SanitizeCoverageTracePCGuard;
Opts.NoPrune = CGOpts.SanitizeCoverageNoPrune;
Opts.Inline8bitCounters = CGOpts.SanitizeCoverageInline8bitCounters;
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index 069846b854a87..88b2b6b3c33fb 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -3327,15 +3327,18 @@ static Address emitDeclTargetVarDeclLValue(CodeGenFunction &CGF,
std::optional<OMPDeclareTargetDeclAttr::MapTypeTy> Res =
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD);
// Return an invalid address if variable is MT_To (or MT_Enter starting with
- // OpenMP 5.2) and unified memory is not enabled. For all other cases: MT_Link
- // and MT_To (or MT_Enter) with unified memory, return a valid address.
+ // OpenMP 5.2, or MT_Local in OpenMP 6.0) and unified memory is not enabled.
+ // For all other cases: MT_Link and MT_To (or MT_Enter/MT_Local) with unified
+ // memory, return a valid address.
if (!Res || ((*Res == OMPDeclareTargetDeclAttr::MT_To ||
- *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Local) &&
!CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory()))
return Address::invalid();
assert(((*Res == OMPDeclareTargetDeclAttr::MT_Link) ||
((*Res == OMPDeclareTargetDeclAttr::MT_To ||
- *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Local) &&
CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory())) &&
"Expected link clause OR to clause with unified memory enabled.");
QualType PtrTy = CGF.getContext().getPointerType(VD->getType());
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index 93ac0305df38f..82300c3ede183 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -1201,6 +1201,12 @@ void CodeGenFunction::EmitNewArrayInitializer(
EmitCXXAggrConstructorCall(Ctor, NumElements, CurPtr, CCE,
/*NewPointerIsChecked*/ true,
CCE->requiresZeroInitialization());
+ if (getContext().getTargetInfo().emitVectorDeletingDtors(
+ getContext().getLangOpts())) {
+ CXXDestructorDecl *Dtor = Ctor->getParent()->getDestructor();
+ if (Dtor && Dtor->isVirtual())
+ CGM.requireVectorDestructorDefinition(Ctor->getParent());
+ }
return;
}
diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp
index 54f18064c8cbc..92129a041fd8e 100644
--- a/clang/lib/CodeGen/CGExprComplex.cpp
+++ b/clang/lib/CodeGen/CGExprComplex.cpp
@@ -39,9 +39,9 @@ static const ComplexType *getComplexType(QualType type) {
}
}
-namespace {
+namespace {
class ComplexExprEmitter
- : public StmtVisitor<ComplexExprEmitter, ComplexPairTy> {
+ : public StmtVisitor<ComplexExprEmitter, ComplexPairTy> {
CodeGenFunction &CGF;
CGBuilderTy &Builder;
bool IgnoreReal;
@@ -108,7 +108,9 @@ class ComplexExprEmitter
Result->getAggregateElement(1U));
return Visit(E->getSubExpr());
}
- ComplexPairTy VisitParenExpr(ParenExpr *PE) { return Visit(PE->getSubExpr());}
+ ComplexPairTy VisitParenExpr(ParenExpr *PE) {
+ return Visit(PE->getSubExpr());
+ }
ComplexPairTy VisitGenericSelectionExpr(GenericSelectionExpr *GE) {
return Visit(GE->getResultExpr());
}
@@ -185,15 +187,15 @@ class ComplexExprEmitter
if (const auto *ECE = dyn_cast<ExplicitCastExpr>(E))
CGF.CGM.EmitExplicitCastExprType(ECE, &CGF);
if (E->changesVolatileQualification())
- return EmitLoadOfLValue(E);
+ return EmitLoadOfLValue(E);
return EmitCast(E->getCastKind(), E->getSubExpr(), E->getType());
}
ComplexPairTy VisitCallExpr(const CallExpr *E);
ComplexPairTy VisitStmtExpr(const StmtExpr *E);
// Operators.
- ComplexPairTy VisitPrePostIncDec(const UnaryOperator *E,
- bool isInc, bool isPre) {
+ ComplexPairTy VisitPrePostIncDec(const UnaryOperator *E, bool isInc,
+ bool isPre) {
LValue LV = CGF.EmitLValue(E->getSubExpr());
return CGF.EmitComplexPrePostIncDec(E, LV, isInc, isPre);
}
@@ -217,7 +219,7 @@ class ComplexExprEmitter
ComplexPairTy VisitUnaryMinus(const UnaryOperator *E,
QualType PromotionType = QualType());
ComplexPairTy VisitMinus(const UnaryOperator *E, QualType PromotionType);
- ComplexPairTy VisitUnaryNot (const UnaryOperator *E);
+ ComplexPairTy VisitUnaryNot(const UnaryOperator *E);
// LNot,Real,Imag never return complex.
ComplexPairTy VisitUnaryExtension(const UnaryOperator *E) {
return Visit(E->getSubExpr());
@@ -247,15 +249,14 @@ class ComplexExprEmitter
ComplexPairTy VisitImplicitValueInitExpr(ImplicitValueInitExpr *E) {
assert(E->getType()->isAnyComplexType() && "Expected complex type!");
QualType Elem = E->getType()->castAs<ComplexType>()->getElementType();
- llvm::Constant *Null =
- llvm::Constant::getNullValue(CGF.ConvertType(Elem));
+ llvm::Constant *Null = llvm::Constant::getNullValue(CGF.ConvertType(Elem));
return ComplexPairTy(Null, Null);
}
struct BinOpInfo {
ComplexPairTy LHS;
ComplexPairTy RHS;
- QualType Ty; // Computation Type.
+ QualType Ty; // Computation Type.
FPOptions FPFeatures;
};
@@ -263,13 +264,13 @@ class ComplexExprEmitter
QualType PromotionTy = QualType());
ComplexPairTy EmitPromoted(const Expr *E, QualType PromotionTy);
ComplexPairTy EmitPromotedComplexOperand(const Expr *E, QualType PromotionTy);
- LValue EmitCompoundAssignLValue(const CompoundAssignOperator *E,
- ComplexPairTy (ComplexExprEmitter::*Func)
- (const BinOpInfo &),
- RValue &Val);
- ComplexPairTy EmitCompoundAssign(const CompoundAssignOperator *E,
- ComplexPairTy (ComplexExprEmitter::*Func)
- (const BinOpInfo &));
+ LValue EmitCompoundAssignLValue(
+ const CompoundAssignOperator *E,
+ ComplexPairTy (ComplexExprEmitter::*Func)(const BinOpInfo &),
+ RValue &Val);
+ ComplexPairTy EmitCompoundAssign(
+ const CompoundAssignOperator *E,
+ ComplexPairTy (ComplexExprEmitter::*Func)(const BinOpInfo &));
ComplexPairTy EmitBinAdd(const BinOpInfo &Op);
ComplexPairTy EmitBinSub(const BinOpInfo &Op);
@@ -381,11 +382,9 @@ class ComplexExprEmitter
// No comparisons produce a complex result.
- LValue EmitBinAssignLValue(const BinaryOperator *E,
- ComplexPairTy &Val);
- ComplexPairTy VisitBinAssign (const BinaryOperator *E);
- ComplexPairTy VisitBinComma (const BinaryOperator *E);
-
+ LValue EmitBinAssignLValue(const BinaryOperator *E, ComplexPairTy &Val);
+ ComplexPairTy VisitBinAssign(const BinaryOperator *E);
+ ComplexPairTy VisitBinComma(const BinaryOperator *E);
ComplexPairTy
VisitAbstractConditionalOperator(const AbstractConditionalOperator *CO);
@@ -407,7 +406,7 @@ class ComplexExprEmitter
return Visit(E->getSelectedExpr());
}
};
-} // end anonymous namespace.
+} // end anonymous namespace.
//===----------------------------------------------------------------------===//
// Utilities
@@ -469,8 +468,6 @@ void ComplexExprEmitter::EmitStoreOfComplex(ComplexPairTy Val, LValue lvalue,
CGF.addInstToCurrentSourceAtom(I, Val.second);
}
-
-
//===----------------------------------------------------------------------===//
// Visitor Methods
//===----------------------------------------------------------------------===//
@@ -478,18 +475,17 @@ void ComplexExprEmitter::EmitStoreOfComplex(ComplexPairTy Val, LValue lvalue,
ComplexPairTy ComplexExprEmitter::VisitExpr(Expr *E) {
CGF.ErrorUnsupported(E, "complex expression");
llvm::Type *EltTy =
- CGF.ConvertType(getComplexType(E->getType())->getElementType());
+ CGF.ConvertType(getComplexType(E->getType())->getElementType());
llvm::Value *U = llvm::PoisonValue::get(EltTy);
return ComplexPairTy(U, U);
}
-ComplexPairTy ComplexExprEmitter::
-VisitImaginaryLiteral(const ImaginaryLiteral *IL) {
+ComplexPairTy
+ComplexExprEmitter::VisitImaginaryLiteral(const ImaginaryLiteral *IL) {
llvm::Value *Imag = CGF.EmitScalarExpr(IL->getSubExpr());
return ComplexPairTy(llvm::Constant::getNullValue(Imag->getType()), Imag);
}
-
ComplexPairTy ComplexExprEmitter::VisitCallExpr(const CallExpr *E) {
if (E->getCallReturnType(CGF.getContext())->isReferenceType())
return EmitLoadOfLValue(E);
@@ -539,7 +535,8 @@ ComplexPairTy ComplexExprEmitter::EmitScalarToComplexCast(llvm::Value *Val,
ComplexPairTy ComplexExprEmitter::EmitCast(CastKind CK, Expr *Op,
QualType DestTy) {
switch (CK) {
- case CK_Dependent: llvm_unreachable("dependent cast kind in IR gen!");
+ case CK_Dependent:
+ llvm_unreachable("dependent cast kind in IR gen!");
// Atomic to non-atomic casts may be more than a no-op for some platforms and
// for some types.
@@ -691,10 +688,10 @@ ComplexPairTy ComplexExprEmitter::VisitMinus(const UnaryOperator *E,
llvm::Value *ResR, *ResI;
if (Op.first->getType()->isFloatingPointTy()) {
- ResR = Builder.CreateFNeg(Op.first, "neg.r");
+ ResR = Builder.CreateFNeg(Op.first, "neg.r");
ResI = Builder.CreateFNeg(Op.second, "neg.i");
} else {
- ResR = Builder.CreateNeg(Op.first, "neg.r");
+ ResR = Builder.CreateNeg(Op.first, "neg.r");
ResI = Builder.CreateNeg(Op.second, "neg.i");
}
return ComplexPairTy(ResR, ResI);
@@ -719,14 +716,14 @@ ComplexPairTy ComplexExprEmitter::EmitBinAdd(const BinOpInfo &Op) {
if (Op.LHS.first->getType()->isFloatingPointTy()) {
CodeGenFunction::CGFPOptionsRAII FPOptsRAII(CGF, Op.FPFeatures);
- ResR = Builder.CreateFAdd(Op.LHS.first, Op.RHS.first, "add.r");
+ ResR = Builder.CreateFAdd(Op.LHS.first, Op.RHS.first, "add.r");
if (Op.LHS.second && Op.RHS.second)
ResI = Builder.CreateFAdd(Op.LHS.second, Op.RHS.second, "add.i");
else
ResI = Op.LHS.second ? Op.LHS.second : Op.RHS.second;
assert(ResI && "Only one operand may be real!");
} else {
- ResR = Builder.CreateAdd(Op.LHS.first, Op.RHS.first, "add.r");
+ ResR = Builder.CreateAdd(Op.LHS.first, Op.RHS.first, "add.r");
assert(Op.LHS.second && Op.RHS.second &&
"Both operands of integer complex operators must be complex!");
ResI = Builder.CreateAdd(Op.LHS.second, Op.RHS.second, "add.i");
@@ -884,11 +881,13 @@ ComplexPairTy ComplexExprEmitter::EmitBinMul(const BinOpInfo &Op) {
// Finally continue execution by phi-ing together the different
// computation paths.
CGF.EmitBlock(ContBB);
- llvm::PHINode *RealPHI = Builder.CreatePHI(ResR->getType(), 3, "real_mul_phi");
+ llvm::PHINode *RealPHI =
+ Builder.CreatePHI(ResR->getType(), 3, "real_mul_phi");
RealPHI->addIncoming(ResR, OrigBB);
RealPHI->addIncoming(ResR, INaNBB);
RealPHI->addIncoming(LibCallR, LibCallBB);
- llvm::PHINode *ImagPHI = Builder.CreatePHI(ResI->getType(), 3, "imag_mul_phi");
+ llvm::PHINode *ImagPHI =
+ Builder.CreatePHI(ResI->getType(), 3, "imag_mul_phi");
ImagPHI->addIncoming(ResI, OrigBB);
ImagPHI->addIncoming(ResI, INaNBB);
ImagPHI->addIncoming(LibCallI, LibCallBB);
@@ -1097,7 +1096,9 @@ ComplexPairTy ComplexExprEmitter::EmitBinDiv(const BinOpInfo &Op) {
llvm::Value *Tmp8 = Builder.CreateMul(LHSr, RHSi); // a*d
llvm::Value *Tmp9 = Builder.CreateSub(Tmp7, Tmp8); // bc-ad
- if (Op.Ty->castAs<ComplexType>()->getElementType()->isUnsignedIntegerType()) {
+ if (Op.Ty->castAs<ComplexType>()
+ ->getElementType()
+ ->isUnsignedIntegerType()) {
DSTr = Builder.CreateUDiv(Tmp3, Tmp6);
DSTi = Builder.CreateUDiv(Tmp9, Tmp6);
} else {
@@ -1209,11 +1210,9 @@ ComplexExprEmitter::EmitBinOps(const BinaryOperator *E,
return Ops;
}
-
-LValue ComplexExprEmitter::
-EmitCompoundAssignLValue(const CompoundAssignOperator *E,
- ComplexPairTy (ComplexExprEmitter::*Func)(const BinOpInfo&),
- RValue &Val) {
+LValue ComplexExprEmitter::EmitCompoundAssignLValue(
+ const CompoundAssignOperator *E,
+ ComplexPairTy (ComplexExprEmitter::*Func)(const BinOpInfo &), RValue &Val) {
TestAndClearIgnoreReal();
TestAndClearIgnoreImag();
QualType LHSTy = E->getLHS()->getType();
@@ -1323,9 +1322,9 @@ EmitCompoundAssignLValue(const CompoundAssignOperator *E,
}
// Compound assignments.
-ComplexPairTy ComplexExprEmitter::
-EmitCompoundAssign(const CompoundAssignOperator *E,
- ComplexPairTy (ComplexExprEmitter::*Func)(const BinOpInfo&)){
+ComplexPairTy ComplexExprEmitter::EmitCompoundAssign(
+ const CompoundAssignOperator *E,
+ ComplexPairTy (ComplexExprEmitter::*Func)(const BinOpInfo &)) {
RValue Val;
LValue LV = EmitCompoundAssignLValue(E, Func, Val);
@@ -1381,8 +1380,8 @@ ComplexPairTy ComplexExprEmitter::VisitBinComma(const BinaryOperator *E) {
return Visit(E->getRHS());
}
-ComplexPairTy ComplexExprEmitter::
-VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) {
+ComplexPairTy ComplexExprEmitter::VisitAbstractConditionalOperator(
+ const AbstractConditionalOperator *E) {
TestAndClearIgnoreReal();
TestAndClearIgnoreImag();
llvm::BasicBlock *LHSBlock = CGF.createBasicBlock("cond.true");
@@ -1392,7 +1391,6 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) {
// Bind the common expression if necessary.
CodeGenFunction::OpaqueValueMapping binding(CGF, E);
-
CodeGenFunction::ConditionalEvaluation eval(CGF);
CGF.EmitBranchOnBoolExpr(E->getCond(), LHSBlock, RHSBlock,
CGF.getProfileCount(E));
@@ -1431,12 +1429,12 @@ ComplexPairTy ComplexExprEmitter::VisitChooseExpr(ChooseExpr *E) {
}
ComplexPairTy ComplexExprEmitter::VisitInitListExpr(InitListExpr *E) {
- bool Ignore = TestAndClearIgnoreReal();
- (void)Ignore;
- assert (Ignore == false && "init list ignored");
- Ignore = TestAndClearIgnoreImag();
- (void)Ignore;
- assert (Ignore == false && "init list ignored");
+ bool Ignore = TestAndClearIgnoreReal();
+ (void)Ignore;
+ assert(Ignore == false && "init list ignored");
+ Ignore = TestAndClearIgnoreImag();
+ (void)Ignore;
+ assert(Ignore == false && "init list ignored");
if (E->getNumInits() == 2) {
llvm::Value *Real = CGF.EmitScalarExpr(E->getInit(0));
@@ -1449,8 +1447,8 @@ ComplexPairTy ComplexExprEmitter::VisitInitListExpr(InitListExpr *E) {
// Empty init list initializes to null
assert(E->getNumInits() == 0 && "Unexpected number of inits");
QualType Ty = E->getType()->castAs<ComplexType>()->getElementType();
- llvm::Type* LTy = CGF.ConvertType(Ty);
- llvm::Value* zeroConstant = llvm::Constant::getNullValue(LTy);
+ llvm::Type *LTy = CGF.ConvertType(Ty);
+ llvm::Value *zeroConstant = llvm::Constant::getNullValue(LTy);
return ComplexPairTy(zeroConstant, zeroConstant);
}
@@ -1461,7 +1459,7 @@ ComplexPairTy ComplexExprEmitter::VisitVAArgExpr(VAArgExpr *E) {
if (!ArgValue.isValid()) {
CGF.ErrorUnsupported(E, "complex va_arg expression");
llvm::Type *EltTy =
- CGF.ConvertType(E->getType()->castAs<ComplexType>()->getElementType());
+ CGF.ConvertType(E->getType()->castAs<ComplexType>()->getElementType());
llvm::Value *U = llvm::PoisonValue::get(EltTy);
return ComplexPairTy(U, U);
}
@@ -1489,7 +1487,7 @@ void CodeGenFunction::EmitComplexExprIntoLValue(const Expr *E, LValue dest,
assert(E && getComplexType(E->getType()) &&
"Invalid complex expression to emit");
ComplexExprEmitter Emitter(*this);
- ComplexPairTy Val = Emitter.Visit(const_cast<Expr*>(E));
+ ComplexPairTy Val = Emitter.Visit(const_cast<Expr *>(E));
Emitter.EmitStoreOfComplex(Val, dest, isInit);
}
@@ -1520,26 +1518,29 @@ typedef ComplexPairTy (ComplexExprEmitter::*CompoundFunc)(
static CompoundFunc getComplexOp(BinaryOperatorKind Op) {
switch (Op) {
- case BO_MulAssign: return &ComplexExprEmitter::EmitBinMul;
- case BO_DivAssign: return &ComplexExprEmitter::EmitBinDiv;
- case BO_SubAssign: return &ComplexExprEmitter::EmitBinSub;
- case BO_AddAssign: return &ComplexExprEmitter::EmitBinAdd;
+ case BO_MulAssign:
+ return &ComplexExprEmitter::EmitBinMul;
+ case BO_DivAssign:
+ return &ComplexExprEmitter::EmitBinDiv;
+ case BO_SubAssign:
+ return &ComplexExprEmitter::EmitBinSub;
+ case BO_AddAssign:
+ return &ComplexExprEmitter::EmitBinAdd;
default:
llvm_unreachable("unexpected complex compound assignment");
}
}
-LValue CodeGenFunction::
-EmitComplexCompoundAssignmentLValue(const CompoundAssignOperator *E) {
+LValue CodeGenFunction::EmitComplexCompoundAssignmentLValue(
+ const CompoundAssignOperator *E) {
ApplyAtomGroup Grp(getDebugInfo());
CompoundFunc Op = getComplexOp(E->getOpcode());
RValue Val;
return ComplexExprEmitter(*this).EmitCompoundAssignLValue(E, Op, Val);
}
-LValue CodeGenFunction::
-EmitScalarCompoundAssignWithComplex(const CompoundAssignOperator *E,
- llvm::Value *&Result) {
+LValue CodeGenFunction::EmitScalarCompoundAssignWithComplex(
+ const CompoundAssignOperator *E, llvm::Value *&Result) {
// Key Instructions: Don't need to create an atom group here; one will already
// be active through scalar handling code.
CompoundFunc Op = getComplexOp(E->getOpcode());
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 2578157395b45..c8a0ab34ae848 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -116,12 +116,12 @@ static const ValueDecl *getArrayDecl(const ArraySubscriptExpr *ASE) {
return getArrayDecl(E);
}
-// Get the total size of the array, or -1 if the array is unbounded.
+// Get the total size of the array, or 0 if the array is unbounded.
static int getTotalArraySize(ASTContext &AST, const clang::Type *Ty) {
Ty = Ty->getUnqualifiedDesugaredType();
assert(Ty->isArrayType() && "expected array type");
if (Ty->isIncompleteArrayType())
- return -1;
+ return 0;
return AST.getConstantArrayElementCount(cast<ConstantArrayType>(Ty));
}
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index fa3dfcd856b30..ca1a1c0321fce 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -1529,6 +1529,7 @@ convertCaptureClause(const VarDecl *VD) {
return llvm::OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
break;
case OMPDeclareTargetDeclAttr::MapTypeTy::MT_Enter:
+ case OMPDeclareTargetDeclAttr::MapTypeTy::MT_Local:
return llvm::OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter;
break;
case OMPDeclareTargetDeclAttr::MapTypeTy::MT_Link:
@@ -7980,7 +7981,8 @@ class MappableExprsHandler {
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD)) {
if ((*Res == OMPDeclareTargetDeclAttr::MT_Link) ||
((*Res == OMPDeclareTargetDeclAttr::MT_To ||
- *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Local) &&
CGF.CGM.getOpenMPRuntime().hasRequiresUnifiedSharedMemory())) {
RequiresReference = true;
BP = CGF.CGM.getOpenMPRuntime().getAddrOfDeclareTargetVar(VD);
@@ -11318,7 +11320,8 @@ bool CGOpenMPRuntime::emitTargetGlobalVariable(GlobalDecl GD) {
cast<VarDecl>(GD.getDecl()));
if (!Res || *Res == OMPDeclareTargetDeclAttr::MT_Link ||
((*Res == OMPDeclareTargetDeclAttr::MT_To ||
- *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Local) &&
HasRequiresUnifiedSharedMemory)) {
DeferredGlobalVariables.insert(cast<VarDecl>(GD.getDecl()));
return true;
@@ -11388,13 +11391,15 @@ void CGOpenMPRuntime::emitDeferredTargetDecls() const {
if (!Res)
continue;
if ((*Res == OMPDeclareTargetDeclAttr::MT_To ||
- *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Local) &&
!HasRequiresUnifiedSharedMemory) {
CGM.EmitGlobal(VD);
} else {
assert((*Res == OMPDeclareTargetDeclAttr::MT_Link ||
((*Res == OMPDeclareTargetDeclAttr::MT_To ||
- *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Local) &&
HasRequiresUnifiedSharedMemory)) &&
"Expected link clause or to clause with unified memory.");
(void)CGM.getOpenMPRuntime().getAddrOfDeclareTargetVar(VD);
@@ -11853,27 +11858,9 @@ void CGOpenMPRuntime::emitTargetDataStandAloneCall(
}
}
-namespace {
- /// Kind of parameter in a function with 'declare simd' directive.
-enum ParamKindTy {
- Linear,
- LinearRef,
- LinearUVal,
- LinearVal,
- Uniform,
- Vector,
-};
-/// Attribute set of the parameter.
-struct ParamAttrTy {
- ParamKindTy Kind = Vector;
- llvm::APSInt StrideOrArg;
- llvm::APSInt Alignment;
- bool HasVarStride = false;
-};
-} // namespace
-
-static unsigned evaluateCDTSize(const FunctionDecl *FD,
- ArrayRef<ParamAttrTy> ParamAttrs) {
+static unsigned
+evaluateCDTSize(const FunctionDecl *FD,
+ ArrayRef<llvm::OpenMPIRBuilder::DeclareSimdAttrTy> ParamAttrs) {
// Every vector variant of a SIMD-enabled function has a vector length (VLEN).
// If OpenMP clause "simdlen" is used, the VLEN is the value of the argument
// of that clause. The VLEN value must be power of 2.
@@ -11903,13 +11890,15 @@ static unsigned evaluateCDTSize(const FunctionDecl *FD,
} else {
unsigned Offset = 0;
if (const auto *MD = dyn_cast<CXXMethodDecl>(FD)) {
- if (ParamAttrs[Offset].Kind == Vector)
+ if (ParamAttrs[Offset].Kind ==
+ llvm::OpenMPIRBuilder::DeclareSimdKindTy::Vector)
CDT = C.getPointerType(C.getCanonicalTagType(MD->getParent()));
++Offset;
}
if (CDT.isNull()) {
for (unsigned I = 0, E = FD->getNumParams(); I < E; ++I) {
- if (ParamAttrs[I + Offset].Kind == Vector) {
+ if (ParamAttrs[I + Offset].Kind ==
+ llvm::OpenMPIRBuilder::DeclareSimdKindTy::Vector) {
CDT = FD->getParamDecl(I)->getType();
break;
}
@@ -11924,107 +11913,6 @@ static unsigned evaluateCDTSize(const FunctionDecl *FD,
return C.getTypeSize(CDT);
}
-/// Mangle the parameter part of the vector function name according to
-/// their OpenMP classification. The mangling function is defined in
-/// section 4.5 of the AAVFABI(2021Q1).
-static std::string mangleVectorParameters(ArrayRef<ParamAttrTy> ParamAttrs) {
- SmallString<256> Buffer;
- llvm::raw_svector_ostream Out(Buffer);
- for (const auto &ParamAttr : ParamAttrs) {
- switch (ParamAttr.Kind) {
- case Linear:
- Out << 'l';
- break;
- case LinearRef:
- Out << 'R';
- break;
- case LinearUVal:
- Out << 'U';
- break;
- case LinearVal:
- Out << 'L';
- break;
- case Uniform:
- Out << 'u';
- break;
- case Vector:
- Out << 'v';
- break;
- }
- if (ParamAttr.HasVarStride)
- Out << "s" << ParamAttr.StrideOrArg;
- else if (ParamAttr.Kind == Linear || ParamAttr.Kind == LinearRef ||
- ParamAttr.Kind == LinearUVal || ParamAttr.Kind == LinearVal) {
- // Don't print the step value if it is not present or if it is
- // equal to 1.
- if (ParamAttr.StrideOrArg < 0)
- Out << 'n' << -ParamAttr.StrideOrArg;
- else if (ParamAttr.StrideOrArg != 1)
- Out << ParamAttr.StrideOrArg;
- }
-
- if (!!ParamAttr.Alignment)
- Out << 'a' << ParamAttr.Alignment;
- }
-
- return std::string(Out.str());
-}
-
-static void
-emitX86DeclareSimdFunction(const FunctionDecl *FD, llvm::Function *Fn,
- const llvm::APSInt &VLENVal,
- ArrayRef<ParamAttrTy> ParamAttrs,
- OMPDeclareSimdDeclAttr::BranchStateTy State) {
- struct ISADataTy {
- char ISA;
- unsigned VecRegSize;
- };
- ISADataTy ISAData[] = {
- {
- 'b', 128
- }, // SSE
- {
- 'c', 256
- }, // AVX
- {
- 'd', 256
- }, // AVX2
- {
- 'e', 512
- }, // AVX512
- };
- llvm::SmallVector<char, 2> Masked;
- switch (State) {
- case OMPDeclareSimdDeclAttr::BS_Undefined:
- Masked.push_back('N');
- Masked.push_back('M');
- break;
- case OMPDeclareSimdDeclAttr::BS_Notinbranch:
- Masked.push_back('N');
- break;
- case OMPDeclareSimdDeclAttr::BS_Inbranch:
- Masked.push_back('M');
- break;
- }
- for (char Mask : Masked) {
- for (const ISADataTy &Data : ISAData) {
- SmallString<256> Buffer;
- llvm::raw_svector_ostream Out(Buffer);
- Out << "_ZGV" << Data.ISA << Mask;
- if (!VLENVal) {
- unsigned NumElts = evaluateCDTSize(FD, ParamAttrs);
- assert(NumElts && "Non-zero simdlen/cdtsize expected");
- Out << llvm::APSInt::getUnsigned(Data.VecRegSize / NumElts);
- } else {
- Out << VLENVal;
- }
- Out << mangleVectorParameters(ParamAttrs);
- Out << '_' << Fn->getName();
- Fn->addFnAttr(Out.str());
- }
- }
-}
-
// This are the Functions that are needed to mangle the name of the
// vector functions generated by the compiler, according to the rules
// defined in the "Vector Function ABI specifications for AArch64",
@@ -12032,19 +11920,22 @@ emitX86DeclareSimdFunction(const FunctionDecl *FD, llvm::Function *Fn,
// https://developer.arm.com/products/software-development-tools/hpc/arm-compiler-for-hpc/vector-function-abi.
/// Maps To Vector (MTV), as defined in 4.1.1 of the AAVFABI (2021Q1).
-static bool getAArch64MTV(QualType QT, ParamKindTy Kind) {
+static bool getAArch64MTV(QualType QT,
+ llvm::OpenMPIRBuilder::DeclareSimdKindTy Kind) {
QT = QT.getCanonicalType();
if (QT->isVoidType())
return false;
- if (Kind == ParamKindTy::Uniform)
+ if (Kind == llvm::OpenMPIRBuilder::DeclareSimdKindTy::Uniform)
return false;
- if (Kind == ParamKindTy::LinearUVal || Kind == ParamKindTy::LinearRef)
+ if (Kind == llvm::OpenMPIRBuilder::DeclareSimdKindTy::LinearUVal ||
+ Kind == llvm::OpenMPIRBuilder::DeclareSimdKindTy::LinearRef)
return false;
- if ((Kind == ParamKindTy::Linear || Kind == ParamKindTy::LinearVal) &&
+ if ((Kind == llvm::OpenMPIRBuilder::DeclareSimdKindTy::Linear ||
+ Kind == llvm::OpenMPIRBuilder::DeclareSimdKindTy::LinearVal) &&
!QT->isReferenceType())
return false;
@@ -12077,7 +11968,9 @@ static bool getAArch64PBV(QualType QT, ASTContext &C) {
/// Computes the lane size (LS) of a return type or of an input parameter,
/// as defined by `LS(P)` in 3.2.1 of the AAVFABI.
/// TODO: Add support for references, section 3.2.1, item 1.
-static unsigned getAArch64LS(QualType QT, ParamKindTy Kind, ASTContext &C) {
+static unsigned getAArch64LS(QualType QT,
+ llvm::OpenMPIRBuilder::DeclareSimdKindTy Kind,
+ ASTContext &C) {
if (!getAArch64MTV(QT, Kind) && QT.getCanonicalType()->isPointerType()) {
QualType PTy = QT.getCanonicalType()->getPointeeType();
if (getAArch64PBV(PTy, C))
@@ -12093,7 +11986,8 @@ static unsigned getAArch64LS(QualType QT, ParamKindTy Kind, ASTContext &C) {
// signature of the scalar function, as defined in 3.2.2 of the
// AAVFABI.
static std::tuple<unsigned, unsigned, bool>
-getNDSWDS(const FunctionDecl *FD, ArrayRef<ParamAttrTy> ParamAttrs) {
+getNDSWDS(const FunctionDecl *FD,
+ ArrayRef<llvm::OpenMPIRBuilder::DeclareSimdAttrTy> ParamAttrs) {
QualType RetType = FD->getReturnType().getCanonicalType();
ASTContext &C = FD->getASTContext();
@@ -12102,7 +11996,8 @@ getNDSWDS(const FunctionDecl *FD, ArrayRef<ParamAttrTy> ParamAttrs) {
llvm::SmallVector<unsigned, 8> Sizes;
if (!RetType->isVoidType()) {
- Sizes.push_back(getAArch64LS(RetType, ParamKindTy::Vector, C));
+ Sizes.push_back(getAArch64LS(
+ RetType, llvm::OpenMPIRBuilder::DeclareSimdKindTy::Vector, C));
if (!getAArch64PBV(RetType, C) && getAArch64MTV(RetType, {}))
OutputBecomesInput = true;
}
@@ -12125,155 +12020,43 @@ getNDSWDS(const FunctionDecl *FD, ArrayRef<ParamAttrTy> ParamAttrs) {
OutputBecomesInput);
}
-// Function used to add the attribute. The parameter `VLEN` is
-// templated to allow the use of "x" when targeting scalable functions
-// for SVE.
-template <typename T>
-static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix,
- char ISA, StringRef ParSeq,
- StringRef MangledName, bool OutputBecomesInput,
- llvm::Function *Fn) {
- SmallString<256> Buffer;
- llvm::raw_svector_ostream Out(Buffer);
- Out << Prefix << ISA << LMask << VLEN;
- if (OutputBecomesInput)
- Out << "v";
- Out << ParSeq << "_" << MangledName;
- Fn->addFnAttr(Out.str());
-}
-
-// Helper function to generate the Advanced SIMD names depending on
-// the value of the NDS when simdlen is not present.
-static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask,
- StringRef Prefix, char ISA,
- StringRef ParSeq, StringRef MangledName,
- bool OutputBecomesInput,
- llvm::Function *Fn) {
- switch (NDS) {
- case 8:
- addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- addAArch64VectorName(16, Mask, Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- break;
- case 16:
- addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- break;
- case 32:
- addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- break;
- case 64:
- case 128:
- addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- break;
- default:
- llvm_unreachable("Scalar type is too wide.");
+static llvm::OpenMPIRBuilder::DeclareSimdBranch
+convertDeclareSimdBranch(OMPDeclareSimdDeclAttr::BranchStateTy State) {
+ switch (State) {
+ case OMPDeclareSimdDeclAttr::BS_Undefined:
+ return llvm::OpenMPIRBuilder::DeclareSimdBranch::Undefined;
+ case OMPDeclareSimdDeclAttr::BS_Inbranch:
+ return llvm::OpenMPIRBuilder::DeclareSimdBranch::Inbranch;
+ case OMPDeclareSimdDeclAttr::BS_Notinbranch:
+ return llvm::OpenMPIRBuilder::DeclareSimdBranch::Notinbranch;
}
+ llvm_unreachable("unexpected declare simd branch state");
}
-/// Emit vector function attributes for AArch64, as defined in the AAVFABI.
-static void emitAArch64DeclareSimdFunction(
- CodeGenModule &CGM, const FunctionDecl *FD, unsigned UserVLEN,
- ArrayRef<ParamAttrTy> ParamAttrs,
- OMPDeclareSimdDeclAttr::BranchStateTy State, StringRef MangledName,
- char ISA, unsigned VecRegSize, llvm::Function *Fn, SourceLocation SLoc) {
-
- // Get basic data for building the vector signature.
- const auto Data = getNDSWDS(FD, ParamAttrs);
- const unsigned NDS = std::get<0>(Data);
- const unsigned WDS = std::get<1>(Data);
- const bool OutputBecomesInput = std::get<2>(Data);
-
- // Check the values provided via `simdlen` by the user.
- // 1. A `simdlen(1)` doesn't produce vector signatures,
+// Check the values provided via `simdlen` by the user.
+static bool validateAArch64Simdlen(CodeGenModule &CGM, SourceLocation SLoc,
+ unsigned UserVLEN, unsigned WDS, char ISA) {
+ // 1. A `simdlen(1)` doesn't produce vector signatures.
if (UserVLEN == 1) {
CGM.getDiags().Report(SLoc, diag::warn_simdlen_1_no_effect);
- return;
+ return false;
}
- // 2. Section 3.3.1, item 1: user input must be a power of 2 for
- // Advanced SIMD output.
+ // 2. Section 3.3.1, item 1: user input must be a power of 2 for Advanced
+ // SIMD.
if (ISA == 'n' && UserVLEN && !llvm::isPowerOf2_32(UserVLEN)) {
CGM.getDiags().Report(SLoc, diag::warn_simdlen_requires_power_of_2);
- return;
+ return false;
}
- // 3. Section 3.4.1. SVE fixed lengh must obey the architectural
- // limits.
- if (ISA == 's' && UserVLEN != 0) {
- if ((UserVLEN * WDS > 2048) || (UserVLEN * WDS % 128 != 0)) {
- CGM.getDiags().Report(SLoc, diag::warn_simdlen_must_fit_lanes) << WDS;
- return;
- }
+ // 3. Section 3.4.1: SVE fixed length must obey the architectural limits.
+ if (ISA == 's' && UserVLEN != 0 &&
+ ((UserVLEN * WDS > 2048) || (UserVLEN * WDS % 128 != 0))) {
+ CGM.getDiags().Report(SLoc, diag::warn_simdlen_must_fit_lanes) << WDS;
+ return false;
}
- // Sort out parameter sequence.
- const std::string ParSeq = mangleVectorParameters(ParamAttrs);
- StringRef Prefix = "_ZGV";
- // Generate simdlen from user input (if any).
- if (UserVLEN) {
- if (ISA == 's') {
- // SVE generates only a masked function.
- addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- } else {
- assert(ISA == 'n' && "Expected ISA either 's' or 'n'.");
- // Advanced SIMD generates one or two functions, depending on
- // the `[not]inbranch` clause.
- switch (State) {
- case OMPDeclareSimdDeclAttr::BS_Undefined:
- addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- break;
- case OMPDeclareSimdDeclAttr::BS_Notinbranch:
- addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- break;
- case OMPDeclareSimdDeclAttr::BS_Inbranch:
- addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- break;
- }
- }
- } else {
- // If no user simdlen is provided, follow the AAVFABI rules for
- // generating the vector length.
- if (ISA == 's') {
- // SVE, section 3.4.1, item 1.
- addAArch64VectorName("x", "M", Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- } else {
- assert(ISA == 'n' && "Expected ISA either 's' or 'n'.");
- // Advanced SIMD, Section 3.3.1 of the AAVFABI, generates one or
- // two vector names depending on the use of the clause
- // `[not]inbranch`.
- switch (State) {
- case OMPDeclareSimdDeclAttr::BS_Undefined:
- addAArch64AdvSIMDNDSNames(NDS, "N", Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- addAArch64AdvSIMDNDSNames(NDS, "M", Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- break;
- case OMPDeclareSimdDeclAttr::BS_Notinbranch:
- addAArch64AdvSIMDNDSNames(NDS, "N", Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- break;
- case OMPDeclareSimdDeclAttr::BS_Inbranch:
- addAArch64AdvSIMDNDSNames(NDS, "M", Prefix, ISA, ParSeq, MangledName,
- OutputBecomesInput, Fn);
- break;
- }
- }
- }
+ return true;
}
void CGOpenMPRuntime::emitDeclareSimdFunction(const FunctionDecl *FD,
@@ -12291,7 +12074,8 @@ void CGOpenMPRuntime::emitDeclareSimdFunction(const FunctionDecl *FD,
++ParamPos;
}
for (const auto *Attr : FD->specific_attrs<OMPDeclareSimdDeclAttr>()) {
- llvm::SmallVector<ParamAttrTy, 8> ParamAttrs(ParamPositions.size());
+ llvm::SmallVector<llvm::OpenMPIRBuilder::DeclareSimdAttrTy, 8> ParamAttrs(
+ ParamPositions.size());
// Mark uniform parameters.
for (const Expr *E : Attr->uniforms()) {
E = E->IgnoreParenImpCasts();
@@ -12305,7 +12089,8 @@ void CGOpenMPRuntime::emitDeclareSimdFunction(const FunctionDecl *FD,
assert(It != ParamPositions.end() && "Function parameter not found");
Pos = It->second;
}
- ParamAttrs[Pos].Kind = Uniform;
+ ParamAttrs[Pos].Kind =
+ llvm::OpenMPIRBuilder::DeclareSimdKindTy::Uniform;
}
// Get alignment info.
auto *NI = Attr->alignments_begin();
@@ -12366,15 +12151,15 @@ void CGOpenMPRuntime::emitDeclareSimdFunction(const FunctionDecl *FD,
.getQuantity();
}
}
- ParamAttrTy &ParamAttr = ParamAttrs[Pos];
+ llvm::OpenMPIRBuilder::DeclareSimdAttrTy &ParamAttr = ParamAttrs[Pos];
if (*MI == OMPC_LINEAR_ref)
- ParamAttr.Kind = LinearRef;
+ ParamAttr.Kind = llvm::OpenMPIRBuilder::DeclareSimdKindTy::LinearRef;
else if (*MI == OMPC_LINEAR_uval)
- ParamAttr.Kind = LinearUVal;
+ ParamAttr.Kind = llvm::OpenMPIRBuilder::DeclareSimdKindTy::LinearUVal;
else if (IsReferenceType)
- ParamAttr.Kind = LinearVal;
+ ParamAttr.Kind = llvm::OpenMPIRBuilder::DeclareSimdKindTy::LinearVal;
else
- ParamAttr.Kind = Linear;
+ ParamAttr.Kind = llvm::OpenMPIRBuilder::DeclareSimdKindTy::Linear;
// Assuming a stride of 1, for `linear` without modifiers.
ParamAttr.StrideOrArg = llvm::APSInt::getUnsigned(1);
if (*SI) {
@@ -12399,7 +12184,10 @@ void CGOpenMPRuntime::emitDeclareSimdFunction(const FunctionDecl *FD,
// rescale the value of linear_step with the byte size of the
// pointee type.
if (!ParamAttr.HasVarStride &&
- (ParamAttr.Kind == Linear || ParamAttr.Kind == LinearRef))
+ (ParamAttr.Kind ==
+ llvm::OpenMPIRBuilder::DeclareSimdKindTy::Linear ||
+ ParamAttr.Kind ==
+ llvm::OpenMPIRBuilder::DeclareSimdKindTy::LinearRef))
ParamAttr.StrideOrArg = ParamAttr.StrideOrArg * PtrRescalingFactor;
++SI;
++MI;
@@ -12411,18 +12199,29 @@ void CGOpenMPRuntime::emitDeclareSimdFunction(const FunctionDecl *FD,
VLENVal = VLENExpr->EvaluateKnownConstInt(C);
ExprLoc = VLENExpr->getExprLoc();
}
- OMPDeclareSimdDeclAttr::BranchStateTy State = Attr->getBranchState();
+ llvm::OpenMPIRBuilder::DeclareSimdBranch State =
+ convertDeclareSimdBranch(Attr->getBranchState());
if (CGM.getTriple().isX86()) {
- emitX86DeclareSimdFunction(FD, Fn, VLENVal, ParamAttrs, State);
+ unsigned NumElts = evaluateCDTSize(FD, ParamAttrs);
+ assert(NumElts && "Non-zero simdlen/cdtsize expected");
+ OMPBuilder.emitX86DeclareSimdFunction(Fn, NumElts, VLENVal, ParamAttrs,
+ State);
} else if (CGM.getTriple().getArch() == llvm::Triple::aarch64) {
unsigned VLEN = VLENVal.getExtValue();
- StringRef MangledName = Fn->getName();
- if (CGM.getTarget().hasFeature("sve"))
- emitAArch64DeclareSimdFunction(CGM, FD, VLEN, ParamAttrs, State,
- MangledName, 's', 128, Fn, ExprLoc);
- else if (CGM.getTarget().hasFeature("neon"))
- emitAArch64DeclareSimdFunction(CGM, FD, VLEN, ParamAttrs, State,
- MangledName, 'n', 128, Fn, ExprLoc);
+ // Get basic data for building the vector signature.
+ const auto Data = getNDSWDS(FD, ParamAttrs);
+ const unsigned NDS = std::get<0>(Data);
+ const unsigned WDS = std::get<1>(Data);
+ const bool OutputBecomesInput = std::get<2>(Data);
+ if (CGM.getTarget().hasFeature("sve")) {
+ if (validateAArch64Simdlen(CGM, ExprLoc, VLEN, WDS, 's'))
+ OMPBuilder.emitAArch64DeclareSimdFunction(
+ Fn, VLEN, ParamAttrs, State, 's', NDS, OutputBecomesInput);
+ } else if (CGM.getTarget().hasFeature("neon")) {
+ if (validateAArch64Simdlen(CGM, ExprLoc, VLEN, WDS, 'n'))
+ OMPBuilder.emitAArch64DeclareSimdFunction(
+ Fn, VLEN, ParamAttrs, State, 'n', NDS, OutputBecomesInput);
+ }
}
}
FD = FD->getPreviousDecl();
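
For context, the names emitted above follow the AArch64 Vector Function ABI scheme _ZGV<isa><mask><vlen><params>_<name> (e.g. _ZGVnN2v_foo for an unmasked, 2-lane Advanced SIMD variant with one vector parameter). A minimal standalone sketch of that assembly step, independent of the OMPBuilder interface and with a hypothetical helper name:

#include <string>

// Hedged sketch: build an AAVFABI vector variant name from its pieces.
// 'n' = Advanced SIMD, 's' = SVE; 'N' = unmasked, 'M' = masked; a VLEN of 0
// stands for the scalable ("x") encoding used by SVE.
static std::string mangleVectorVariant(char ISA, char Mask, unsigned VLEN,
                                       const std::string &ParSeq,
                                       const std::string &ScalarName) {
  std::string Name = "_ZGV";
  Name += ISA;
  Name += Mask;
  Name += VLEN ? std::to_string(VLEN) : std::string("x");
  Name += ParSeq; // e.g. "v" per vector parameter, "l" for linear
  Name += '_';
  Name += ScalarName;
  return Name; // e.g. mangleVectorVariant('n', 'N', 2, "v", "foo")
}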
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index d08fe0feb692e..43af159305953 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -4420,13 +4420,15 @@ void CodeGenModule::EmitGlobal(GlobalDecl GD) {
bool UnifiedMemoryEnabled =
getOpenMPRuntime().hasRequiresUnifiedSharedMemory();
if ((*Res == OMPDeclareTargetDeclAttr::MT_To ||
- *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Local) &&
!UnifiedMemoryEnabled) {
(void)GetAddrOfGlobalVar(VD);
} else {
assert(((*Res == OMPDeclareTargetDeclAttr::MT_Link) ||
((*Res == OMPDeclareTargetDeclAttr::MT_To ||
- *Res == OMPDeclareTargetDeclAttr::MT_Enter) &&
+ *Res == OMPDeclareTargetDeclAttr::MT_Enter ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Local) &&
UnifiedMemoryEnabled)) &&
"Link clause or to clause with unified memory expected.");
(void)getOpenMPRuntime().getAddrOfDeclareTargetVar(VD);
@@ -8538,3 +8540,55 @@ std::string CodeGenModule::getPFPFieldName(const FieldDecl *FD) {
Out << "." << FD->getName();
return OutName;
}
+
+bool CodeGenModule::classNeedsVectorDestructor(const CXXRecordDecl *RD) {
+ if (!Context.getTargetInfo().emitVectorDeletingDtors(Context.getLangOpts()))
+ return false;
+ CXXDestructorDecl *Dtor = RD->getDestructor();
+ // The compiler can't know if new[]/delete[] will be used outside of the DLL,
+ // so just force vector deleting destructor emission if dllexport is present.
+ // This matches MSVC behavior.
+ if (Dtor && Dtor->isVirtual() && Dtor->hasAttr<DLLExportAttr>())
+ return true;
+
+ return RequireVectorDeletingDtor.count(RD);
+}
+
+void CodeGenModule::requireVectorDestructorDefinition(const CXXRecordDecl *RD) {
+ if (!Context.getTargetInfo().emitVectorDeletingDtors(Context.getLangOpts()))
+ return;
+ RequireVectorDeletingDtor.insert(RD);
+
+ // To reduce code size in the general case, we lazily emit the scalar
+ // deleting destructor definition and an alias from the vector deleting
+ // destructor to the scalar deleting destructor. It may happen that we first
+ // emitted the scalar deleting destructor definition and the alias, and only
+ // then discovered that a definition of the vector deleting destructor is
+ // required. In that case we need to remove the alias and the scalar deleting
+ // destructor and queue the vector deleting destructor body for emission.
+ // Check whether that is the case.
+ CXXDestructorDecl *DtorD = RD->getDestructor();
+ GlobalDecl ScalarDtorGD(DtorD, Dtor_Deleting);
+ StringRef MangledName = getMangledName(ScalarDtorGD);
+ llvm::GlobalValue *Entry = GetGlobalValue(MangledName);
+ GlobalDecl VectorDtorGD(DtorD, Dtor_VectorDeleting);
+ if (Entry && !Entry->isDeclaration()) {
+ StringRef VDName = getMangledName(VectorDtorGD);
+ llvm::GlobalValue *VDEntry = GetGlobalValue(VDName);
+ // It exists and it should be an alias.
+ assert(VDEntry && isa<llvm::GlobalAlias>(VDEntry));
+ auto *NewFn = llvm::Function::Create(
+ cast<llvm::FunctionType>(VDEntry->getValueType()),
+ llvm::Function::ExternalLinkage, VDName, &getModule());
+ SetFunctionAttributes(VectorDtorGD, NewFn, /*IsIncompleteFunction*/ false,
+ /*IsThunk*/ false);
+ NewFn->takeName(VDEntry);
+ VDEntry->replaceAllUsesWith(NewFn);
+ VDEntry->eraseFromParent();
+ Entry->replaceAllUsesWith(NewFn);
+ Entry->eraseFromParent();
+ }
+ // Always add a deferred decl to emit once we have confirmed that the vector
+ // deleting destructor definition is required. That helps to enforce its
+ // generation even if the destructor is only declared.
+ addDeferredDeclToEmit(VectorDtorGD);
+}
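
For illustration, a source-level sketch of the situation the new RequireVectorDeletingDtor set tracks (example user code, not part of the patch): under the Microsoft C++ ABI, delete[] on a class with a virtual destructor dispatches through the vector deleting destructor, so its body has to be emitted once such a call is seen.

struct Widget {
  virtual ~Widget(); // virtual destructor => deleting destructors exist
};

void destroyAll(Widget *Arr) {
  delete[] Arr; // requires Widget's vector deleting destructor body
}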
diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h
index 0081bf5c4cf5f..0a697c84b66a7 100644
--- a/clang/lib/CodeGen/CodeGenModule.h
+++ b/clang/lib/CodeGen/CodeGenModule.h
@@ -529,6 +529,11 @@ class CodeGenModule : public CodeGenTypeCache {
/// that we don't re-emit the initializer.
llvm::DenseMap<const Decl*, unsigned> DelayedCXXInitPosition;
+ /// Remembers which types required a vector deleting destructor body. This
+ /// set contains classes that have a virtual destructor and for which new[]
+ /// was emitted.
+ llvm::SmallPtrSet<const CXXRecordDecl *, 16> RequireVectorDeletingDtor;
+
typedef std::pair<OrderGlobalInitsOrStermFinalizers, llvm::Function *>
GlobalInitData;
@@ -1578,6 +1583,13 @@ class CodeGenModule : public CodeGenTypeCache {
/// are emitted lazily.
void EmitGlobal(GlobalDecl D);
+ /// Record that new[] was called for the class and turn the vector deleting
+ /// destructor definition from an alias into an actual definition.
+ void requireVectorDestructorDefinition(const CXXRecordDecl *RD);
+
+ /// Check whether the class needs a vector deleting destructor body.
+ bool classNeedsVectorDestructor(const CXXRecordDecl *RD);
+
bool TryEmitBaseDestructorAsAlias(const CXXDestructorDecl *D);
void EmitDefinitionAsAlias(GlobalDecl Alias, GlobalDecl Target);
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 8a06051a1c730..6d21db0e26f5e 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -2979,8 +2979,12 @@ static void emitGlobalDtorWithCXAAtExit(CodeGenFunction &CGF,
/*IsVariadic=*/false, /*IsCXXMethod=*/false));
QualType fnType =
Context.getFunctionType(Context.VoidTy, {Context.VoidPtrTy}, EPI);
- llvm::Constant *dtorCallee = cast<llvm::Constant>(dtor.getCallee());
- dtorCallee = CGF.CGM.getFunctionPointer(dtorCallee, fnType);
+ llvm::Value *dtorCallee = dtor.getCallee();
+ dtorCallee =
+ CGF.CGM.getFunctionPointer(cast<llvm::Constant>(dtorCallee), fnType);
+
+ if (dtorCallee->getType()->getPointerAddressSpace() != AddrAS)
+ dtorCallee = CGF.performAddrSpaceCast(dtorCallee, AddrPtrTy);
if (!addr)
// addr is null when we are trying to register a dtor annotated with
diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
index 06fce6171eb28..d959b89f860e4 100644
--- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp
+++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -4107,7 +4107,7 @@ void MicrosoftCXXABI::emitCXXStructor(GlobalDecl GD) {
return;
if (GD.getDtorType() == Dtor_VectorDeleting &&
- !getContext().classNeedsVectorDeletingDestructor(dtor->getParent())) {
+ !CGM.classNeedsVectorDestructor(dtor->getParent())) {
// Create GlobalDecl object with the correct type for the scalar
// deleting destructor.
GlobalDecl ScalarDtorGD(dtor, Dtor_Deleting);
diff --git a/clang/lib/Driver/CMakeLists.txt b/clang/lib/Driver/CMakeLists.txt
index f5440a01a5932..e3eff21986181 100644
--- a/clang/lib/Driver/CMakeLists.txt
+++ b/clang/lib/Driver/CMakeLists.txt
@@ -14,6 +14,10 @@ if(WIN32)
set(system_libs version)
endif()
+if(CLANG_USE_XCSELECT)
+ set(system_libs xcselect)
+endif()
+
add_clang_library(clangDriver
Action.cpp
Compilation.cpp
@@ -101,6 +105,8 @@ add_clang_library(clangDriver
LINK_LIBS
clangBasic
clangFrontend
+ clangScalableStaticAnalysisFrameworkCore
+ clangScalableStaticAnalysisFrameworkFrontend
clangSerialization
clangLex
clangOptions
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index b7f65b7b74401..c559855ac91f5 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -69,6 +69,9 @@
#include "clang/Driver/Types.h"
#include "clang/Options/OptionUtils.h"
#include "clang/Options/Options.h"
+#include "clang/ScalableStaticAnalysisFramework/Core/Serialization/SerializationFormatRegistry.h"
+#include "clang/ScalableStaticAnalysisFramework/Core/TUSummary/ExtractorRegistry.h"
+#include "clang/ScalableStaticAnalysisFramework/SSAFForceLinker.h" // IWYU pragma: keep
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
@@ -2555,6 +2558,22 @@ bool Driver::HandleImmediateArgs(Compilation &C) {
return false;
}
+ // Honor --ssaf-list-extractors, --ssaf-list-formats and their combinations.
+ bool ListExtractors = C.getArgs().hasArg(options::OPT__ssaf_list_extractors);
+ bool ListFormats = C.getArgs().hasArg(options::OPT__ssaf_list_formats);
+ if (ListExtractors || ListFormats) {
+ if (ListExtractors)
+ ssaf::printAvailableTUSummaryExtractors(llvm::outs());
+ if (ListFormats)
+ ssaf::printAvailableFormats(llvm::outs());
+ return false;
+ }
+
if (C.getArgs().hasArg(options::OPT_v) ||
C.getArgs().hasArg(options::OPT__HASH_HASH_HASH) ||
C.getArgs().hasArg(options::OPT_print_supported_cpus) ||
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index ce1e971110caf..78010b9e659d8 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -108,6 +108,7 @@ enum CoverageFeature {
CoverageTraceLoads = 1 << 16,
CoverageTraceStores = 1 << 17,
CoverageControlFlow = 1 << 18,
+ CoverageTracePCEntryExit = 1 << 19,
};
enum BinaryMetadataFeature {
@@ -963,10 +964,10 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
}
int InsertionPointTypes = CoverageFunc | CoverageBB | CoverageEdge;
- int InstrumentationTypes = CoverageTracePC | CoverageTracePCGuard |
- CoverageInline8bitCounters | CoverageTraceLoads |
- CoverageTraceStores | CoverageInlineBoolFlag |
- CoverageControlFlow;
+ int InstrumentationTypes = CoverageTracePC | CoverageTracePCEntryExit |
+ CoverageTracePCGuard | CoverageInline8bitCounters |
+ CoverageTraceLoads | CoverageTraceStores |
+ CoverageInlineBoolFlag | CoverageControlFlow;
if ((CoverageFeatures & InsertionPointTypes) &&
!(CoverageFeatures & InstrumentationTypes) && DiagnoseErrors) {
D.Diag(clang::diag::warn_drv_deprecated_arg)
@@ -977,9 +978,9 @@ SanitizerArgs::SanitizerArgs(const ToolChain &TC,
// trace-pc w/o func/bb/edge implies edge.
if (!(CoverageFeatures & InsertionPointTypes)) {
- if (CoverageFeatures &
- (CoverageTracePC | CoverageTracePCGuard | CoverageInline8bitCounters |
- CoverageInlineBoolFlag | CoverageControlFlow))
+ if (CoverageFeatures & (CoverageTracePC | CoverageTracePCEntryExit |
+ CoverageTracePCGuard | CoverageInline8bitCounters |
+ CoverageInlineBoolFlag | CoverageControlFlow))
CoverageFeatures |= CoverageEdge;
if (CoverageFeatures & CoverageStackDepth)
@@ -1327,6 +1328,8 @@ void SanitizerArgs::addArgs(const ToolChain &TC, const llvm::opt::ArgList &Args,
std::make_pair(CoverageTraceGep, "-fsanitize-coverage-trace-gep"),
std::make_pair(Coverage8bitCounters, "-fsanitize-coverage-8bit-counters"),
std::make_pair(CoverageTracePC, "-fsanitize-coverage-trace-pc"),
+ std::make_pair(CoverageTracePCEntryExit,
+ "-fsanitize-coverage-trace-pc-entry-exit"),
std::make_pair(CoverageTracePCGuard,
"-fsanitize-coverage-trace-pc-guard"),
std::make_pair(CoverageInline8bitCounters,
@@ -1716,6 +1719,7 @@ int parseCoverageFeatures(const Driver &D, const llvm::opt::Arg *A,
.Case("trace-gep", CoverageTraceGep)
.Case("8bit-counters", Coverage8bitCounters)
.Case("trace-pc", CoverageTracePC)
+ .Case("trace-pc-entry-exit", CoverageTracePCEntryExit)
.Case("trace-pc-guard", CoverageTracePCGuard)
.Case("no-prune", CoverageNoPrune)
.Case("inline-8bit-counters", CoverageInline8bitCounters)
diff --git a/clang/lib/Driver/ToolChains/Arch/X86.cpp b/clang/lib/Driver/ToolChains/Arch/X86.cpp
index c113da6733370..5891b56999420 100644
--- a/clang/lib/Driver/ToolChains/Arch/X86.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/X86.cpp
@@ -265,13 +265,13 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
if (IsNegative) {
EGPROpt = EGPRFeature::Disabled;
Features.insert(Features.end(),
- {"-egpr", "-ndd", "-ccmp", "-nf", "-zu"});
+ {"-egpr", "-ndd", "-ccmp", "-nf", "-zu", "-jmpabs"});
if (!Triple.isOSWindows())
Features.insert(Features.end(), {"-push2pop2", "-ppx"});
} else {
EGPROpt = EGPRFeature::Enabled;
Features.insert(Features.end(),
- {"+egpr", "+ndd", "+ccmp", "+nf", "+zu"});
+ {"+egpr", "+ndd", "+ccmp", "+nf", "+zu", "+jmpabs"});
if (!Triple.isOSWindows())
Features.insert(Features.end(), {"+push2pop2", "+ppx"});
@@ -291,7 +291,7 @@ void x86::getX86TargetFeatures(const Driver &D, const llvm::Triple &Triple,
for (StringRef Value : A->getValues()) {
if (Value != "egpr" && Value != "push2pop2" && Value != "ppx" &&
Value != "ndd" && Value != "ccmp" && Value != "nf" &&
- Value != "cf" && Value != "zu")
+ Value != "cf" && Value != "zu" && Value != "jmpabs")
D.Diag(clang::diag::err_drv_unsupported_option_argument)
<< A->getSpelling() << Value;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index c5b75bdb2511f..3b852528d92c4 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7707,6 +7707,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
Args.AddLastArg(CmdArgs, options::OPT_fmax_tokens_EQ);
+ Args.AddLastArg(CmdArgs, options::OPT__ssaf_extract_summaries);
+ Args.AddLastArg(CmdArgs, options::OPT__ssaf_tu_summary_file);
+
// Handle serialized diagnostics.
if (Arg *A = Args.getLastArg(options::OPT__serialize_diags)) {
CmdArgs.push_back("-serialize-diagnostic-file");
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index 86810e1f28f36..7251f4a92d92d 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -27,6 +27,10 @@
#include "llvm/TargetParser/Triple.h"
#include <cstdlib> // ::getenv
+#ifdef CLANG_USE_XCSELECT
+#include <xcselect.h> // ::xcselect_host_sdk_path
+#endif
+
using namespace clang::driver;
using namespace clang::driver::tools;
using namespace clang::driver::toolchains;
@@ -2488,17 +2492,27 @@ void Darwin::AddDeploymentTarget(DerivedArgList &Args) const {
// Warn if the path does not exist.
if (!getVFS().exists(A->getValue()))
getDriver().Diag(clang::diag::warn_missing_sysroot) << A->getValue();
- } else {
- if (char *env = ::getenv("SDKROOT")) {
- // We only use this value as the default if it is an absolute path,
- // exists, and it is not the root path.
- if (llvm::sys::path::is_absolute(env) && getVFS().exists(env) &&
- StringRef(env) != "/") {
- Args.append(Args.MakeSeparateArg(
- nullptr, Opts.getOption(options::OPT_isysroot), env));
- }
+ } else if (const char *env = ::getenv("SDKROOT")) {
+ // We only use this value as the default if it is an absolute path,
+ // exists, and it is not the root path.
+ if (llvm::sys::path::is_absolute(env) && getVFS().exists(env) &&
+ StringRef(env) != "/") {
+ Args.append(Args.MakeSeparateArg(
+ nullptr, Opts.getOption(options::OPT_isysroot), env));
}
}
+#ifdef CLANG_USE_XCSELECT
+ // FIXME: This should check for `getTriple().isMacOSX()`, but this breaks
+ // many tests. See https://github.com/llvm/llvm-project/pull/119670.
+ else if (getTriple().getOS() == llvm::Triple::MacOSX) {
+ char *p;
+ if (!::xcselect_host_sdk_path(CLANG_XCSELECT_HOST_SDK_POLICY, &p)) {
+ Args.append(Args.MakeSeparateArg(
+ nullptr, Opts.getOption(options::OPT_isysroot), p));
+ ::free(p);
+ }
+ }
+#endif
// Read the SDKSettings.json file for more information, like the SDK version
// that we can pass down to the compiler.
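
For reference, a minimal sketch of the call pattern that hunk relies on, under the same assumptions it makes (xcselect_host_sdk_path returns zero on success and hands back a heap-allocated C string that the caller frees):

#ifdef CLANG_USE_XCSELECT
#include <xcselect.h>
#include <cstdlib>
#include <string>

static std::string hostSDKPathOrEmpty() {
  char *Path = nullptr;
  std::string Result;
  if (!::xcselect_host_sdk_path(CLANG_XCSELECT_HOST_SDK_POLICY, &Path)) {
    Result = Path; // copy out before releasing the C string
    ::free(Path);
  }
  return Result; // empty when no SDK could be selected
}
#endif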
diff --git a/clang/lib/Format/BreakableToken.cpp b/clang/lib/Format/BreakableToken.cpp
index 994a427517ffc..b60daffc0eb1c 100644
--- a/clang/lib/Format/BreakableToken.cpp
+++ b/clang/lib/Format/BreakableToken.cpp
@@ -1042,38 +1042,40 @@ BreakableComment::Split BreakableLineCommentSection::getReflowSplit(
void BreakableLineCommentSection::reflow(unsigned LineIndex,
WhitespaceManager &Whitespaces) const {
- if (LineIndex > 0 && Tokens[LineIndex] != Tokens[LineIndex - 1]) {
- // Reflow happens between tokens. Replace the whitespace between the
- // tokens by the empty string.
- Whitespaces.replaceWhitespace(
- *Tokens[LineIndex], /*Newlines=*/0, /*Spaces=*/0,
- /*StartOfTokenColumn=*/StartColumn, /*IsAligned=*/true,
- /*InPPDirective=*/false);
- } else if (LineIndex > 0) {
- // In case we're reflowing after the '\' in:
- //
- // // line comment \
- // // line 2
- //
- // the reflow happens inside the single comment token (it is a single line
- // comment with an unescaped newline).
- // Replace the whitespace between the '\' and '//' with the empty string.
- //
- // Offset points to after the '\' relative to start of the token.
- unsigned Offset = Lines[LineIndex - 1].data() +
- Lines[LineIndex - 1].size() -
- tokenAt(LineIndex - 1).TokenText.data();
- // WhitespaceLength is the number of chars between the '\' and the '//' on
- // the next line.
- unsigned WhitespaceLength =
- Lines[LineIndex].data() - tokenAt(LineIndex).TokenText.data() - Offset;
- Whitespaces.replaceWhitespaceInToken(*Tokens[LineIndex], Offset,
- /*ReplaceChars=*/WhitespaceLength,
- /*PreviousPostfix=*/"",
- /*CurrentPrefix=*/"",
- /*InPPDirective=*/false,
- /*Newlines=*/0,
- /*Spaces=*/0);
+ if (LineIndex > 0) {
+ if (Tokens[LineIndex] != Tokens[LineIndex - 1]) {
+ // Reflow happens between tokens. Replace the whitespace between the
+ // tokens by the empty string.
+ Whitespaces.replaceWhitespace(
+ *Tokens[LineIndex], /*Newlines=*/0, /*Spaces=*/0,
+ /*StartOfTokenColumn=*/StartColumn, /*IsAligned=*/true,
+ /*InPPDirective=*/false);
+ } else {
+ // In case we're reflowing after the '\' in:
+ //
+ // // line comment \
+ // // line 2
+ //
+ // the reflow happens inside the single comment token (it is a single line
+ // comment with an unescaped newline).
+ // Replace the whitespace between the '\' and '//' with the empty string.
+ //
+ // Offset points to after the '\' relative to start of the token.
+ unsigned Offset = Lines[LineIndex - 1].data() +
+ Lines[LineIndex - 1].size() -
+ tokenAt(LineIndex - 1).TokenText.data();
+ // WhitespaceLength is the number of chars between the '\' and the '//' on
+ // the next line.
+ unsigned WhitespaceLength = Lines[LineIndex].data() -
+ tokenAt(LineIndex).TokenText.data() - Offset;
+ Whitespaces.replaceWhitespaceInToken(*Tokens[LineIndex], Offset,
+ /*ReplaceChars=*/WhitespaceLength,
+ /*PreviousPostfix=*/"",
+ /*CurrentPrefix=*/"",
+ /*InPPDirective=*/false,
+ /*Newlines=*/0,
+ /*Spaces=*/0);
+ }
}
// Replace the indent and prefix of the token with the reflow prefix.
unsigned Offset =
@@ -1116,7 +1118,7 @@ void BreakableLineCommentSection::adaptStartOfLine(
/*Newlines=*/1,
/*Spaces=*/LineColumn,
/*StartOfTokenColumn=*/LineColumn,
- /*IsAligned=*/true,
+ /*IsAligned=*/tokenAt(0).NewlinesBefore == 0,
/*InPPDirective=*/false);
}
if (OriginalPrefix[LineIndex] != Prefix[LineIndex]) {
diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index 1eb17592a89e6..a388b74920e0b 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -710,7 +710,8 @@ bool ContinuationIndenter::mustBreak(const LineState &State) {
return false;
if (Tok->is(TT_TemplateCloser)) {
Tok = Tok->MatchingParen;
- assert(Tok);
+ if (!Tok)
+ return false;
}
if (Tok->FirstAfterPPLine)
return false;
diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp
index 47be33299eadb..148124b190f4a 100644
--- a/clang/lib/Format/Format.cpp
+++ b/clang/lib/Format/Format.cpp
@@ -731,15 +731,26 @@ template <> struct ScalarEnumerationTraits<FormatStyle::ShortBlockStyle> {
}
};
-template <> struct ScalarEnumerationTraits<FormatStyle::ShortFunctionStyle> {
- static void enumeration(IO &IO, FormatStyle::ShortFunctionStyle &Value) {
- IO.enumCase(Value, "None", FormatStyle::SFS_None);
- IO.enumCase(Value, "false", FormatStyle::SFS_None);
- IO.enumCase(Value, "All", FormatStyle::SFS_All);
- IO.enumCase(Value, "true", FormatStyle::SFS_All);
- IO.enumCase(Value, "Inline", FormatStyle::SFS_Inline);
- IO.enumCase(Value, "InlineOnly", FormatStyle::SFS_InlineOnly);
- IO.enumCase(Value, "Empty", FormatStyle::SFS_Empty);
+template <> struct MappingTraits<FormatStyle::ShortFunctionStyle> {
+ static void enumInput(IO &IO, FormatStyle::ShortFunctionStyle &Value) {
+ IO.enumCase(Value, "None", FormatStyle::ShortFunctionStyle());
+ IO.enumCase(Value, "Empty",
+ FormatStyle::ShortFunctionStyle::setEmptyOnly());
+ IO.enumCase(Value, "Inline",
+ FormatStyle::ShortFunctionStyle::setEmptyAndInline());
+ IO.enumCase(Value, "InlineOnly",
+ FormatStyle::ShortFunctionStyle::setInlineOnly());
+ IO.enumCase(Value, "All", FormatStyle::ShortFunctionStyle::setAll());
+
+ // For backward compatibility.
+ IO.enumCase(Value, "true", FormatStyle::ShortFunctionStyle::setAll());
+ IO.enumCase(Value, "false", FormatStyle::ShortFunctionStyle());
+ }
+
+ static void mapping(IO &IO, FormatStyle::ShortFunctionStyle &Value) {
+ IO.mapOptional("Empty", Value.Empty);
+ IO.mapOptional("Inline", Value.Inline);
+ IO.mapOptional("Other", Value.Other);
}
};
@@ -768,6 +779,15 @@ template <> struct ScalarEnumerationTraits<FormatStyle::ShortLambdaStyle> {
}
};
+template <> struct ScalarEnumerationTraits<FormatStyle::ShortRecordStyle> {
+ static void enumeration(IO &IO, FormatStyle::ShortRecordStyle &Value) {
+ IO.enumCase(Value, "Never", FormatStyle::SRS_Never);
+ IO.enumCase(Value, "EmptyAndAttached", FormatStyle::SRS_EmptyAndAttached);
+ IO.enumCase(Value, "Empty", FormatStyle::SRS_Empty);
+ IO.enumCase(Value, "Always", FormatStyle::SRS_Always);
+ }
+};
+
template <> struct MappingTraits<FormatStyle::SortIncludesOptions> {
static void enumInput(IO &IO, FormatStyle::SortIncludesOptions &Value) {
IO.enumCase(Value, "Never", FormatStyle::SortIncludesOptions({}));
@@ -1185,6 +1205,8 @@ template <> struct MappingTraits<FormatStyle> {
Style.AllowShortLoopsOnASingleLine);
IO.mapOptional("AllowShortNamespacesOnASingleLine",
Style.AllowShortNamespacesOnASingleLine);
+ IO.mapOptional("AllowShortRecordOnASingleLine",
+ Style.AllowShortRecordOnASingleLine);
IO.mapOptional("AlwaysBreakAfterDefinitionReturnType",
Style.AlwaysBreakAfterDefinitionReturnType);
IO.mapOptional("AlwaysBreakBeforeMultilineStrings",
@@ -1746,11 +1768,13 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) {
LLVMStyle.AllowShortCaseLabelsOnASingleLine = false;
LLVMStyle.AllowShortCompoundRequirementOnASingleLine = true;
LLVMStyle.AllowShortEnumsOnASingleLine = true;
- LLVMStyle.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_All;
+ LLVMStyle.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setAll();
LLVMStyle.AllowShortIfStatementsOnASingleLine = FormatStyle::SIS_Never;
LLVMStyle.AllowShortLambdasOnASingleLine = FormatStyle::SLS_All;
LLVMStyle.AllowShortLoopsOnASingleLine = false;
LLVMStyle.AllowShortNamespacesOnASingleLine = false;
+ LLVMStyle.AllowShortRecordOnASingleLine = FormatStyle::SRS_EmptyAndAttached;
LLVMStyle.AlwaysBreakAfterDefinitionReturnType = FormatStyle::DRTBS_None;
LLVMStyle.AlwaysBreakBeforeMultilineStrings = false;
LLVMStyle.AttributeMacros.push_back("__capability");
@@ -1987,11 +2011,6 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
GoogleStyle.IncludeStyle.IncludeIsMainRegex = "([-_](test|unittest))?$";
GoogleStyle.IndentCaseLabels = true;
GoogleStyle.KeepEmptyLines.AtStartOfBlock = false;
-
- GoogleStyle.Macros.push_back("ASSIGN_OR_RETURN(a, b)=a = (b)");
- GoogleStyle.Macros.push_back(
- "ASSIGN_OR_RETURN(a, b, c)=a = (b); if (x) return c");
-
GoogleStyle.ObjCBinPackProtocolList = FormatStyle::BPS_Never;
GoogleStyle.ObjCSpaceAfterProperty = false;
GoogleStyle.ObjCSpaceBeforeProtocolList = true;
@@ -2052,7 +2071,8 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
GoogleStyle.AlignOperands = FormatStyle::OAS_DontAlign;
GoogleStyle.AlignTrailingComments = {};
GoogleStyle.AlignTrailingComments.Kind = FormatStyle::TCAS_Never;
- GoogleStyle.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Empty;
+ GoogleStyle.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyOnly();
GoogleStyle.AllowShortIfStatementsOnASingleLine = FormatStyle::SIS_Never;
GoogleStyle.AlwaysBreakBeforeMultilineStrings = false;
GoogleStyle.BreakBeforeBinaryOperators = FormatStyle::BOS_NonAssignment;
@@ -2064,7 +2084,8 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
GoogleStyle.BreakAfterOpenBracketFunction = true;
GoogleStyle.BreakAfterOpenBracketIf = true;
GoogleStyle.AlignOperands = FormatStyle::OAS_DontAlign;
- GoogleStyle.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Empty;
+ GoogleStyle.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyOnly();
// TODO: still under discussion whether to switch to SLS_All.
GoogleStyle.AllowShortLambdasOnASingleLine = FormatStyle::SLS_Empty;
GoogleStyle.AlwaysBreakBeforeMultilineStrings = false;
@@ -2081,7 +2102,8 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
GoogleStyle.NamespaceIndentation = FormatStyle::NI_All;
GoogleStyle.SpacesInContainerLiterals = false;
} else if (Language == FormatStyle::LK_Proto) {
- GoogleStyle.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Empty;
+ GoogleStyle.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyOnly();
GoogleStyle.AlwaysBreakBeforeMultilineStrings = false;
// This affects protocol buffer options specifications and text protos.
// Text protos are currently mostly formatted inside C++ raw string literals
@@ -2101,7 +2123,8 @@ FormatStyle getGoogleStyle(FormatStyle::LanguageKind Language) {
GoogleStyle.IncludeStyle.IncludeBlocks =
tooling::IncludeStyle::IBS_Preserve;
} else if (Language == FormatStyle::LK_CSharp) {
- GoogleStyle.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Empty;
+ GoogleStyle.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyOnly();
GoogleStyle.AllowShortIfStatementsOnASingleLine = FormatStyle::SIS_Never;
GoogleStyle.BreakStringLiterals = false;
GoogleStyle.ColumnLimit = 100;
@@ -2159,7 +2182,8 @@ FormatStyle getChromiumStyle(FormatStyle::LanguageKind Language) {
ChromiumStyle.AllowShortLoopsOnASingleLine = false;
} else {
ChromiumStyle.AllowAllParametersOfDeclarationOnNextLine = false;
- ChromiumStyle.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Inline;
+ ChromiumStyle.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyAndInline();
ChromiumStyle.AllowShortIfStatementsOnASingleLine = FormatStyle::SIS_Never;
ChromiumStyle.AllowShortLoopsOnASingleLine = false;
ChromiumStyle.BinPackParameters = FormatStyle::BPPS_OnePerLine;
@@ -2173,7 +2197,8 @@ FormatStyle getChromiumStyle(FormatStyle::LanguageKind Language) {
FormatStyle getMozillaStyle() {
FormatStyle MozillaStyle = getLLVMStyle();
MozillaStyle.AllowAllParametersOfDeclarationOnNextLine = false;
- MozillaStyle.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Inline;
+ MozillaStyle.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyAndInline();
MozillaStyle.AlwaysBreakAfterDefinitionReturnType =
FormatStyle::DRTBS_TopLevel;
MozillaStyle.BinPackArguments = false;
@@ -2255,7 +2280,7 @@ FormatStyle getMicrosoftStyle(FormatStyle::LanguageKind Language) {
Style.BraceWrapping.BeforeWhile = false;
Style.PenaltyReturnTypeOnItsOwnLine = 1000;
Style.AllowShortEnumsOnASingleLine = false;
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
+ Style.AllowShortFunctionsOnASingleLine = FormatStyle::ShortFunctionStyle();
Style.AllowShortCaseLabelsOnASingleLine = false;
Style.AllowShortIfStatementsOnASingleLine = FormatStyle::SIS_Never;
Style.AllowShortLoopsOnASingleLine = false;
@@ -3825,8 +3850,10 @@ static void sortJavaImports(const FormatStyle &Style,
namespace {
-const char JavaImportRegexPattern[] =
- "^[\t ]*import[\t ]+(static[\t ]*)?([^\t ]*)[\t ]*;";
+constexpr StringRef
+ JavaImportRegexPattern("^import[\t ]+(static[\t ]*)?([^\t ]*)[\t ]*;");
+
+constexpr StringRef JavaPackageRegexPattern("^package[\t ]");
} // anonymous namespace
@@ -3835,26 +3862,43 @@ tooling::Replacements sortJavaImports(const FormatStyle &Style, StringRef Code,
StringRef FileName,
tooling::Replacements &Replaces) {
unsigned Prev = 0;
- unsigned SearchFrom = 0;
+ bool HasImport = false;
llvm::Regex ImportRegex(JavaImportRegexPattern);
+ llvm::Regex PackageRegex(JavaPackageRegexPattern);
SmallVector<StringRef, 4> Matches;
SmallVector<JavaImportDirective, 16> ImportsInBlock;
SmallVector<StringRef> AssociatedCommentLines;
- bool FormattingOff = false;
-
- for (;;) {
- auto Pos = Code.find('\n', SearchFrom);
- StringRef Line =
- Code.substr(Prev, (Pos != StringRef::npos ? Pos : Code.size()) - Prev);
+ for (bool FormattingOff = false;;) {
+ auto Pos = Code.find('\n', Prev);
+ auto GetLine = [&] {
+ return Code.substr(Prev,
+ (Pos != StringRef::npos ? Pos : Code.size()) - Prev);
+ };
+ StringRef Line = GetLine();
StringRef Trimmed = Line.trim();
- if (isClangFormatOff(Trimmed))
+ if (Trimmed.empty() || PackageRegex.match(Trimmed)) {
+ // Skip empty lines and package statements.
+ } else if (isClangFormatOff(Trimmed)) {
FormattingOff = true;
- else if (isClangFormatOn(Trimmed))
+ } else if (isClangFormatOn(Trimmed)) {
FormattingOff = false;
-
- if (ImportRegex.match(Line, &Matches)) {
+ } else if (Trimmed.starts_with("//")) {
+ // Associate comments within the imports with the nearest import below.
+ if (HasImport)
+ AssociatedCommentLines.push_back(Line);
+ } else if (Trimmed.starts_with("/*")) {
+ Pos = Code.find("*/", Pos + 2);
+ if (Pos != StringRef::npos)
+ Pos = Code.find('\n', Pos + 2);
+ if (HasImport) {
+ // Extend `Line` for a multiline comment to include all lines the
+ // comment spans.
+ Line = GetLine();
+ AssociatedCommentLines.push_back(Line);
+ }
+ } else if (ImportRegex.match(Trimmed, &Matches)) {
if (FormattingOff) {
// If at least one import line has formatting turned off, turn off
// formatting entirely.
@@ -3867,17 +3911,18 @@ tooling::Replacements sortJavaImports(const FormatStyle &Style, StringRef Code,
IsStatic = true;
ImportsInBlock.push_back(
{Identifier, Line, Prev, AssociatedCommentLines, IsStatic});
+ HasImport = true;
AssociatedCommentLines.clear();
- } else if (!Trimmed.empty() && !ImportsInBlock.empty()) {
- // Associating comments within the imports with the nearest import below
- AssociatedCommentLines.push_back(Line);
+ } else {
+ // `Trimmed` is not empty and is not a comment or a package/import
+ // statement.
+ break;
}
- Prev = Pos + 1;
if (Pos == StringRef::npos || Pos + 1 == Code.size())
break;
- SearchFrom = Pos + 1;
+ Prev = Pos + 1;
}
- if (!ImportsInBlock.empty())
+ if (HasImport)
sortJavaImports(Style, ImportsInBlock, Ranges, FileName, Code, Replaces);
return Replaces;
}
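
To make the new option shapes concrete, a hedged sketch of configuring them programmatically, using the factory names and enumerators introduced above (the exact field semantics of ShortFunctionStyle are an assumption based on the YAML mapping):

#include "clang/Format/Format.h"

clang::format::FormatStyle makeExampleStyle() {
  using clang::format::FormatStyle;
  FormatStyle Style = clang::format::getLLVMStyle();
  // Allow empty and inline member function bodies on one line, but not others.
  Style.AllowShortFunctionsOnASingleLine =
      FormatStyle::ShortFunctionStyle::setEmptyAndInline();
  // Keep only empty records (class/struct/union) on a single line.
  Style.AllowShortRecordOnASingleLine = FormatStyle::SRS_Empty;
  return Style;
}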
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 965c9195aa7f4..c32822ce90d1f 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -1246,13 +1246,18 @@ class AnnotatingParser {
OpeningBrace.overwriteFixedType(TT_DictLiteral);
}
}
+ bool IsBracedListComma = false;
if (CurrentToken->is(tok::comma)) {
if (Style.isJavaScript())
OpeningBrace.overwriteFixedType(TT_DictLiteral);
+ else
+ IsBracedListComma = OpeningBrace.is(BK_BracedInit);
++CommaCount;
}
if (!consumeToken())
return false;
+ if (IsBracedListComma)
+ Contexts.back().IsExpression = true;
}
return true;
}
@@ -3046,7 +3051,7 @@ class AnnotatingParser {
return TT_BinaryOperator;
if (NextToken->isOneOf(tok::arrow, tok::equal, tok::comma, tok::r_paren,
- TT_RequiresClause) ||
+ tok::semi, TT_RequiresClause) ||
(NextToken->is(tok::kw_noexcept) && !IsExpression) ||
NextToken->canBePointerOrReferenceQualifier() ||
(NextToken->is(tok::l_brace) && !NextToken->getNextNonComment())) {
@@ -3066,8 +3071,6 @@ class AnnotatingParser {
return TT_PointerOrReference;
if (NextToken->is(tok::kw_operator) && !IsExpression)
return TT_PointerOrReference;
- if (NextToken->isOneOf(tok::comma, tok::semi))
- return TT_PointerOrReference;
// After right braces, star tokens are likely to be pointers to struct,
// union, or class.
@@ -5803,11 +5806,10 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line,
if (Right.is(tok::r_brace) && Left.is(tok::l_brace) &&
!Left.Children.empty()) {
// Support AllowShortFunctionsOnASingleLine for JavaScript.
- return Style.AllowShortFunctionsOnASingleLine == FormatStyle::SFS_None ||
- Style.AllowShortFunctionsOnASingleLine == FormatStyle::SFS_Empty ||
- (Left.NestingLevel == 0 && Line.Level == 0 &&
- Style.AllowShortFunctionsOnASingleLine &
- FormatStyle::SFS_InlineOnly);
+ if (Left.NestingLevel == 0 && Line.Level == 0)
+ return !Style.AllowShortFunctionsOnASingleLine.Other;
+
+ return !Style.AllowShortFunctionsOnASingleLine.Inline;
}
} else if (Style.isJava()) {
if (Right.is(tok::plus) && Left.is(tok::string_literal) && AfterRight &&
@@ -6044,12 +6046,15 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line,
return true;
}
- // Don't attempt to interpret struct return types as structs.
+ // Don't attempt to interpret record return types as records.
if (Right.isNot(TT_FunctionLBrace)) {
- return (Line.startsWith(tok::kw_class) &&
- Style.BraceWrapping.AfterClass) ||
- (Line.startsWith(tok::kw_struct) &&
- Style.BraceWrapping.AfterStruct);
+ return Style.AllowShortRecordOnASingleLine == FormatStyle::SRS_Never &&
+ ((Line.startsWith(tok::kw_class) &&
+ Style.BraceWrapping.AfterClass) ||
+ (Line.startsWith(tok::kw_struct) &&
+ Style.BraceWrapping.AfterStruct) ||
+ (Line.startsWith(tok::kw_union) &&
+ Style.BraceWrapping.AfterUnion));
}
}
diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp
index 74c0f4bf75721..ef0e3123675cc 100644
--- a/clang/lib/Format/UnwrappedLineFormatter.cpp
+++ b/clang/lib/Format/UnwrappedLineFormatter.cpp
@@ -273,15 +273,22 @@ class LineJoiner {
}
}
+ // Try merging record blocks whose left brace has been wrapped onto the next
+ // line back into a single line.
+ if (NextLine.First->isOneOf(TT_ClassLBrace, TT_StructLBrace,
+ TT_UnionLBrace)) {
+ if (unsigned MergedLines = tryMergeRecord(I, E, Limit))
+ return MergedLines;
+ }
+
const auto *PreviousLine = I != AnnotatedLines.begin() ? I[-1] : nullptr;
- // Handle empty record blocks where the brace has already been wrapped.
+
+ // Handle blocks where the brace has already been wrapped.
if (PreviousLine && TheLine->Last->is(tok::l_brace) &&
TheLine->First == TheLine->Last) {
- bool EmptyBlock = NextLine.First->is(tok::r_brace);
+ const bool EmptyBlock = NextLine.First->is(tok::r_brace);
- const FormatToken *Tok = PreviousLine->First;
- if (Tok && Tok->is(tok::comment))
- Tok = Tok->getNextNonComment();
+ const FormatToken *Tok = PreviousLine->getFirstNonComment();
if (Tok && Tok->getNamespaceToken()) {
return !Style.BraceWrapping.SplitEmptyNamespace && EmptyBlock
@@ -291,9 +298,11 @@ class LineJoiner {
if (Tok && Tok->is(tok::kw_typedef))
Tok = Tok->getNextNonComment();
- if (Tok && Tok->isOneOf(tok::kw_class, tok::kw_struct, tok::kw_union,
- tok::kw_extern, Keywords.kw_interface,
- Keywords.kw_record)) {
+
+ if (Tok && Tok->isOneOf(tok::kw_class, tok::kw_struct, tok::kw_union))
+ return tryMergeRecord(I, E, Limit);
+
+ if (Tok && Tok->isOneOf(tok::kw_extern, Keywords.kw_interface)) {
return !Style.BraceWrapping.SplitEmptyRecord && EmptyBlock
? tryMergeSimpleBlock(I, E, Limit)
: 0;
@@ -307,15 +316,16 @@ class LineJoiner {
auto ShouldMergeShortFunctions = [this, &I, &NextLine, PreviousLine,
TheLine]() {
- if (Style.AllowShortFunctionsOnASingleLine == FormatStyle::SFS_All)
+ if (Style.AllowShortFunctionsOnASingleLine.isAll())
return true;
- if (Style.AllowShortFunctionsOnASingleLine >= FormatStyle::SFS_Empty &&
+
+ if (Style.AllowShortFunctionsOnASingleLine.Empty &&
NextLine.First->is(tok::r_brace)) {
return true;
}
- if (Style.AllowShortFunctionsOnASingleLine &
- FormatStyle::SFS_InlineOnly) {
+ if (Style.AllowShortFunctionsOnASingleLine.Inline &&
+ !Style.AllowShortFunctionsOnASingleLine.Other) {
// Just checking TheLine->Level != 0 is not enough, because it
// provokes treating functions inside indented namespaces as short.
if (Style.isJavaScript() && TheLine->Last->is(TT_FunctionLBrace))
@@ -485,7 +495,8 @@ class LineJoiner {
const FormatToken *PreviousPrevious =
Previous->getPreviousNonComment();
if (PreviousPrevious &&
- PreviousPrevious->isOneOf(tok::kw_class, tok::kw_struct)) {
+ PreviousPrevious->isOneOf(tok::kw_class, tok::kw_struct,
+ tok::kw_union)) {
return 0;
}
}
@@ -507,16 +518,12 @@ class LineJoiner {
} else if (TheLine->Last->is(TT_CompoundRequirementLBrace)) {
ShouldMerge = Style.AllowShortCompoundRequirementOnASingleLine;
} else if (TheLine->Last->isOneOf(TT_ClassLBrace, TT_StructLBrace,
- TT_RecordLBrace)) {
- // NOTE: We use AfterClass (whereas AfterStruct exists) for both classes
- // and structs, but it seems that wrapping is still handled correctly
- // elsewhere.
- ShouldMerge = !Style.BraceWrapping.AfterClass ||
- (NextLine.First->is(tok::r_brace) &&
- !Style.BraceWrapping.SplitEmptyRecord);
+ TT_UnionLBrace) ||
+ (TheLine->Last->is(TT_RecordLBrace) && Style.isJava())) {
+ return tryMergeRecord(I, E, Limit);
} else if (TheLine->InPPDirective ||
TheLine->First->isNoneOf(tok::kw_class, tok::kw_enum,
- tok::kw_struct, Keywords.kw_record)) {
+ tok::kw_struct, tok::kw_union)) {
// Try to merge a block with left brace unwrapped that wasn't yet
// covered.
ShouldMerge = !Style.BraceWrapping.AfterFunction ||
@@ -539,7 +546,7 @@ class LineJoiner {
unsigned MergedLines = 0;
if (MergeShortFunctions ||
- (Style.AllowShortFunctionsOnASingleLine >= FormatStyle::SFS_Empty &&
+ (Style.AllowShortFunctionsOnASingleLine.Empty &&
NextLine.First == NextLine.Last && I + 2 != E &&
I[2]->First->is(tok::r_brace))) {
MergedLines = tryMergeSimpleBlock(I + 1, E, Limit);
@@ -583,6 +590,73 @@ class LineJoiner {
return 0;
}
+ unsigned tryMergeRecord(ArrayRef<AnnotatedLine *>::const_iterator I,
+ ArrayRef<AnnotatedLine *>::const_iterator E,
+ unsigned Limit) {
+ const auto *Line = I[0];
+ const auto *NextLine = I[1];
+
+ // Current line begins both record and block, brace was not wrapped.
+ if (Line->Last->isOneOf(TT_ClassLBrace, TT_StructLBrace, TT_UnionLBrace)) {
+ auto ShouldWrapLBrace = [&](TokenType LBraceType) {
+ switch (LBraceType) {
+ case TT_ClassLBrace:
+ return Style.BraceWrapping.AfterClass;
+ case TT_StructLBrace:
+ return Style.BraceWrapping.AfterStruct;
+ case TT_UnionLBrace:
+ return Style.BraceWrapping.AfterUnion;
+ default:
+ return false;
+ }
+ };
+
+ auto TryMergeShortRecord = [&] {
+ switch (Style.AllowShortRecordOnASingleLine) {
+ case FormatStyle::SRS_Never:
+ return false;
+ case FormatStyle::SRS_Always:
+ return true;
+ default:
+ return NextLine->First->is(tok::r_brace);
+ }
+ };
+
+ if (Style.AllowShortRecordOnASingleLine != FormatStyle::SRS_Never &&
+ (!ShouldWrapLBrace(Line->Last->getType()) ||
+ (!Style.BraceWrapping.SplitEmptyRecord && TryMergeShortRecord()))) {
+ return tryMergeSimpleBlock(I, E, Limit);
+ }
+ }
+
+ // Cases where the l_brace was wrapped.
+ // Current line begins record, next line block.
+ if (NextLine->First->isOneOf(TT_ClassLBrace, TT_StructLBrace,
+ TT_UnionLBrace)) {
+ if (I + 2 == E || I[2]->First->is(tok::r_brace) ||
+ Style.AllowShortRecordOnASingleLine != FormatStyle::SRS_Always) {
+ return 0;
+ }
+
+ return tryMergeSimpleBlock(I, E, Limit);
+ }
+
+ // Previous line begins record, current line block.
+ if (I != AnnotatedLines.begin() &&
+ I[-1]->First->isOneOf(tok::kw_class, tok::kw_struct, tok::kw_union)) {
+ const bool IsEmptyBlock =
+ Line->Last->is(tok::l_brace) && NextLine->First->is(tok::r_brace);
+
+ if ((IsEmptyBlock && !Style.BraceWrapping.SplitEmptyRecord) ||
+ (!IsEmptyBlock &&
+ Style.AllowShortBlocksOnASingleLine == FormatStyle::SBS_Always)) {
+ return tryMergeSimpleBlock(I, E, Limit);
+ }
+ }
+
+ return 0;
+ }
+
unsigned
tryMergeSimplePPDirective(ArrayRef<AnnotatedLine *>::const_iterator I,
ArrayRef<AnnotatedLine *>::const_iterator E,
@@ -890,9 +964,16 @@ class LineJoiner {
return 1;
} else if (Limit != 0 && !Line.startsWithNamespace() &&
!startsExternCBlock(Line)) {
- // We don't merge short records.
- if (isRecordLBrace(*Line.Last))
+ // Merge short records only when requested.
+ if (Line.Last->isOneOf(TT_EnumLBrace, TT_RecordLBrace))
+ return 0;
+
+ if (Line.Last->isOneOf(TT_ClassLBrace, TT_StructLBrace,
+ TT_UnionLBrace) &&
+ Line.Last != Line.First &&
+ Style.AllowShortRecordOnASingleLine != FormatStyle::SRS_Always) {
return 0;
+ }
// Check that we still have three lines and they fit into the limit.
if (I + 2 == E || I[2]->Type == LT_Invalid)
@@ -945,9 +1026,17 @@ class LineJoiner {
return 0;
Limit -= 2;
unsigned MergedLines = 0;
- if (Style.AllowShortBlocksOnASingleLine != FormatStyle::SBS_Never ||
- (I[1]->First == I[1]->Last && I + 2 != E &&
- I[2]->First->is(tok::r_brace))) {
+
+ auto TryMergeBlock = [&] {
+ if (Style.AllowShortBlocksOnASingleLine != FormatStyle::SBS_Never ||
+ Style.AllowShortRecordOnASingleLine == FormatStyle::SRS_Always) {
+ return true;
+ }
+ return I[1]->First == I[1]->Last && I + 2 != E &&
+ I[2]->First->is(tok::r_brace);
+ };
+
+ if (TryMergeBlock()) {
MergedLines = tryMergeSimpleBlock(I + 1, E, Limit);
// If we managed to merge the block, count the statement header, which
// is on a separate line.
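
As a usage illustration of the record-merging paths above (the expected formatting is an assumption derived from the enumerator names, not output verified against the patch):

struct Empty {}; // kept on one line with SRS_Empty, SRS_EmptyAndAttached, or SRS_Always

struct Point {   // only merged onto one line with SRS_Always, column limit permitting
  int x;
  int y;
};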
diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index ddf584c6ed818..08c962dd34bbb 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -953,7 +953,8 @@ static bool isIIFE(const UnwrappedLine &Line,
static bool ShouldBreakBeforeBrace(const FormatStyle &Style,
const FormatToken &InitialToken,
- const bool IsJavaRecord) {
+ bool IsEmptyBlock,
+ bool IsJavaRecord = false) {
if (IsJavaRecord)
return Style.BraceWrapping.AfterClass;
@@ -961,15 +962,20 @@ static bool ShouldBreakBeforeBrace(const FormatStyle &Style,
if (InitialToken.is(TT_NamespaceMacro))
Kind = tok::kw_namespace;
+ const bool WrapRecordAllowed =
+ !IsEmptyBlock ||
+ Style.AllowShortRecordOnASingleLine < FormatStyle::SRS_Empty ||
+ Style.BraceWrapping.SplitEmptyRecord;
+
switch (Kind) {
case tok::kw_namespace:
return Style.BraceWrapping.AfterNamespace;
case tok::kw_class:
- return Style.BraceWrapping.AfterClass;
+ return Style.BraceWrapping.AfterClass && WrapRecordAllowed;
case tok::kw_union:
- return Style.BraceWrapping.AfterUnion;
+ return Style.BraceWrapping.AfterUnion && WrapRecordAllowed;
case tok::kw_struct:
- return Style.BraceWrapping.AfterStruct;
+ return Style.BraceWrapping.AfterStruct && WrapRecordAllowed;
case tok::kw_enum:
return Style.BraceWrapping.AfterEnum;
default:
@@ -1141,12 +1147,6 @@ void UnwrappedLineParser::parsePPElse() {
void UnwrappedLineParser::parsePPEndIf() {
conditionalCompilationEnd();
parsePPUnknown();
- // If the #endif of a potential include guard is the last thing in the file,
- // then we found an include guard.
- if (IncludeGuard == IG_Defined && PPBranchLevel == -1 && Tokens->isEOF() &&
- getIncludeGuardState(Style.IndentPPDirectives) == IG_Inited) {
- IncludeGuard = IG_Found;
- }
}
void UnwrappedLineParser::parsePPDefine() {
@@ -3208,8 +3208,10 @@ void UnwrappedLineParser::parseNamespace() {
if (FormatTok->is(tok::l_brace)) {
FormatTok->setFinalizedType(TT_NamespaceLBrace);
- if (ShouldBreakBeforeBrace(Style, InitialToken, /*IsJavaRecord=*/false))
+ if (ShouldBreakBeforeBrace(Style, InitialToken,
+ Tokens->peekNextToken()->is(tok::r_brace))) {
addUnwrappedLine();
+ }
unsigned AddLevels =
Style.NamespaceIndentation == FormatStyle::NI_All ||
@@ -3872,7 +3874,8 @@ bool UnwrappedLineParser::parseEnum() {
}
if (!Style.AllowShortEnumsOnASingleLine &&
- ShouldBreakBeforeBrace(Style, InitialToken, /*IsJavaRecord=*/false)) {
+ ShouldBreakBeforeBrace(Style, InitialToken,
+ Tokens->peekNextToken()->is(tok::r_brace))) {
addUnwrappedLine();
}
// Parse enum body.
@@ -4167,8 +4170,11 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr, bool IsJavaRecord) {
if (ParseAsExpr) {
parseChildBlock();
} else {
- if (ShouldBreakBeforeBrace(Style, InitialToken, IsJavaRecord))
+ if (ShouldBreakBeforeBrace(Style, InitialToken,
+ Tokens->peekNextToken()->is(tok::r_brace),
+ IsJavaRecord)) {
addUnwrappedLine();
+ }
unsigned AddLevels = Style.IndentAccessModifiers ? 2u : 1u;
parseBlock(/*MustBeDeclaration=*/true, AddLevels, /*MunchSemi=*/false);
@@ -4940,10 +4946,20 @@ void UnwrappedLineParser::readToken(int LevelDifference) {
assert(Line->Level >= Line->UnbracedBodyLevel);
Line->Level -= Line->UnbracedBodyLevel;
flushComments(isOnNewLine(*FormatTok));
+ const bool IsEndIf = Tokens->peekNextToken()->is(tok::pp_endif);
parsePPDirective();
PreviousWasComment = FormatTok->is(tok::comment);
FirstNonCommentOnLine = IsFirstNonCommentOnLine(
FirstNonCommentOnLine, *FormatTok, PreviousWasComment);
+ // If the #endif of a potential include guard is the last thing in the
+ // file, then we found an include guard.
+ if (IsEndIf && IncludeGuard == IG_Defined && PPBranchLevel == -1 &&
+ getIncludeGuardState(Style.IndentPPDirectives) == IG_Inited &&
+ (eof() ||
+ (PreviousWasComment &&
+ Tokens->peekNextToken(/*SkipComment=*/true)->is(tok::eof)))) {
+ IncludeGuard = IG_Found;
+ }
}
if (!PPStack.empty() && (PPStack.back().Kind == PP_Unreachable) &&
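
For illustration, the kind of header the relaxed include-guard detection above is meant to accept, where the final #endif is followed only by a comment (example file contents, not from the patch):

#ifndef EXAMPLE_UTIL_H
#define EXAMPLE_UTIL_H

int answer();

#endif // EXAMPLE_UTIL_H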
diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp
index 947578cbff9ad..0a79e708269e2 100644
--- a/clang/lib/Format/WhitespaceManager.cpp
+++ b/clang/lib/Format/WhitespaceManager.cpp
@@ -284,6 +284,43 @@ void WhitespaceManager::calculateLineBreakInformation() {
}
}
+// Sets the spaces in front of a Change, and updates the start/end columns of
+// subsequent tokens so that trailing comments and escaped newlines can be
+// aligned properly.
+static void
+SetChangeSpaces(unsigned Start, unsigned Spaces,
+ MutableArrayRef<WhitespaceManager::Change> Changes) {
+ auto &FirstChange = Changes[Start];
+ const int ColumnChange = Spaces - FirstChange.Spaces;
+
+ if (ColumnChange == 0)
+ return;
+
+ FirstChange.Spaces += ColumnChange;
+ FirstChange.StartOfTokenColumn += ColumnChange;
+
+ for (auto I = Start + 1; I < Changes.size(); I++) {
+ auto &Change = Changes[I];
+
+ Change.PreviousEndOfTokenColumn += ColumnChange;
+
+ if (Change.NewlinesBefore > 0)
+ break;
+
+ Change.StartOfTokenColumn += ColumnChange;
+ }
+}
+
+// Changes the spaces in front of a change by Delta, and updates the start/end
+// columns of subsequent tokens so that trailing comments and escaped newlines
+// can be aligned properly.
+static void
+IncrementChangeSpaces(unsigned Start, int Delta,
+ MutableArrayRef<WhitespaceManager::Change> Changes) {
+ assert(Delta > 0 || (abs(Delta) <= Changes[Start].Spaces));
+ SetChangeSpaces(Start, Changes[Start].Spaces + Delta, Changes);
+}
+
// Align a single sequence of tokens, see AlignTokens below.
// Column - The tokens indexed in Matches are moved to this column.
// RightJustify - Whether it is the token's right end or left end that gets
@@ -295,9 +332,6 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End,
SmallVector<WhitespaceManager::Change, 16> &Changes) {
unsigned OriginalMatchColumn = 0;
int Shift = 0;
- // Set when the shift is applied anywhere in the line. Cleared when the line
- // ends.
- bool LineShifted = false;
// ScopeStack keeps track of the current scope depth. It contains the levels
// of at most 2 scopes. The first one is the one that the matched token is
@@ -347,11 +381,8 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End,
(CurrentChange.indentAndNestingLevel() == ScopeStack[0] &&
CurrentChange.IndentedFromColumn >= OriginalMatchColumn));
- if (CurrentChange.NewlinesBefore > 0) {
- LineShifted = false;
- if (!InsideNestedScope)
- Shift = 0;
- }
+ if (CurrentChange.NewlinesBefore > 0 && !InsideNestedScope)
+ Shift = 0;
// If this is the first matching token to be aligned, remember by how many
// spaces it has to be shifted, so the rest of the changes on the line are
@@ -372,9 +403,8 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End,
if ((!Matches.empty() && Matches[0] == i) ||
(ScopeStack.size() == 1u && CurrentChange.NewlinesBefore > 0 &&
InsideNestedScope)) {
- LineShifted = true;
CurrentChange.IndentedFromColumn += Shift;
- CurrentChange.Spaces += Shift;
+ IncrementChangeSpaces(i, Shift, Changes);
}
// We should not remove required spaces unless we break the line before.
@@ -383,12 +413,6 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End,
static_cast<int>(Changes[i].Tok->SpacesRequiredBefore) ||
CurrentChange.Tok->is(tok::eof));
- if (LineShifted) {
- CurrentChange.StartOfTokenColumn += Shift;
- if (i + 1 != Changes.size())
- Changes[i + 1].PreviousEndOfTokenColumn += Shift;
- }
-
// If PointerAlignment is PAS_Right, keep *s or &s next to the token,
// except if the token is equal, then a space is needed.
if ((Style.PointerAlignment == FormatStyle::PAS_Right ||
@@ -409,9 +433,9 @@ AlignTokenSequence(const FormatStyle &Style, unsigned Start, unsigned End,
} else if (Style.PointerAlignment != FormatStyle::PAS_Right) {
continue;
}
- Changes[Previous + 1].Spaces -= Shift;
- Changes[Previous].Spaces += Shift;
- Changes[Previous].StartOfTokenColumn += Shift;
+
+ IncrementChangeSpaces(Previous + 1, -Shift, Changes);
+ IncrementChangeSpaces(Previous, Shift, Changes);
}
}
}
@@ -692,27 +716,19 @@ static void AlignMatchingTokenSequence(
SmallVector<WhitespaceManager::Change, 16> &Changes) {
if (StartOfSequence > 0 && StartOfSequence < EndOfSequence) {
bool FoundMatchOnLine = false;
- int Shift = 0;
for (unsigned I = StartOfSequence; I != EndOfSequence; ++I) {
- if (Changes[I].NewlinesBefore > 0) {
- Shift = 0;
+ if (Changes[I].NewlinesBefore > 0)
FoundMatchOnLine = false;
- }
// If this is the first matching token to be aligned, remember by how many
// spaces it has to be shifted, so the rest of the changes on the line are
// shifted by the same amount.
if (!FoundMatchOnLine && Matches(Changes[I])) {
FoundMatchOnLine = true;
- Shift = MinColumn - Changes[I].StartOfTokenColumn;
- Changes[I].Spaces += Shift;
+ int Shift = MinColumn - Changes[I].StartOfTokenColumn;
+ IncrementChangeSpaces(I, Shift, Changes);
}
-
- assert(Shift >= 0);
- Changes[I].StartOfTokenColumn += Shift;
- if (I + 1 != Changes.size())
- Changes[I + 1].PreviousEndOfTokenColumn += Shift;
}
}
@@ -1064,7 +1080,10 @@ void WhitespaceManager::alignTrailingComments() {
// leave the comments.
if (RestoredLineLength >= Style.ColumnLimit && Style.ColumnLimit > 0)
break;
- C.Spaces = C.NewlinesBefore > 0 ? C.Tok->OriginalColumn : OriginalSpaces;
+
+ int Spaces =
+ C.NewlinesBefore > 0 ? C.Tok->OriginalColumn : OriginalSpaces;
+ setChangeSpaces(I, Spaces);
continue;
}
@@ -1185,10 +1204,8 @@ void WhitespaceManager::alignTrailingComments(unsigned Start, unsigned End,
}
if (Shift <= 0)
continue;
- Changes[i].Spaces += Shift;
- if (i + 1 != Changes.size())
- Changes[i + 1].PreviousEndOfTokenColumn += Shift;
- Changes[i].StartOfTokenColumn += Shift;
+
+ setChangeSpaces(i, Changes[i].Spaces + Shift);
}
}
@@ -1294,8 +1311,8 @@ void WhitespaceManager::alignArrayInitializersRightJustified(
do {
const FormatToken *Previous = Changes[Next->Index].Tok->Previous;
if (Previous && Previous->isNot(TT_LineComment)) {
- Changes[Next->Index].Spaces = BracePadding;
Changes[Next->Index].NewlinesBefore = 0;
+ setChangeSpaces(Next->Index, BracePadding);
}
Next = Next->NextColumnElement;
} while (Next);
@@ -1308,7 +1325,7 @@ void WhitespaceManager::alignArrayInitializersRightJustified(
Cells.begin(), CellIter, CellDescs.InitialSpaces,
CellDescs.CellCounts[0], CellDescs.CellCounts.size());
if (ThisNetWidth < MaxNetWidth)
- Changes[CellIter->Index].Spaces = (MaxNetWidth - ThisNetWidth);
+ setChangeSpaces(CellIter->Index, MaxNetWidth - ThisNetWidth);
auto RowCount = 1U;
auto Offset = std::distance(Cells.begin(), CellIter);
for (const auto *Next = CellIter->NextColumnElement; Next;
@@ -1319,7 +1336,7 @@ void WhitespaceManager::alignArrayInitializersRightJustified(
auto *End = Start + Offset;
ThisNetWidth = getNetWidth(Start, End, CellDescs.InitialSpaces);
if (ThisNetWidth < MaxNetWidth)
- Changes[Next->Index].Spaces = (MaxNetWidth - ThisNetWidth);
+ setChangeSpaces(Next->Index, MaxNetWidth - ThisNetWidth);
++RowCount;
}
}
@@ -1328,8 +1345,10 @@ void WhitespaceManager::alignArrayInitializersRightJustified(
calculateCellWidth(CellIter->Index, CellIter->EndIndex, true) +
NetWidth;
if (Changes[CellIter->Index].NewlinesBefore == 0) {
- Changes[CellIter->Index].Spaces = (CellWidth - (ThisWidth + NetWidth));
- Changes[CellIter->Index].Spaces += (i > 0) ? 1 : BracePadding;
+ int Spaces = (CellWidth - (ThisWidth + NetWidth));
+ Spaces += (i > 0) ? 1 : BracePadding;
+
+ setChangeSpaces(CellIter->Index, Spaces);
}
alignToStartOfCell(CellIter->Index, CellIter->EndIndex);
for (const auto *Next = CellIter->NextColumnElement; Next;
@@ -1337,8 +1356,10 @@ void WhitespaceManager::alignArrayInitializersRightJustified(
ThisWidth =
calculateCellWidth(Next->Index, Next->EndIndex, true) + NetWidth;
if (Changes[Next->Index].NewlinesBefore == 0) {
- Changes[Next->Index].Spaces = (CellWidth - ThisWidth);
- Changes[Next->Index].Spaces += (i > 0) ? 1 : BracePadding;
+ int Spaces = (CellWidth - ThisWidth);
+ Spaces += (i > 0) ? 1 : BracePadding;
+
+ setChangeSpaces(Next->Index, Spaces);
}
alignToStartOfCell(Next->Index, Next->EndIndex);
}
@@ -1360,8 +1381,9 @@ void WhitespaceManager::alignArrayInitializersLeftJustified(
// The first cell of every row needs to be against the left brace.
for (const auto *Next = CellIter; Next; Next = Next->NextColumnElement) {
auto &Change = Changes[Next->Index];
- Change.Spaces =
+ int Spaces =
Change.NewlinesBefore == 0 ? BracePadding : CellDescs.InitialSpaces;
+ setChangeSpaces(Next->Index, Spaces);
}
++CellIter;
for (auto i = 1U; i < CellDescs.CellCounts[0]; i++, ++CellIter) {
@@ -1371,10 +1393,11 @@ void WhitespaceManager::alignArrayInitializersLeftJustified(
auto ThisNetWidth =
getNetWidth(Cells.begin(), CellIter, CellDescs.InitialSpaces);
if (Changes[CellIter->Index].NewlinesBefore == 0) {
- Changes[CellIter->Index].Spaces =
+ int Spaces =
MaxNetWidth - ThisNetWidth +
(Changes[CellIter->Index].Tok->isNot(tok::r_brace) ? 1
: BracePadding);
+ setChangeSpaces(CellIter->Index, Spaces);
}
auto RowCount = 1U;
auto Offset = std::distance(Cells.begin(), CellIter);
@@ -1386,9 +1409,10 @@ void WhitespaceManager::alignArrayInitializersLeftJustified(
auto *End = Start + Offset;
auto ThisNetWidth = getNetWidth(Start, End, CellDescs.InitialSpaces);
if (Changes[Next->Index].NewlinesBefore == 0) {
- Changes[Next->Index].Spaces =
+ int Spaces =
MaxNetWidth - ThisNetWidth +
(Changes[Next->Index].Tok->isNot(tok::r_brace) ? 1 : BracePadding);
+ setChangeSpaces(Next->Index, Spaces);
}
++RowCount;
}
@@ -1466,11 +1490,11 @@ WhitespaceManager::CellDescriptions WhitespaceManager::getCells(unsigned Start,
Changes[j].Tok->isNot(tok::r_brace)) {
Changes[j].NewlinesBefore = 1;
// Account for the added token lengths
- Changes[j].Spaces = InitialSpaces - InitialTokenLength;
+ setChangeSpaces(j, InitialSpaces - InitialTokenLength);
}
} else if (C.Tok->is(tok::comment) && C.Tok->NewlinesBefore == 0) {
// Trailing comments stay at a space past the last token
- C.Spaces = Changes[i - 1].Tok->is(tok::comma) ? 1 : 2;
+ setChangeSpaces(i, Changes[i - 1].Tok->is(tok::comma) ? 1 : 2);
} else if (C.Tok->is(tok::l_brace)) {
// We need to make sure that the ending braces is aligned to the
// start of our initializer
@@ -1481,7 +1505,7 @@ WhitespaceManager::CellDescriptions WhitespaceManager::getCells(unsigned Start,
}
} else if (Depth == 0 && C.Tok->is(tok::r_brace)) {
C.NewlinesBefore = 1;
- C.Spaces = EndSpaces;
+ setChangeSpaces(i, EndSpaces);
}
if (C.Tok->StartsColumn) {
// This gets us past tokens that have been split over multiple
@@ -1509,12 +1533,12 @@ WhitespaceManager::CellDescriptions WhitespaceManager::getCells(unsigned Start,
auto LineLimit = Changes[j].Spaces + Changes[j].TokenLength;
if (LineLimit < Style.ColumnLimit) {
Changes[i].NewlinesBefore = 0;
- Changes[i].Spaces = 1;
+ setChangeSpaces(i, 1);
}
}
}
while (Changes[i].NewlinesBefore > 0 && Changes[i].Tok == C.Tok) {
- Changes[i].Spaces = InitialSpaces;
+ setChangeSpaces(i, InitialSpaces);
++i;
HasSplit = true;
}
@@ -1546,7 +1570,7 @@ void WhitespaceManager::alignToStartOfCell(unsigned Start, unsigned End) {
// is aligned to the parent
for (auto i = Start + 1; i < End; i++)
if (Changes[i].NewlinesBefore > 0)
- Changes[i].Spaces = Changes[Start].Spaces;
+ setChangeSpaces(i, Changes[Start].Spaces);
}
WhitespaceManager::CellDescriptions
@@ -1565,6 +1589,10 @@ WhitespaceManager::linkCells(CellDescriptions &&CellDesc) {
return std::move(CellDesc);
}
+void WhitespaceManager::setChangeSpaces(unsigned Start, unsigned Spaces) {
+ SetChangeSpaces(Start, Spaces, Changes);
+}
+
void WhitespaceManager::generateChanges() {
for (unsigned i = 0, e = Changes.size(); i != e; ++i) {
const Change &C = Changes[i];
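
For reference, the lines removed earlier in this hunk show exactly the bookkeeping the new helpers centralize; a minimal sketch of what SetChangeSpaces/IncrementChangeSpaces would have to do (signatures and bodies inferred from this hunk, not copied from the patch):

  static void
  IncrementChangeSpaces(unsigned I, int Shift,
                        llvm::SmallVectorImpl<WhitespaceManager::Change> &Changes) {
    // Keep the three pieces of bookkeeping in sync that used to be updated
    // by hand at every call site.
    Changes[I].Spaces += Shift;
    Changes[I].StartOfTokenColumn += Shift;
    if (I + 1 != Changes.size())
      Changes[I + 1].PreviousEndOfTokenColumn += Shift;
  }

  static void
  SetChangeSpaces(unsigned I, unsigned Spaces,
                  llvm::SmallVectorImpl<WhitespaceManager::Change> &Changes) {
    // Setting an absolute value is an increment by the difference.
    IncrementChangeSpaces(I, static_cast<int>(Spaces) -
                                 static_cast<int>(Changes[I].Spaces),
                          Changes);
  }
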
diff --git a/clang/lib/Format/WhitespaceManager.h b/clang/lib/Format/WhitespaceManager.h
index 64a8f9b4fa857..9b6cde54af0af 100644
--- a/clang/lib/Format/WhitespaceManager.h
+++ b/clang/lib/Format/WhitespaceManager.h
@@ -357,6 +357,8 @@ class WhitespaceManager {
/// Link the Cell pointers in the list of Cells.
static CellDescriptions linkCells(CellDescriptions &&CellDesc);
+ void setChangeSpaces(unsigned Start, unsigned Spaces);
+
/// Fill \c Replaces with the replacements for all effective changes.
void generateChanges();
diff --git a/clang/lib/FrontendTool/CMakeLists.txt b/clang/lib/FrontendTool/CMakeLists.txt
index 66213f76eb968..a451eb967e904 100644
--- a/clang/lib/FrontendTool/CMakeLists.txt
+++ b/clang/lib/FrontendTool/CMakeLists.txt
@@ -4,6 +4,8 @@ set(LLVM_LINK_COMPONENTS
)
set(link_libs
+ clangScalableStaticAnalysisFrameworkCore
+ clangScalableStaticAnalysisFrameworkFrontend
clangBasic
clangCodeGen
clangDriver
diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
index c8ad63bc989a4..e4622496758ac 100644
--- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
+++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp
@@ -23,6 +23,8 @@
#include "clang/FrontendTool/Utils.h"
#include "clang/Options/Options.h"
#include "clang/Rewrite/Frontend/FrontendActions.h"
+#include "clang/ScalableStaticAnalysisFramework/Frontend/TUSummaryExtractorFrontendAction.h"
+#include "clang/ScalableStaticAnalysisFramework/SSAFForceLinker.h" // IWYU pragma: keep
#include "clang/StaticAnalyzer/Frontend/AnalyzerHelpFlags.h"
#include "clang/StaticAnalyzer/Frontend/FrontendActions.h"
#include "llvm/Option/OptTable.h"
@@ -207,6 +209,10 @@ CreateFrontendAction(CompilerInstance &CI) {
Act = std::make_unique<ASTMergeAction>(std::move(Act),
FEOpts.ASTMergeFiles);
+ if (!FEOpts.SSAFTUSummaryFile.empty()) {
+ Act = std::make_unique<ssaf::TUSummaryExtractorFrontendAction>(
+ std::move(Act));
+ }
return Act;
}
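
The new hook wraps whatever frontend action was built above whenever a TU summary file is requested. As a reference for the pattern only (the real wrapper is TUSummaryExtractorFrontendAction, added later in this patch), a WrapperFrontendAction that multiplexes an extra consumer next to the wrapped action's consumer looks roughly like this; ExtraConsumer is a hypothetical stand-in:

  #include "clang/AST/ASTConsumer.h"
  #include "clang/Frontend/FrontendAction.h"
  #include "clang/Frontend/MultiplexConsumer.h"
  #include <memory>
  #include <vector>

  namespace {
  // Hypothetical consumer standing in for the summary extractor.
  struct ExtraConsumer : clang::ASTConsumer {};

  class WrappingAction : public clang::WrapperFrontendAction {
  public:
    using WrapperFrontendAction::WrapperFrontendAction;

    std::unique_ptr<clang::ASTConsumer>
    CreateASTConsumer(clang::CompilerInstance &CI,
                      llvm::StringRef InFile) override {
      // Build the wrapped action's consumer, then run ours alongside it.
      auto Inner = WrapperFrontendAction::CreateASTConsumer(CI, InFile);
      if (!Inner)
        return nullptr;
      std::vector<std::unique_ptr<clang::ASTConsumer>> Consumers;
      Consumers.push_back(std::move(Inner));
      Consumers.push_back(std::make_unique<ExtraConsumer>());
      return std::make_unique<clang::MultiplexConsumer>(std::move(Consumers));
    }
  };
  } // namespace
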
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp
index 0bb503484299d..29397d67b5bcc 100644
--- a/clang/lib/Parse/ParseOpenMP.cpp
+++ b/clang/lib/Parse/ParseOpenMP.cpp
@@ -1758,14 +1758,14 @@ parseOpenMPSimpleClause(Parser &P, OpenMPClauseKind Kind) {
void Parser::ParseOMPDeclareTargetClauses(
SemaOpenMP::DeclareTargetContextInfo &DTCI) {
SourceLocation DeviceTypeLoc;
- bool RequiresToOrLinkOrIndirectClause = false;
- bool HasToOrLinkOrIndirectClause = false;
+ bool RequiresToLinkLocalOrIndirectClause = false;
+ bool HasToLinkLocalOrIndirectClause = false;
while (Tok.isNot(tok::annot_pragma_openmp_end)) {
OMPDeclareTargetDeclAttr::MapTypeTy MT = OMPDeclareTargetDeclAttr::MT_To;
bool HasIdentifier = Tok.is(tok::identifier);
if (HasIdentifier) {
- // If we see any clause we need a to or link clause.
- RequiresToOrLinkOrIndirectClause = true;
+      // If we see any clause, we need a to, link, or local clause.

+ RequiresToLinkLocalOrIndirectClause = true;
IdentifierInfo *II = Tok.getIdentifierInfo();
StringRef ClauseName = II->getName();
bool IsDeviceTypeClause =
@@ -1774,6 +1774,7 @@ void Parser::ParseOMPDeclareTargetClauses(
bool IsIndirectClause = getLangOpts().OpenMP >= 51 &&
getOpenMPClauseKind(ClauseName) == OMPC_indirect;
+
if (DTCI.Indirect && IsIndirectClause) {
unsigned OMPVersion = Actions.getLangOpts().OpenMP;
Diag(Tok, diag::err_omp_more_one_clause)
@@ -1781,9 +1782,9 @@ void Parser::ParseOMPDeclareTargetClauses(
<< getOpenMPClauseName(OMPC_indirect) << 0;
break;
}
- bool IsToEnterOrLinkClause =
+ bool IsToEnterLinkOrLocalClause =
OMPDeclareTargetDeclAttr::ConvertStrToMapTypeTy(ClauseName, MT);
- assert((!IsDeviceTypeClause || !IsToEnterOrLinkClause) &&
+ assert((!IsDeviceTypeClause || !IsToEnterLinkOrLocalClause) &&
"Cannot be both!");
// Starting with OpenMP 5.2 the `to` clause has been replaced by the
@@ -1797,25 +1798,42 @@ void Parser::ParseOMPDeclareTargetClauses(
break;
}
+ // The 'local' clause is only available in OpenMP 6.0.
+ if (getLangOpts().OpenMP < 60 && ClauseName == "local") {
+ Diag(Tok, getLangOpts().OpenMP >= 52
+ ? diag::err_omp_declare_target_unexpected_clause_52
+ : diag::err_omp_declare_target_unexpected_clause)
+ << ClauseName
+ << (getLangOpts().OpenMP >= 51 ? 4
+ : getLangOpts().OpenMP >= 50 ? 2
+ : 1);
+ break;
+ }
+
if (!IsDeviceTypeClause && !IsIndirectClause &&
DTCI.Kind == OMPD_begin_declare_target) {
- Diag(Tok, diag::err_omp_declare_target_unexpected_clause)
+ Diag(Tok, getLangOpts().OpenMP >= 52
+ ? diag::err_omp_declare_target_unexpected_clause_52
+ : diag::err_omp_declare_target_unexpected_clause)
<< ClauseName << (getLangOpts().OpenMP >= 51 ? 3 : 0);
break;
}
- if (!IsDeviceTypeClause && !IsToEnterOrLinkClause && !IsIndirectClause) {
+
+ if (!IsDeviceTypeClause && !IsToEnterLinkOrLocalClause &&
+ !IsIndirectClause) {
Diag(Tok, getLangOpts().OpenMP >= 52
? diag::err_omp_declare_target_unexpected_clause_52
: diag::err_omp_declare_target_unexpected_clause)
<< ClauseName
- << (getLangOpts().OpenMP >= 51
- ? 4
- : getLangOpts().OpenMP >= 50 ? 2 : 1);
+ << (getLangOpts().OpenMP > 52 ? 5
+ : getLangOpts().OpenMP >= 51 ? 4
+ : getLangOpts().OpenMP >= 50 ? 2
+ : 1);
break;
}
- if (IsToEnterOrLinkClause || IsIndirectClause)
- HasToOrLinkOrIndirectClause = true;
+ if (IsToEnterLinkOrLocalClause || IsIndirectClause)
+ HasToLinkLocalOrIndirectClause = true;
if (IsIndirectClause) {
if (!ParseOpenMPIndirectClause(DTCI, /*ParseOnly*/ false))
@@ -1892,14 +1910,14 @@ void Parser::ParseOMPDeclareTargetClauses(
if (DTCI.Indirect && DTCI.DT != OMPDeclareTargetDeclAttr::DT_Any)
Diag(DeviceTypeLoc, diag::err_omp_declare_target_indirect_device_type);
- // For declare target require at least 'to' or 'link' to be present.
- if (DTCI.Kind == OMPD_declare_target && RequiresToOrLinkOrIndirectClause &&
- !HasToOrLinkOrIndirectClause)
- Diag(DTCI.Loc,
- getLangOpts().OpenMP >= 52
- ? diag::err_omp_declare_target_missing_enter_or_link_clause
- : diag::err_omp_declare_target_missing_to_or_link_clause)
- << (getLangOpts().OpenMP >= 51 ? 1 : 0);
+  // 'declare target' with clauses requires at least one to/link/local/indirect clause.
+ if (DTCI.Kind == OMPD_declare_target && RequiresToLinkLocalOrIndirectClause &&
+ !HasToLinkLocalOrIndirectClause)
+ Diag(DTCI.Loc, diag::err_omp_declare_target_missing_required_clause)
+ << (getLangOpts().OpenMP >= 60 ? 3
+ : getLangOpts().OpenMP == 52 ? 2
+ : getLangOpts().OpenMP == 51 ? 1
+ : 0);
SkipUntil(tok::annot_pragma_openmp_end, StopBeforeMatch);
}
diff --git a/clang/lib/ScalableStaticAnalysisFramework/CMakeLists.txt b/clang/lib/ScalableStaticAnalysisFramework/CMakeLists.txt
index 194a13a1af845..d3d75430233fe 100644
--- a/clang/lib/ScalableStaticAnalysisFramework/CMakeLists.txt
+++ b/clang/lib/ScalableStaticAnalysisFramework/CMakeLists.txt
@@ -1 +1,2 @@
add_subdirectory(Core)
+add_subdirectory(Frontend)
diff --git a/clang/lib/ScalableStaticAnalysisFramework/Core/Serialization/JSONFormat/JSONFormatImpl.cpp b/clang/lib/ScalableStaticAnalysisFramework/Core/Serialization/JSONFormat/JSONFormatImpl.cpp
index 0f1b9ccf6258e..4072532d4972c 100644
--- a/clang/lib/ScalableStaticAnalysisFramework/Core/Serialization/JSONFormat/JSONFormatImpl.cpp
+++ b/clang/lib/ScalableStaticAnalysisFramework/Core/Serialization/JSONFormat/JSONFormatImpl.cpp
@@ -9,18 +9,17 @@
#include "JSONFormatImpl.h"
#include "clang/ScalableStaticAnalysisFramework/Core/Serialization/SerializationFormatRegistry.h"
-#include "clang/ScalableStaticAnalysisFramework/Core/TUSummary/TUSummary.h"
#include "llvm/Support/Registry.h"
+// NOLINTNEXTLINE(misc-use-internal-linkage)
+volatile int SSAFJSONFormatAnchorSource = 0;
LLVM_INSTANTIATE_REGISTRY(llvm::Registry<clang::ssaf::JSONFormat::FormatInfo>)
static clang::ssaf::SerializationFormatRegistry::Add<clang::ssaf::JSONFormat>
- RegisterJSONFormat("JSON", "JSON serialization format");
+ RegisterJSONFormat("json", "JSON serialization format");
namespace clang::ssaf {
-void initializeJSONFormat() {}
-
//----------------------------------------------------------------------------
// JSON Reader and Writer
//----------------------------------------------------------------------------
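
The anchor variable added above implements the usual force-link idiom: this TU only registers the JSON format through a static registry entry, so without an external reference the static library member could be dead-stripped. A sketch of the other half of the idiom, assuming SSAFForceLinker.h (included from ExecuteCompilerInvocation.cpp in this patch, contents not shown here) looks roughly like this:

  // SSAFForceLinker.h (contents assumed, not shown in this patch)
  extern volatile int SSAFJSONFormatAnchorSource;
  namespace {
  // Touching the anchor from a TU that is always linked pulls in
  // JSONFormatImpl.o and, with it, the registry entry it defines.
  struct SSAFForceLinker {
    SSAFForceLinker() { (void)SSAFJSONFormatAnchorSource; }
  } SSAFForceLinkerInstance;
  } // namespace
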
diff --git a/clang/lib/ScalableStaticAnalysisFramework/Core/Serialization/SerializationFormatRegistry.cpp b/clang/lib/ScalableStaticAnalysisFramework/Core/Serialization/SerializationFormatRegistry.cpp
index 15e0d497181aa..0eb86f2fdee2f 100644
--- a/clang/lib/ScalableStaticAnalysisFramework/Core/Serialization/SerializationFormatRegistry.cpp
+++ b/clang/lib/ScalableStaticAnalysisFramework/Core/Serialization/SerializationFormatRegistry.cpp
@@ -29,3 +29,9 @@ ssaf::makeFormat(llvm::StringRef FormatName) {
assert(false && "Unknown SerializationFormat name");
return nullptr;
}
+
+void ssaf::printAvailableFormats(llvm::raw_ostream &OS) {
+ OS << "OVERVIEW: Available SSAF serialization formats:\n\n";
+ for (const auto &Entry : SerializationFormatRegistry::entries())
+ OS << " " << Entry.getName() << " - " << Entry.getDesc() << "\n";
+}
diff --git a/clang/lib/ScalableStaticAnalysisFramework/Core/SummaryData/LUSummaryConsumer.cpp b/clang/lib/ScalableStaticAnalysisFramework/Core/SummaryData/LUSummaryConsumer.cpp
index e6f21d8c54799..79b1e23bdd9e8 100644
--- a/clang/lib/ScalableStaticAnalysisFramework/Core/SummaryData/LUSummaryConsumer.cpp
+++ b/clang/lib/ScalableStaticAnalysisFramework/Core/SummaryData/LUSummaryConsumer.cpp
@@ -55,7 +55,7 @@ LUSummaryConsumer::run(llvm::ArrayRef<SummaryName> Names) {
}
Store.Data.emplace(SN, std::move(*Result));
}
- return Store;
+ return std::move(Store);
}
SummaryDataStore LUSummaryConsumer::run() && {
diff --git a/clang/lib/ScalableStaticAnalysisFramework/Core/TUSummary/ExtractorRegistry.cpp b/clang/lib/ScalableStaticAnalysisFramework/Core/TUSummary/ExtractorRegistry.cpp
index 3dcfef7fc7b88..0275e92af24e8 100644
--- a/clang/lib/ScalableStaticAnalysisFramework/Core/TUSummary/ExtractorRegistry.cpp
+++ b/clang/lib/ScalableStaticAnalysisFramework/Core/TUSummary/ExtractorRegistry.cpp
@@ -33,3 +33,9 @@ ssaf::makeTUSummaryExtractor(llvm::StringRef SummaryName,
assert(false && "Unknown SummaryExtractor name");
return nullptr;
}
+
+void ssaf::printAvailableTUSummaryExtractors(llvm::raw_ostream &OS) {
+ OS << "OVERVIEW: Available SSAF summary extractors:\n\n";
+ for (const auto &Entry : TUSummaryExtractorRegistry::entries())
+ OS << " " << Entry.getName() << " - " << Entry.getDesc() << "\n";
+}
diff --git a/clang/lib/ScalableStaticAnalysisFramework/Frontend/CMakeLists.txt b/clang/lib/ScalableStaticAnalysisFramework/Frontend/CMakeLists.txt
new file mode 100644
index 0000000000000..b90d9c0ded1a9
--- /dev/null
+++ b/clang/lib/ScalableStaticAnalysisFramework/Frontend/CMakeLists.txt
@@ -0,0 +1,14 @@
+set(LLVM_LINK_COMPONENTS
+ Support
+ )
+
+add_clang_library(clangScalableStaticAnalysisFrameworkFrontend
+ TUSummaryExtractorFrontendAction.cpp
+
+ LINK_LIBS
+ clangAST
+ clangBasic
+ clangFrontend
+ clangScalableStaticAnalysisFrameworkCore
+ clangSema
+ )
diff --git a/clang/lib/ScalableStaticAnalysisFramework/Frontend/TUSummaryExtractorFrontendAction.cpp b/clang/lib/ScalableStaticAnalysisFramework/Frontend/TUSummaryExtractorFrontendAction.cpp
new file mode 100644
index 0000000000000..9a75b20fa548b
--- /dev/null
+++ b/clang/lib/ScalableStaticAnalysisFramework/Frontend/TUSummaryExtractorFrontendAction.cpp
@@ -0,0 +1,181 @@
+//===- TUSummaryExtractorFrontendAction.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/ScalableStaticAnalysisFramework/Frontend/TUSummaryExtractorFrontendAction.h"
+#include "clang/AST/ASTConsumer.h"
+#include "clang/Basic/DiagnosticFrontend.h"
+#include "clang/Frontend/MultiplexConsumer.h"
+#include "clang/ScalableStaticAnalysisFramework/Core/Serialization/SerializationFormatRegistry.h"
+#include "clang/ScalableStaticAnalysisFramework/Core/TUSummary/ExtractorRegistry.h"
+#include "clang/ScalableStaticAnalysisFramework/Core/TUSummary/TUSummary.h"
+#include "clang/ScalableStaticAnalysisFramework/Core/TUSummary/TUSummaryBuilder.h"
+#include "clang/ScalableStaticAnalysisFramework/Core/TUSummary/TUSummaryExtractor.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Path.h"
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace clang;
+using namespace ssaf;
+
+static std::optional<std::pair<llvm::StringRef, llvm::StringRef>>
+parseOutputFileFormatAndPathOrReportError(DiagnosticsEngine &Diags,
+ StringRef SSAFTUSummaryFile) {
+
+ StringRef Ext = llvm::sys::path::extension(SSAFTUSummaryFile);
+ StringRef FilePath = SSAFTUSummaryFile.drop_back(Ext.size());
+
+ if (!Ext.consume_front(".") || FilePath.empty()) {
+ Diags.Report(diag::warn_ssaf_extract_tu_summary_file_unknown_format)
+ << SSAFTUSummaryFile;
+ return std::nullopt;
+ }
+
+ if (!isFormatRegistered(Ext)) {
+ Diags.Report(diag::warn_ssaf_extract_tu_summary_file_unknown_output_format)
+ << Ext << SSAFTUSummaryFile;
+ return std::nullopt;
+ }
+
+ return std::pair{Ext, FilePath};
+}
+
+/// Return \c true if any unrecognized extractor names were reported.
+static bool
+reportUnrecognizedExtractorNames(DiagnosticsEngine &Diags,
+ ArrayRef<std::string> SSAFExtractSummaries) {
+ if (SSAFExtractSummaries.empty()) {
+ Diags.Report(diag::warn_ssaf_must_enable_summary_extractors);
+ return true;
+ }
+
+ std::vector<StringRef> UnrecognizedExtractorNames;
+ for (StringRef Name : SSAFExtractSummaries)
+ if (!isTUSummaryExtractorRegistered(Name))
+ UnrecognizedExtractorNames.push_back(Name);
+
+ if (!UnrecognizedExtractorNames.empty()) {
+ Diags.Report(diag::warn_ssaf_extract_summary_unknown_extractor_name)
+ << UnrecognizedExtractorNames.size()
+ << llvm::join(UnrecognizedExtractorNames, ", ");
+ return true;
+ }
+
+ return false;
+}
+
+static std::vector<std::unique_ptr<ASTConsumer>>
+makeTUSummaryExtractors(TUSummaryBuilder &Builder,
+ ArrayRef<std::string> SSAFExtractSummaries) {
+ std::vector<std::unique_ptr<ASTConsumer>> Extractors;
+ Extractors.reserve(SSAFExtractSummaries.size());
+ for (StringRef Name : SSAFExtractSummaries) {
+ assert(isTUSummaryExtractorRegistered(Name));
+ Extractors.push_back(makeTUSummaryExtractor(Name, Builder));
+ }
+ return Extractors;
+}
+
+namespace {
+
+/// Drives all extractor \c ASTConsumers and serializes the completed
+/// \c TUSummary.
+///
+/// Derives from \c MultiplexConsumer so every \c ASTConsumer virtual method is
+/// automatically forwarded to each extractor.
+class TUSummaryRunner final : public MultiplexConsumer {
+public:
+ static std::unique_ptr<TUSummaryRunner> create(CompilerInstance &CI,
+ StringRef InFile);
+
+private:
+ TUSummaryRunner(StringRef InFile, std::unique_ptr<SerializationFormat> Format,
+ const FrontendOptions &Opts);
+
+ void HandleTranslationUnit(ASTContext &Ctx) override;
+
+ TUSummary Summary;
+ TUSummaryBuilder Builder = TUSummaryBuilder(Summary);
+ std::unique_ptr<SerializationFormat> Format;
+ const FrontendOptions &Opts;
+};
+} // namespace
+
+std::unique_ptr<TUSummaryRunner> TUSummaryRunner::create(CompilerInstance &CI,
+ StringRef InFile) {
+ const FrontendOptions &Opts = CI.getFrontendOpts();
+ DiagnosticsEngine &Diags = CI.getDiagnostics();
+
+ auto MaybePair =
+ parseOutputFileFormatAndPathOrReportError(Diags, Opts.SSAFTUSummaryFile);
+ if (!MaybePair.has_value())
+ return nullptr;
+ auto [FormatName, OutputPath] = MaybePair.value();
+
+ if (reportUnrecognizedExtractorNames(Diags, Opts.SSAFExtractSummaries))
+ return nullptr;
+
+ return std::unique_ptr<TUSummaryRunner>{
+ new TUSummaryRunner{InFile, makeFormat(FormatName), Opts}};
+}
+
+TUSummaryRunner::TUSummaryRunner(StringRef InFile,
+ std::unique_ptr<SerializationFormat> Format,
+ const FrontendOptions &Opts)
+ : MultiplexConsumer(std::vector<std::unique_ptr<ASTConsumer>>{}),
+ Summary(BuildNamespace(BuildNamespaceKind::CompilationUnit, InFile)),
+ Format(std::move(Format)), Opts(Opts) {
+ assert(this->Format);
+
+  // Now that the Summary and the Builder are constructed, we can also
+  // construct the extractors.
+ auto Extractors = makeTUSummaryExtractors(Builder, Opts.SSAFExtractSummaries);
+ assert(!Extractors.empty());
+
+ // We must initialize the Consumers here because our extractors need a
+  // Builder that holds a reference to the TUSummary, which would only be
+  // initialized after the MultiplexConsumer ctor. This is the only way to
+  // avoid using the TUSummary before its lifetime has started.
+ MultiplexConsumer::Consumers = std::move(Extractors);
+}
+
+void TUSummaryRunner::HandleTranslationUnit(ASTContext &Ctx) {
+ // First, invoke the Summary Extractors.
+ MultiplexConsumer::HandleTranslationUnit(Ctx);
+
+ // Then serialize the result.
+ if (auto Err = Format->writeTUSummary(Summary, Opts.SSAFTUSummaryFile)) {
+ Ctx.getDiagnostics().Report(diag::warn_ssaf_write_tu_summary_failed)
+ << Opts.SSAFTUSummaryFile << llvm::toString(std::move(Err));
+ }
+}
+
+TUSummaryExtractorFrontendAction::~TUSummaryExtractorFrontendAction() = default;
+
+TUSummaryExtractorFrontendAction::TUSummaryExtractorFrontendAction(
+ std::unique_ptr<FrontendAction> WrappedAction)
+ : WrapperFrontendAction(std::move(WrappedAction)) {}
+
+std::unique_ptr<ASTConsumer>
+TUSummaryExtractorFrontendAction::CreateASTConsumer(CompilerInstance &CI,
+ StringRef InFile) {
+ auto WrappedConsumer = WrapperFrontendAction::CreateASTConsumer(CI, InFile);
+ if (!WrappedConsumer)
+ return nullptr;
+
+ if (auto Runner = TUSummaryRunner::create(CI, InFile)) {
+ std::vector<std::unique_ptr<ASTConsumer>> Consumers;
+ Consumers.reserve(2);
+ Consumers.push_back(std::move(WrappedConsumer));
+ Consumers.push_back(std::move(Runner));
+ return std::make_unique<MultiplexConsumer>(std::move(Consumers));
+ }
+ return WrappedConsumer;
+}
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
index 2824bf61526b7..e2bd8d7956561 100644
--- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "clang/Sema/AnalysisBasedWarnings.h"
+#include "SemaLifetimeSafety.h"
#include "TypeLocBuilder.h"
#include "clang/AST/Decl.h"
#include "clang/AST/DeclCXX.h"
@@ -31,7 +32,6 @@
#include "clang/Analysis/Analyses/CFGReachabilityAnalysis.h"
#include "clang/Analysis/Analyses/CalledOnceCheck.h"
#include "clang/Analysis/Analyses/Consumed.h"
-#include "clang/Analysis/Analyses/LifetimeSafety/LifetimeAnnotations.h"
#include "clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h"
#include "clang/Analysis/Analyses/ReachableCode.h"
#include "clang/Analysis/Analyses/ThreadSafety.h"
@@ -2872,205 +2872,6 @@ class CallableVisitor : public DynamicRecursiveASTVisitor {
}
};
-namespace clang::lifetimes {
-namespace {
-class LifetimeSafetySemaHelperImpl : public LifetimeSafetySemaHelper {
-
-public:
- LifetimeSafetySemaHelperImpl(Sema &S) : S(S) {}
-
- void reportUseAfterFree(const Expr *IssueExpr, const Expr *UseExpr,
- const Expr *MovedExpr,
- SourceLocation FreeLoc) override {
- S.Diag(IssueExpr->getExprLoc(),
- MovedExpr ? diag::warn_lifetime_safety_use_after_scope_moved
- : diag::warn_lifetime_safety_use_after_scope)
- << IssueExpr->getSourceRange();
- if (MovedExpr)
- S.Diag(MovedExpr->getExprLoc(), diag::note_lifetime_safety_moved_here)
- << MovedExpr->getSourceRange();
- S.Diag(FreeLoc, diag::note_lifetime_safety_destroyed_here);
- S.Diag(UseExpr->getExprLoc(), diag::note_lifetime_safety_used_here)
- << UseExpr->getSourceRange();
- }
-
- void reportUseAfterReturn(const Expr *IssueExpr, const Expr *ReturnExpr,
- const Expr *MovedExpr,
- SourceLocation ExpiryLoc) override {
- S.Diag(IssueExpr->getExprLoc(),
- MovedExpr ? diag::warn_lifetime_safety_return_stack_addr_moved
- : diag::warn_lifetime_safety_return_stack_addr)
- << IssueExpr->getSourceRange();
- if (MovedExpr)
- S.Diag(MovedExpr->getExprLoc(), diag::note_lifetime_safety_moved_here)
- << MovedExpr->getSourceRange();
- S.Diag(ReturnExpr->getExprLoc(), diag::note_lifetime_safety_returned_here)
- << ReturnExpr->getSourceRange();
- }
-
- void reportDanglingField(const Expr *IssueExpr,
- const FieldDecl *DanglingField,
- const Expr *MovedExpr,
- SourceLocation ExpiryLoc) override {
- S.Diag(IssueExpr->getExprLoc(),
- MovedExpr ? diag::warn_lifetime_safety_dangling_field_moved
- : diag::warn_lifetime_safety_dangling_field)
- << IssueExpr->getSourceRange();
- if (MovedExpr)
- S.Diag(MovedExpr->getExprLoc(), diag::note_lifetime_safety_moved_here)
- << MovedExpr->getSourceRange();
- S.Diag(DanglingField->getLocation(),
- diag::note_lifetime_safety_dangling_field_here)
- << DanglingField->getEndLoc();
- }
-
- void reportDanglingGlobal(const Expr *IssueExpr,
- const VarDecl *DanglingGlobal,
- const Expr *MovedExpr,
- SourceLocation ExpiryLoc) override {
- S.Diag(IssueExpr->getExprLoc(),
- MovedExpr ? diag::warn_lifetime_safety_dangling_global_moved
- : diag::warn_lifetime_safety_dangling_global)
- << IssueExpr->getSourceRange();
- if (MovedExpr)
- S.Diag(MovedExpr->getExprLoc(), diag::note_lifetime_safety_moved_here)
- << MovedExpr->getSourceRange();
- if (DanglingGlobal->isStaticLocal() || DanglingGlobal->isStaticDataMember())
- S.Diag(DanglingGlobal->getLocation(),
- diag::note_lifetime_safety_dangling_static_here)
- << DanglingGlobal->getEndLoc();
- else
- S.Diag(DanglingGlobal->getLocation(),
- diag::note_lifetime_safety_dangling_global_here)
- << DanglingGlobal->getEndLoc();
- }
-
- void reportUseAfterInvalidation(const Expr *IssueExpr, const Expr *UseExpr,
- const Expr *InvalidationExpr) override {
- S.Diag(IssueExpr->getExprLoc(), diag::warn_lifetime_safety_invalidation)
- << false << IssueExpr->getSourceRange();
- S.Diag(InvalidationExpr->getExprLoc(),
- diag::note_lifetime_safety_invalidated_here)
- << InvalidationExpr->getSourceRange();
- S.Diag(UseExpr->getExprLoc(), diag::note_lifetime_safety_used_here)
- << UseExpr->getSourceRange();
- }
- void reportUseAfterInvalidation(const ParmVarDecl *PVD, const Expr *UseExpr,
- const Expr *InvalidationExpr) override {
- S.Diag(PVD->getSourceRange().getBegin(),
- diag::warn_lifetime_safety_invalidation)
- << true << PVD->getSourceRange();
- S.Diag(InvalidationExpr->getExprLoc(),
- diag::note_lifetime_safety_invalidated_here)
- << InvalidationExpr->getSourceRange();
- S.Diag(UseExpr->getExprLoc(), diag::note_lifetime_safety_used_here)
- << UseExpr->getSourceRange();
- }
-
- void suggestLifetimeboundToParmVar(SuggestionScope Scope,
- const ParmVarDecl *ParmToAnnotate,
- const Expr *EscapeExpr) override {
- unsigned DiagID =
- (Scope == SuggestionScope::CrossTU)
- ? diag::warn_lifetime_safety_cross_tu_param_suggestion
- : diag::warn_lifetime_safety_intra_tu_param_suggestion;
- SourceLocation InsertionPoint = Lexer::getLocForEndOfToken(
- ParmToAnnotate->getEndLoc(), 0, S.getSourceManager(), S.getLangOpts());
- StringRef FixItText = " [[clang::lifetimebound]]";
- if (!ParmToAnnotate->getIdentifier()) {
- // For unnamed parameters, placing attributes after the type would be
- // parsed as a type attribute, not a parameter attribute.
- InsertionPoint = ParmToAnnotate->getBeginLoc();
- FixItText = "[[clang::lifetimebound]] ";
- }
- S.Diag(ParmToAnnotate->getBeginLoc(), DiagID)
- << ParmToAnnotate->getSourceRange()
- << FixItHint::CreateInsertion(InsertionPoint, FixItText);
- S.Diag(EscapeExpr->getBeginLoc(),
- diag::note_lifetime_safety_suggestion_returned_here)
- << EscapeExpr->getSourceRange();
- }
-
- void suggestLifetimeboundToImplicitThis(SuggestionScope Scope,
- const CXXMethodDecl *MD,
- const Expr *EscapeExpr) override {
- unsigned DiagID = (Scope == SuggestionScope::CrossTU)
- ? diag::warn_lifetime_safety_cross_tu_this_suggestion
- : diag::warn_lifetime_safety_intra_tu_this_suggestion;
- const auto MDL = MD->getTypeSourceInfo()->getTypeLoc();
- SourceLocation InsertionPoint = Lexer::getLocForEndOfToken(
- MDL.getEndLoc(), 0, S.getSourceManager(), S.getLangOpts());
- if (const auto *FPT = MD->getType()->getAs<FunctionProtoType>();
- FPT && FPT->hasTrailingReturn()) {
- // For trailing return types, 'getEndLoc()' includes the return type
- // after '->', placing the attribute in an invalid position.
- // Instead use 'getLocalRangeEnd()' which gives the '->' location
- // for trailing returns, so find the last token before it.
- const auto FTL = MDL.getAs<FunctionTypeLoc>();
- assert(FTL);
- InsertionPoint = Lexer::getLocForEndOfToken(
- Lexer::findPreviousToken(FTL.getLocalRangeEnd(), S.getSourceManager(),
- S.getLangOpts(),
- /*IncludeComments=*/false)
- ->getLocation(),
- 0, S.getSourceManager(), S.getLangOpts());
- }
- S.Diag(InsertionPoint, DiagID)
- << MD->getNameInfo().getSourceRange()
- << FixItHint::CreateInsertion(InsertionPoint,
- " [[clang::lifetimebound]]");
- S.Diag(EscapeExpr->getBeginLoc(),
- diag::note_lifetime_safety_suggestion_returned_here)
- << EscapeExpr->getSourceRange();
- }
-
- void reportNoescapeViolation(const ParmVarDecl *ParmWithNoescape,
- const Expr *EscapeExpr) override {
- S.Diag(ParmWithNoescape->getBeginLoc(),
- diag::warn_lifetime_safety_noescape_escapes)
- << ParmWithNoescape->getSourceRange();
-
- S.Diag(EscapeExpr->getBeginLoc(),
- diag::note_lifetime_safety_suggestion_returned_here)
- << EscapeExpr->getSourceRange();
- }
-
- void reportNoescapeViolation(const ParmVarDecl *ParmWithNoescape,
- const FieldDecl *EscapeField) override {
- S.Diag(ParmWithNoescape->getBeginLoc(),
- diag::warn_lifetime_safety_noescape_escapes)
- << ParmWithNoescape->getSourceRange();
-
- S.Diag(EscapeField->getLocation(),
- diag::note_lifetime_safety_escapes_to_field_here)
- << EscapeField->getEndLoc();
- }
-
- void reportNoescapeViolation(const ParmVarDecl *ParmWithNoescape,
- const VarDecl *EscapeGlobal) override {
- S.Diag(ParmWithNoescape->getBeginLoc(),
- diag::warn_lifetime_safety_noescape_escapes)
- << ParmWithNoescape->getSourceRange();
- if (EscapeGlobal->isStaticLocal() || EscapeGlobal->isStaticDataMember())
- S.Diag(EscapeGlobal->getLocation(),
- diag::note_lifetime_safety_escapes_to_static_storage_here)
- << EscapeGlobal->getEndLoc();
- else
- S.Diag(EscapeGlobal->getLocation(),
- diag::note_lifetime_safety_escapes_to_global_here)
- << EscapeGlobal->getEndLoc();
- }
-
- void addLifetimeBoundToImplicitThis(const CXXMethodDecl *MD) override {
- S.addLifetimeBoundToImplicitThis(const_cast<CXXMethodDecl *>(MD));
- }
-
-private:
- Sema &S;
-};
-} // namespace
-} // namespace clang::lifetimes
-
static void
LifetimeSafetyTUAnalysis(Sema &S, TranslationUnitDecl *TU,
clang::lifetimes::LifetimeSafetyStats &LSStats) {
@@ -3201,23 +3002,10 @@ void clang::sema::AnalysisBasedWarnings::IssueWarnings(
AC.getCFGBuildOptions().AddCXXNewAllocator = false;
AC.getCFGBuildOptions().AddCXXDefaultInitExprInCtors = true;
- bool IsLifetimeSafetyDiagnosticEnabled =
- !Diags.isIgnored(diag::warn_lifetime_safety_use_after_scope,
- D->getBeginLoc()) ||
- !Diags.isIgnored(diag::warn_lifetime_safety_use_after_scope_moved,
- D->getBeginLoc()) ||
- !Diags.isIgnored(diag::warn_lifetime_safety_return_stack_addr,
- D->getBeginLoc()) ||
- !Diags.isIgnored(diag::warn_lifetime_safety_return_stack_addr_moved,
- D->getBeginLoc()) ||
- !Diags.isIgnored(diag::warn_lifetime_safety_invalidation,
- D->getBeginLoc()) ||
- !Diags.isIgnored(diag::warn_lifetime_safety_noescape_escapes,
- D->getBeginLoc());
bool EnableLifetimeSafetyAnalysis =
S.getLangOpts().EnableLifetimeSafety &&
!S.getLangOpts().EnableLifetimeSafetyTUAnalysis &&
- IsLifetimeSafetyDiagnosticEnabled;
+ lifetimes::IsLifetimeSafetyDiagnosticEnabled(S, D);
// Force that certain expressions appear as CFGElements in the CFG. This
// is used to speed up various analyses.
diff --git a/clang/lib/Sema/SemaAttr.cpp b/clang/lib/Sema/SemaAttr.cpp
index 55111ca6a7cfe..7c79f954e6743 100644
--- a/clang/lib/Sema/SemaAttr.cpp
+++ b/clang/lib/Sema/SemaAttr.cpp
@@ -320,6 +320,8 @@ void Sema::inferLifetimeCaptureByAttribute(FunctionDecl *FD) {
"insert", "insert_or_assign", "push", "push_front", "push_back"};
if (!CapturingMethods.contains(MD->getName()))
return;
+ if (MD->getName() == "insert" && MD->getParent()->getName() == "basic_string")
+ return;
Annotate(MD);
}
diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp
index a3dba77284afa..9c4f52dd7150c 100644
--- a/clang/lib/Sema/SemaConcept.cpp
+++ b/clang/lib/Sema/SemaConcept.cpp
@@ -1185,8 +1185,13 @@ static bool CheckConstraintSatisfaction(
return false;
}
- if (TemplateArgsLists.isAnyArgInstantiationDependent(S.Context)) {
- // No need to check satisfaction for dependent constraint expressions.
+ // In the general case, we can't check satisfaction if the arguments contain
+ // unsubstituted template parameters, even if they are purely syntactic,
+ // because they may still turn out to be invalid after substitution.
+ // This could be permitted in cases where this substitution will still be
+ // attempted later and diagnosed, such as function template specializations,
+ // but that's not the case for concept specializations.
+ if (TemplateArgsLists.isAnyArgInstantiationDependent()) {
Satisfaction.IsSatisfied = true;
return false;
}
@@ -2575,7 +2580,15 @@ bool Sema::IsAtLeastAsConstrained(const NamedDecl *D1,
}
SubsumptionChecker SC(*this);
- std::optional<bool> Subsumes = SC.Subsumes(D1, AC1, D2, AC2);
+ // Associated declarations are used as a cache key in the event they were
+  // normalized earlier during concept checking. However, we cannot reuse these
+ // cached results if any of the template depths have been adjusted.
+ const NamedDecl *DeclAC1 = D1, *DeclAC2 = D2;
+ if (Depth2 > Depth1)
+ DeclAC1 = nullptr;
+ else if (Depth1 > Depth2)
+ DeclAC2 = nullptr;
+ std::optional<bool> Subsumes = SC.Subsumes(DeclAC1, AC1, DeclAC2, AC2);
if (!Subsumes) {
// Normalization failed
return true;
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 054b664ca0a8b..ac38f32882d9b 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -7942,7 +7942,7 @@ NamedDecl *Sema::ActOnVariableDeclarator(
if (CurContext->isRecord()) {
if (SC == SC_Static) {
- if (const CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(DC)) {
+ if (CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(DC)) {
// Walk up the enclosing DeclContexts to check for any that are
// incompatible with static data members.
const DeclContext *FunctionOrMethod = nullptr;
@@ -7964,6 +7964,8 @@ NamedDecl *Sema::ActOnVariableDeclarator(
Diag(D.getIdentifierLoc(),
diag::err_static_data_member_not_allowed_in_local_class)
<< Name << RD->getDeclName() << RD->getTagKind();
+ Invalid = true;
+ RD->setInvalidDecl();
} else if (AnonStruct) {
// C++ [class.static.data]p4: Unnamed classes and classes contained
// directly or indirectly within unnamed classes shall not contain
@@ -15138,7 +15140,10 @@ void Sema::CheckCompleteVariableDeclaration(VarDecl *var) {
// If this variable must be emitted, add it as an initializer for the current
// module.
- if (Context.DeclMustBeEmitted(var) && !ModuleScopes.empty())
+ if (Context.DeclMustBeEmitted(var) && !ModuleScopes.empty() &&
+ (ModuleScopes.back().Module->isHeaderLikeModule() ||
+       // For named modules, we may only emit non-discardable variables.
+ !isDiscardableGVALinkage(Context.GetGVALinkageForVariable(var))))
Context.addModuleInitializer(ModuleScopes.back().Module, var);
// Build the bindings if this is a structured binding declaration.
@@ -21211,9 +21216,14 @@ bool Sema::shouldIgnoreInHostDeviceCheck(FunctionDecl *Callee) {
bool Sema::isRedefinitionAllowedFor(NamedDecl *D, NamedDecl **Suggested,
bool &Visible) {
Visible = hasVisibleDefinition(D, Suggested);
+  // According to [basic.def.odr]p16, duplicated definitions are not allowed
+  // for declarations attached to named modules.
+  // We only do this if the current module is a named module, as we have better
+  // diagnostics for declarations in the global module and in named modules.
+ if (getCurrentModule() && getCurrentModule()->isNamedModule() &&
+ D->isInNamedModule())
+ return false;
// The redefinition of D in the **current** TU is allowed if D is invisible or
- // D is defined in the global module of other module units. We didn't check if
- // it is in global module as, we'll check the redefinition in named module
- // later with better diagnostic message.
+ // D is defined in the global module of other module units.
return D->isInAnotherModuleUnit() || !Visible;
}
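
The case the new early return targets is a duplicated definition of an entity attached to a named module, which [basic.def.odr]p16 forbids even when the definitions are token-for-token identical; a minimal illustration (file names and contents are hypothetical, not from this patch's tests):

  // m.cppm -- module interface unit
  export module m;
  export struct S { int x; };   // S is attached to named module 'm'

  // m_impl.cpp -- implementation unit of the same module
  module m;
  struct S { int x; };          // redefinition of S: ill-formed, should be
                                // diagnosed instead of silently merged
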
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 6fc749464586d..8da694d09e4b4 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -708,20 +708,9 @@ static void handleExcludeFromExplicitInstantiationAttr(Sema &S, Decl *D,
}
if (auto *DA = getDLLAttr(D); DA && !DA->isInherited()) {
- if (auto *RD = dyn_cast<CXXRecordDecl>(D->getDeclContext())) {
- if (RD->isTemplated()) {
- S.Diag(DA->getLoc(),
- diag::warn_dllattr_ignored_exclusion_takes_precedence)
- << DA << AL;
- D->dropAttrs<DLLExportAttr, DLLImportAttr>();
- } else {
- S.Diag(AL.getLoc(), diag::warn_attribute_ignored_in_non_template) << AL;
- return;
- }
- } else {
- S.Diag(AL.getLoc(), diag::warn_attribute_ignored_on_non_member) << AL;
- return;
- }
+ S.Diag(DA->getLoc(), diag::warn_dllattr_ignored_exclusion_takes_precedence)
+ << DA << AL;
+ D->dropAttrs<DLLExportAttr, DLLImportAttr>();
}
D->addAttr(::new (S.Context)
@@ -6489,19 +6478,10 @@ static void handleDLLAttr(Sema &S, Decl *D, const ParsedAttr &A) {
}
if (auto *EA = D->getAttr<ExcludeFromExplicitInstantiationAttr>()) {
- if (auto *RD = dyn_cast<CXXRecordDecl>(D->getDeclContext())) {
- if (RD->isTemplated()) {
- S.Diag(A.getRange().getBegin(),
- diag::warn_dllattr_ignored_exclusion_takes_precedence)
- << A << EA;
- return;
- }
- S.Diag(EA->getLoc(), diag::warn_attribute_ignored_in_non_template) << EA;
- D->dropAttr<ExcludeFromExplicitInstantiationAttr>();
- } else {
- S.Diag(EA->getLoc(), diag::warn_attribute_ignored_on_non_member) << EA;
- D->dropAttr<ExcludeFromExplicitInstantiationAttr>();
- }
+ S.Diag(A.getRange().getBegin(),
+ diag::warn_dllattr_ignored_exclusion_takes_precedence)
+ << A << EA;
+ return;
}
Attr *NewAttr = A.getKind() == ParsedAttr::AT_DLLExport
diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp
index 2ae6e5de0e3ee..2a24ee42bcb4d 100644
--- a/clang/lib/Sema/SemaDeclCXX.cpp
+++ b/clang/lib/Sema/SemaDeclCXX.cpp
@@ -6595,18 +6595,28 @@ void Sema::checkClassLevelDLLAttribute(CXXRecordDecl *Class) {
for (Decl *D : Class->decls())
if (auto *S = dyn_cast<ConstructorUsingShadowDecl>(D))
Shadows.push_back(S);
- for (ConstructorUsingShadowDecl *S : Shadows)
- if (auto *BC = dyn_cast<CXXConstructorDecl>(S->getTargetDecl());
- BC && !BC->isDeleted())
- findInheritingConstructor(Class->getLocation(), BC, S);
+ for (ConstructorUsingShadowDecl *S : Shadows) {
+ CXXConstructorDecl *BC = dyn_cast<CXXConstructorDecl>(S->getTargetDecl());
+ if (!BC || BC->isDeleted())
+ continue;
+ // Skip constructors whose requires clause is not satisfied.
+ // Normally overload resolution filters these, but we are bypassing
+ // it to eagerly create inherited constructors for dllexport.
+ if (BC->getTrailingRequiresClause()) {
+ ConstraintSatisfaction Satisfaction;
+ if (CheckFunctionConstraints(BC, Satisfaction) ||
+ !Satisfaction.IsSatisfied)
+ continue;
+ }
+ findInheritingConstructor(Class->getLocation(), BC, S);
+ }
}
// FIXME: MSVC's docs say all bases must be exportable, but this doesn't
// seem to be true in practice?
for (Decl *Member : Class->decls()) {
- if (isTemplateInstantiation(TSK) &&
- Member->hasAttr<ExcludeFromExplicitInstantiationAttr>())
+ if (Member->hasAttr<ExcludeFromExplicitInstantiationAttr>())
continue;
VarDecl *VD = dyn_cast<VarDecl>(Member);
@@ -6621,38 +6631,53 @@ void Sema::checkClassLevelDLLAttribute(CXXRecordDecl *Class) {
if (MD->isDeleted())
continue;
- // Don't export inherited constructors whose parameters prevent ABI-
- // compatible forwarding. When canEmitDelegateCallArgs (in CodeGen)
- // returns false, Clang inlines the constructor body instead of
- // emitting a forwarding thunk, producing code that is not ABI-
- // compatible with MSVC. Suppress the export and warn so the user
- // gets a linker error rather than a silent runtime mismatch.
if (ClassExported) {
- if (auto *CD = dyn_cast<CXXConstructorDecl>(MD)) {
- if (CD->getInheritedConstructor()) {
- if (CD->isVariadic()) {
+ CXXConstructorDecl *CD = dyn_cast<CXXConstructorDecl>(MD);
+ if (CD && CD->getInheritedConstructor()) {
+ // Inherited constructors already had their base constructor's
+ // constraints checked before creation via
+ // findInheritingConstructor, so only ABI-compatibility checks
+ // are needed here.
+ //
+ // Don't export inherited constructors whose parameters prevent
+ // ABI-compatible forwarding. When canEmitDelegateCallArgs (in
+ // CodeGen) returns false, Clang inlines the constructor body
+ // instead of emitting a forwarding thunk, producing code that
+ // is not ABI-compatible with MSVC. Suppress the export and warn
+ // so the user gets a linker error rather than a silent runtime
+ // mismatch.
+ if (CD->isVariadic()) {
+ Diag(CD->getLocation(),
+ diag::warn_dllexport_inherited_ctor_unsupported)
+ << /*variadic=*/0;
+ continue;
+ }
+ if (Context.getTargetInfo()
+ .getCXXABI()
+ .areArgsDestroyedLeftToRightInCallee()) {
+ bool HasCalleeCleanupParam = false;
+ for (const ParmVarDecl *P : CD->parameters())
+ if (P->needsDestruction(Context)) {
+ HasCalleeCleanupParam = true;
+ break;
+ }
+ if (HasCalleeCleanupParam) {
Diag(CD->getLocation(),
diag::warn_dllexport_inherited_ctor_unsupported)
- << /*variadic=*/0;
+ << /*callee-cleanup=*/1;
continue;
}
- if (Context.getTargetInfo()
- .getCXXABI()
- .areArgsDestroyedLeftToRightInCallee()) {
- bool HasCalleeCleanupParam = false;
- for (const auto *P : CD->parameters())
- if (P->needsDestruction(Context)) {
- HasCalleeCleanupParam = true;
- break;
- }
- if (HasCalleeCleanupParam) {
- Diag(CD->getLocation(),
- diag::warn_dllexport_inherited_ctor_unsupported)
- << /*callee-cleanup=*/1;
- continue;
- }
- }
}
+ } else if (MD->getTrailingRequiresClause()) {
+ // Don't export methods whose requires clause is not satisfied.
+ // For class template specializations, member constraints may
+ // depend on template arguments and an unsatisfied constraint
+ // means the member should not be available in this
+ // specialization.
+ ConstraintSatisfaction Satisfaction;
+ if (CheckFunctionConstraints(MD, Satisfaction) ||
+ !Satisfaction.IsSatisfied)
+ continue;
}
}
@@ -11248,6 +11273,7 @@ bool Sema::CheckDestructor(CXXDestructorDecl *Destructor) {
if (Context.getTargetInfo().emitVectorDeletingDtors(
Context.getLangOpts())) {
+ bool DestructorIsExported = Destructor->hasAttr<DLLExportAttr>();
// Lookup delete[] too in case we have to emit a vector deleting dtor.
DeclarationName VDeleteName =
Context.DeclarationNames.getCXXOperatorName(OO_Array_Delete);
@@ -11261,7 +11287,8 @@ bool Sema::CheckDestructor(CXXDestructorDecl *Destructor) {
VDeleteName);
Destructor->setGlobalOperatorArrayDelete(GlobalArrOperatorDelete);
if (GlobalArrOperatorDelete &&
- Context.classNeedsVectorDeletingDestructor(RD))
+ (Context.classMaybeNeedsVectorDeletingDestructor(RD) ||
+ DestructorIsExported))
MarkFunctionReferenced(Loc, GlobalArrOperatorDelete);
} else if (!ArrOperatorDelete) {
ArrOperatorDelete = FindDeallocationFunctionForDestructor(
@@ -11269,7 +11296,9 @@ bool Sema::CheckDestructor(CXXDestructorDecl *Destructor) {
/*LookForGlobal*/ true, VDeleteName);
}
Destructor->setOperatorArrayDelete(ArrOperatorDelete);
- if (ArrOperatorDelete && Context.classNeedsVectorDeletingDestructor(RD))
+ if (ArrOperatorDelete &&
+ (Context.classMaybeNeedsVectorDeletingDestructor(RD) ||
+ DestructorIsExported))
MarkFunctionReferenced(Loc, ArrOperatorDelete);
}
}
@@ -19108,6 +19137,8 @@ void Sema::MarkVTableUsed(SourceLocation Loc, CXXRecordDecl *Class,
// delete().
ContextRAII SavedContext(*this, DD);
CheckDestructor(DD);
+ if (!DD->getOperatorDelete())
+ DD->setInvalidDecl();
} else {
MarkFunctionReferenced(Loc, Class->getDestructor());
}
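
The constraint checks added to checkClassLevelDLLAttribute matter because a dllexport class has its members, including inherited constructors, instantiated eagerly, bypassing the overload-resolution step that would normally filter out candidates with unsatisfied constraints. A hypothetical example of the guarded situation (assumption; not one of this patch's tests):

  #include <type_traits>

  template <class T> struct Base {
    Base() = default;
    Base(int) requires std::is_integral_v<T> {}
  };

  template <class T> struct __declspec(dllexport) Derived : Base<T> {
    using Base<T>::Base;                            // inherited constructor
    int size() requires std::is_integral_v<T> { return 0; }
  };

  // With T = float neither the constrained inherited constructor nor size()
  // is usable, so neither should be eagerly created or exported.
  template struct Derived<float>;
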
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 5a5bbf4d900dc..5de4a1e7475f2 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -2636,17 +2636,31 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal,
MarkFunctionReferenced(StartLoc, OperatorDelete);
}
- // For MSVC vector deleting destructors support we record that for the class
- // new[] was called. We try to optimize the code size and only emit vector
- // deleting destructors when they are required. Vector deleting destructors
- // are required for delete[] call but MSVC triggers emission of them
- // whenever new[] is called for an object of the class and we do the same
- // for compatibility.
- if (const CXXConstructExpr *CCE =
- dyn_cast_or_null<CXXConstructExpr>(Initializer);
- CCE && ArraySize) {
- Context.setClassNeedsVectorDeletingDestructor(
- CCE->getConstructor()->getParent());
+  // For MSVC compatibility, new[] will trigger vector deleting destructor
+  // emission if the class has a virtual destructor. Perform the necessary checks.
+ if (Context.getTargetInfo().emitVectorDeletingDtors(Context.getLangOpts())) {
+ if (const CXXConstructExpr *CCE =
+ dyn_cast_or_null<CXXConstructExpr>(Initializer);
+ CCE && ArraySize) {
+ CXXRecordDecl *ClassDecl = CCE->getConstructor()->getParent();
+ // We probably already did this for another new[] with this class so don't
+ // do it twice.
+ if (!Context.classMaybeNeedsVectorDeletingDestructor(ClassDecl)) {
+ auto *Dtor = ClassDecl->getDestructor();
+ if (Dtor && Dtor->isVirtual() && !Dtor->isDeleted()) {
+ Context.setClassMaybeNeedsVectorDeletingDestructor(ClassDecl);
+ if (!Dtor->isDefined() && !Dtor->isInvalidDecl()) {
+ // Call CheckDestructor if destructor is not defined. This is
+ // needed to find operators delete and delete[] for vector deleting
+ // destructor body because new[] will trigger emission of vector
+ // deleting destructor body even if destructor is defined in another
+ // translation unit.
+ ContextRAII SavedContext(*this, Dtor);
+ CheckDestructor(Dtor);
+ }
+ }
+ }
+ }
}
return CXXNewExpr::Create(Context, UseGlobal, OperatorNew, OperatorDelete,
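
For context, the comment above describes the MSVC-compatibility case being handled: array new of a class with a virtual destructor. A minimal user-level illustration (not taken from this patch's tests):

  struct S {
    virtual ~S();
  };

  void f() {
    S *P = new S[4]; // new[] of a class with a virtual destructor can require
    delete[] P;      // the vector deleting destructor under the MSVC ABI.
  }
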
diff --git a/clang/lib/Sema/SemaLifetimeSafety.h b/clang/lib/Sema/SemaLifetimeSafety.h
new file mode 100644
index 0000000000000..e6f7e3d929f61
--- /dev/null
+++ b/clang/lib/Sema/SemaLifetimeSafety.h
@@ -0,0 +1,238 @@
+//===--- SemaLifetimeSafety.h - Sema support for lifetime safety ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Sema-specific implementation for lifetime safety
+// analysis. It provides diagnostic reporting and helper functions that bridge
+// the lifetime safety analysis framework with Sema's diagnostic engine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_SEMA_SEMALIFETIMESAFETY_H
+#define LLVM_CLANG_LIB_SEMA_SEMALIFETIMESAFETY_H
+
+#include "clang/Analysis/Analyses/LifetimeSafety/LifetimeSafety.h"
+#include "clang/Basic/DiagnosticSema.h"
+#include "clang/Lex/Lexer.h"
+#include "clang/Sema/Sema.h"
+
+namespace clang::lifetimes {
+
+inline bool IsLifetimeSafetyDiagnosticEnabled(Sema &S, const Decl *D) {
+ DiagnosticsEngine &Diags = S.getDiagnostics();
+ return !Diags.isIgnored(diag::warn_lifetime_safety_use_after_scope,
+ D->getBeginLoc()) ||
+ !Diags.isIgnored(diag::warn_lifetime_safety_use_after_scope_moved,
+ D->getBeginLoc()) ||
+ !Diags.isIgnored(diag::warn_lifetime_safety_return_stack_addr,
+ D->getBeginLoc()) ||
+ !Diags.isIgnored(diag::warn_lifetime_safety_return_stack_addr_moved,
+ D->getBeginLoc()) ||
+ !Diags.isIgnored(diag::warn_lifetime_safety_invalidation,
+ D->getBeginLoc()) ||
+ !Diags.isIgnored(diag::warn_lifetime_safety_noescape_escapes,
+ D->getBeginLoc());
+}
+
+class LifetimeSafetySemaHelperImpl : public LifetimeSafetySemaHelper {
+
+public:
+ LifetimeSafetySemaHelperImpl(Sema &S) : S(S) {}
+
+ void reportUseAfterFree(const Expr *IssueExpr, const Expr *UseExpr,
+ const Expr *MovedExpr,
+ SourceLocation FreeLoc) override {
+ S.Diag(IssueExpr->getExprLoc(),
+ MovedExpr ? diag::warn_lifetime_safety_use_after_scope_moved
+ : diag::warn_lifetime_safety_use_after_scope)
+ << IssueExpr->getSourceRange();
+ if (MovedExpr)
+ S.Diag(MovedExpr->getExprLoc(), diag::note_lifetime_safety_moved_here)
+ << MovedExpr->getSourceRange();
+ S.Diag(FreeLoc, diag::note_lifetime_safety_destroyed_here);
+ S.Diag(UseExpr->getExprLoc(), diag::note_lifetime_safety_used_here)
+ << UseExpr->getSourceRange();
+ }
+
+ void reportUseAfterReturn(const Expr *IssueExpr, const Expr *ReturnExpr,
+ const Expr *MovedExpr,
+ SourceLocation ExpiryLoc) override {
+ S.Diag(IssueExpr->getExprLoc(),
+ MovedExpr ? diag::warn_lifetime_safety_return_stack_addr_moved
+ : diag::warn_lifetime_safety_return_stack_addr)
+ << IssueExpr->getSourceRange();
+ if (MovedExpr)
+ S.Diag(MovedExpr->getExprLoc(), diag::note_lifetime_safety_moved_here)
+ << MovedExpr->getSourceRange();
+ S.Diag(ReturnExpr->getExprLoc(), diag::note_lifetime_safety_returned_here)
+ << ReturnExpr->getSourceRange();
+ }
+
+ void reportDanglingField(const Expr *IssueExpr,
+ const FieldDecl *DanglingField,
+ const Expr *MovedExpr,
+ SourceLocation ExpiryLoc) override {
+ S.Diag(IssueExpr->getExprLoc(),
+ MovedExpr ? diag::warn_lifetime_safety_dangling_field_moved
+ : diag::warn_lifetime_safety_dangling_field)
+ << IssueExpr->getSourceRange();
+ if (MovedExpr)
+ S.Diag(MovedExpr->getExprLoc(), diag::note_lifetime_safety_moved_here)
+ << MovedExpr->getSourceRange();
+ S.Diag(DanglingField->getLocation(),
+ diag::note_lifetime_safety_dangling_field_here)
+ << DanglingField->getEndLoc();
+ }
+
+ void reportDanglingGlobal(const Expr *IssueExpr,
+ const VarDecl *DanglingGlobal,
+ const Expr *MovedExpr,
+ SourceLocation ExpiryLoc) override {
+ S.Diag(IssueExpr->getExprLoc(),
+ MovedExpr ? diag::warn_lifetime_safety_dangling_global_moved
+ : diag::warn_lifetime_safety_dangling_global)
+ << IssueExpr->getSourceRange();
+ if (MovedExpr)
+ S.Diag(MovedExpr->getExprLoc(), diag::note_lifetime_safety_moved_here)
+ << MovedExpr->getSourceRange();
+ if (DanglingGlobal->isStaticLocal() || DanglingGlobal->isStaticDataMember())
+ S.Diag(DanglingGlobal->getLocation(),
+ diag::note_lifetime_safety_dangling_static_here)
+ << DanglingGlobal->getEndLoc();
+ else
+ S.Diag(DanglingGlobal->getLocation(),
+ diag::note_lifetime_safety_dangling_global_here)
+ << DanglingGlobal->getEndLoc();
+ }
+
+ void reportUseAfterInvalidation(const Expr *IssueExpr, const Expr *UseExpr,
+ const Expr *InvalidationExpr) override {
+ S.Diag(IssueExpr->getExprLoc(), diag::warn_lifetime_safety_invalidation)
+ << false << IssueExpr->getSourceRange();
+ S.Diag(InvalidationExpr->getExprLoc(),
+ diag::note_lifetime_safety_invalidated_here)
+ << InvalidationExpr->getSourceRange();
+ S.Diag(UseExpr->getExprLoc(), diag::note_lifetime_safety_used_here)
+ << UseExpr->getSourceRange();
+ }
+ void reportUseAfterInvalidation(const ParmVarDecl *PVD, const Expr *UseExpr,
+ const Expr *InvalidationExpr) override {
+ S.Diag(PVD->getSourceRange().getBegin(),
+ diag::warn_lifetime_safety_invalidation)
+ << true << PVD->getSourceRange();
+ S.Diag(InvalidationExpr->getExprLoc(),
+ diag::note_lifetime_safety_invalidated_here)
+ << InvalidationExpr->getSourceRange();
+ S.Diag(UseExpr->getExprLoc(), diag::note_lifetime_safety_used_here)
+ << UseExpr->getSourceRange();
+ }
+
+ void suggestLifetimeboundToParmVar(SuggestionScope Scope,
+ const ParmVarDecl *ParmToAnnotate,
+ const Expr *EscapeExpr) override {
+ unsigned DiagID =
+ (Scope == SuggestionScope::CrossTU)
+ ? diag::warn_lifetime_safety_cross_tu_param_suggestion
+ : diag::warn_lifetime_safety_intra_tu_param_suggestion;
+ SourceLocation InsertionPoint = Lexer::getLocForEndOfToken(
+ ParmToAnnotate->getEndLoc(), 0, S.getSourceManager(), S.getLangOpts());
+ StringRef FixItText = " [[clang::lifetimebound]]";
+ if (!ParmToAnnotate->getIdentifier()) {
+ // For unnamed parameters, placing attributes after the type would be
+ // parsed as a type attribute, not a parameter attribute.
+ InsertionPoint = ParmToAnnotate->getBeginLoc();
+ FixItText = "[[clang::lifetimebound]] ";
+ }
+ S.Diag(ParmToAnnotate->getBeginLoc(), DiagID)
+ << ParmToAnnotate->getSourceRange()
+ << FixItHint::CreateInsertion(InsertionPoint, FixItText);
+ S.Diag(EscapeExpr->getBeginLoc(),
+ diag::note_lifetime_safety_suggestion_returned_here)
+ << EscapeExpr->getSourceRange();
+ }
+
+ void suggestLifetimeboundToImplicitThis(SuggestionScope Scope,
+ const CXXMethodDecl *MD,
+ const Expr *EscapeExpr) override {
+ unsigned DiagID = (Scope == SuggestionScope::CrossTU)
+ ? diag::warn_lifetime_safety_cross_tu_this_suggestion
+ : diag::warn_lifetime_safety_intra_tu_this_suggestion;
+ const auto MDL = MD->getTypeSourceInfo()->getTypeLoc();
+ SourceLocation InsertionPoint = Lexer::getLocForEndOfToken(
+ MDL.getEndLoc(), 0, S.getSourceManager(), S.getLangOpts());
+ if (const auto *FPT = MD->getType()->getAs<FunctionProtoType>();
+ FPT && FPT->hasTrailingReturn()) {
+ // For trailing return types, 'getEndLoc()' includes the return type
+ // after '->', placing the attribute in an invalid position.
+ // Instead use 'getLocalRangeEnd()' which gives the '->' location
+ // for trailing returns, so find the last token before it.
+ const auto FTL = MDL.getAs<FunctionTypeLoc>();
+ assert(FTL);
+ InsertionPoint = Lexer::getLocForEndOfToken(
+ Lexer::findPreviousToken(FTL.getLocalRangeEnd(), S.getSourceManager(),
+ S.getLangOpts(),
+ /*IncludeComments=*/false)
+ ->getLocation(),
+ 0, S.getSourceManager(), S.getLangOpts());
+ }
+ S.Diag(InsertionPoint, DiagID)
+ << MD->getNameInfo().getSourceRange()
+ << FixItHint::CreateInsertion(InsertionPoint,
+ " [[clang::lifetimebound]]");
+ S.Diag(EscapeExpr->getBeginLoc(),
+ diag::note_lifetime_safety_suggestion_returned_here)
+ << EscapeExpr->getSourceRange();
+ }
+
+ void reportNoescapeViolation(const ParmVarDecl *ParmWithNoescape,
+ const Expr *EscapeExpr) override {
+ S.Diag(ParmWithNoescape->getBeginLoc(),
+ diag::warn_lifetime_safety_noescape_escapes)
+ << ParmWithNoescape->getSourceRange();
+
+ S.Diag(EscapeExpr->getBeginLoc(),
+ diag::note_lifetime_safety_suggestion_returned_here)
+ << EscapeExpr->getSourceRange();
+ }
+
+ void reportNoescapeViolation(const ParmVarDecl *ParmWithNoescape,
+ const FieldDecl *EscapeField) override {
+ S.Diag(ParmWithNoescape->getBeginLoc(),
+ diag::warn_lifetime_safety_noescape_escapes)
+ << ParmWithNoescape->getSourceRange();
+
+ S.Diag(EscapeField->getLocation(),
+ diag::note_lifetime_safety_escapes_to_field_here)
+ << EscapeField->getEndLoc();
+ }
+
+ void reportNoescapeViolation(const ParmVarDecl *ParmWithNoescape,
+ const VarDecl *EscapeGlobal) override {
+ S.Diag(ParmWithNoescape->getBeginLoc(),
+ diag::warn_lifetime_safety_noescape_escapes)
+ << ParmWithNoescape->getSourceRange();
+ if (EscapeGlobal->isStaticLocal() || EscapeGlobal->isStaticDataMember())
+ S.Diag(EscapeGlobal->getLocation(),
+ diag::note_lifetime_safety_escapes_to_static_storage_here)
+ << EscapeGlobal->getEndLoc();
+ else
+ S.Diag(EscapeGlobal->getLocation(),
+ diag::note_lifetime_safety_escapes_to_global_here)
+ << EscapeGlobal->getEndLoc();
+ }
+
+ void addLifetimeBoundToImplicitThis(const CXXMethodDecl *MD) override {
+ S.addLifetimeBoundToImplicitThis(const_cast<CXXMethodDecl *>(MD));
+ }
+
+private:
+ Sema &S;
+};
+
+} // namespace clang::lifetimes
+
+#endif // LLVM_CLANG_LIB_SEMA_SEMALIFETIMESAFETY_H
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index ce0a092fffb42..34869e50b74ac 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -24453,6 +24453,14 @@ void SemaOpenMP::ActOnOpenMPDeclareTargetName(
if (getLangOpts().HIP)
Diag(Loc, diag::warn_hip_omp_target_directives);
+ // 'local' is incompatible with 'device_type(host)' because 'local'
+ // variables exist only on the device.
+ if (MT == OMPDeclareTargetDeclAttr::MT_Local &&
+ DTCI.DT == OMPDeclareTargetDeclAttr::DT_Host) {
+ Diag(Loc, diag::err_omp_declare_target_local_host_only);
+ return;
+ }
+
// Explicit declare target lists have precedence.
const unsigned Level = -1;
@@ -24469,7 +24477,11 @@ void SemaOpenMP::ActOnOpenMPDeclareTargetName(
}
if (ActiveAttr && (*ActiveAttr)->getMapType() != MT &&
(*ActiveAttr)->getLevel() == Level) {
- Diag(Loc, diag::err_omp_declare_target_to_and_link) << ND;
+ Diag(Loc, diag::err_omp_declare_target_var_in_both_clauses)
+ << ND
+ << OMPDeclareTargetDeclAttr::ConvertMapTypeTyToStr(
+ (*ActiveAttr)->getMapType())
+ << OMPDeclareTargetDeclAttr::ConvertMapTypeTyToStr(MT);
return;
}
@@ -24483,6 +24495,11 @@ void SemaOpenMP::ActOnOpenMPDeclareTargetName(
if (!IndirectE)
IsIndirect = true;
}
+ // FIXME: 'local' clause is not yet implemented in CodeGen. For now, it is
+ // treated as 'enter'. For host compilation, 'local' is a no-op.
+ if (MT == OMPDeclareTargetDeclAttr::MT_Local &&
+ getLangOpts().OpenMPIsTargetDevice)
+ Diag(Loc, diag::warn_omp_declare_target_local_not_implemented);
auto *A = OMPDeclareTargetDeclAttr::CreateImplicit(
getASTContext(), MT, DTCI.DT, IndirectE, IsIndirect, Level,
SourceRange(Loc, Loc));
@@ -24508,7 +24525,8 @@ static void checkDeclInTargetContext(SourceLocation SL, SourceRange SR,
SemaRef.getCurBlock() || SemaRef.getCurCapturedRegion()) &&
VD->hasGlobalStorage()) {
if (!MapTy || (*MapTy != OMPDeclareTargetDeclAttr::MT_To &&
- *MapTy != OMPDeclareTargetDeclAttr::MT_Enter)) {
+ *MapTy != OMPDeclareTargetDeclAttr::MT_Enter &&
+ *MapTy != OMPDeclareTargetDeclAttr::MT_Local)) {
// OpenMP 5.0, 2.12.7 declare target Directive, Restrictions
// If a lambda declaration and definition appears between a
// declare target directive and the matching end declare target
@@ -24559,8 +24577,11 @@ void SemaOpenMP::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D,
if (auto *FD = dyn_cast<FunctionDecl>(D)) {
std::optional<OMPDeclareTargetDeclAttr::MapTypeTy> Res =
OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(FD);
- if (IdLoc.isValid() && Res && *Res == OMPDeclareTargetDeclAttr::MT_Link) {
- Diag(IdLoc, diag::err_omp_function_in_link_clause);
+ if (IdLoc.isValid() && Res &&
+ (*Res == OMPDeclareTargetDeclAttr::MT_Link ||
+ *Res == OMPDeclareTargetDeclAttr::MT_Local)) {
+ Diag(IdLoc, diag::err_omp_function_in_target_clause_list)
+ << OMPDeclareTargetDeclAttr::ConvertMapTypeTyToStr(*Res);
Diag(FD->getLocation(), diag::note_defined_here) << FD;
return;
}
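To illustrate, a minimal sketch (hypothetical source, assuming -fopenmp -fopenmp-version=60; exact diagnostic wording and locations may differ) of the cases the SemaOpenMP changes above diagnose:

int dev_buf;
// 'local' variables exist only on the device, so combining the clause with
// device_type(host) is rejected (err_omp_declare_target_local_host_only).
#pragma omp declare target local(dev_buf) device_type(host)

void helper();
// Functions may not be listed in a 'link' or 'local' clause
// (err_omp_function_in_target_clause_list).
#pragma omp declare target local(helper)

int twice;
#pragma omp declare target enter(twice)
// Naming the same variable with a different map type is rejected
// (err_omp_declare_target_var_in_both_clauses).
#pragma omp declare target link(twice)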
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index 97018dbe81057..1ca340e8b72c7 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -12841,7 +12841,9 @@ static void NoteFunctionCandidate(Sema &S, OverloadCandidate *Cand,
S.Diag(Fn->getLocation(), diag::note_ovl_candidate_deleted)
<< (unsigned)FnKindPair.first << (unsigned)FnKindPair.second << FnDesc
- << (Fn->isDeleted() ? (Fn->isDeletedAsWritten() ? 1 : 2) : 0);
+ << (Fn->isDeleted()
+ ? (Fn->getCanonicalDecl()->isDeletedAsWritten() ? 1 : 2)
+ : 0);
MaybeEmitInheritedConstructorNote(S, Cand->FoundDecl);
return;
}
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index 9b0bec20618a0..9194ee5e2bee9 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -1342,7 +1342,10 @@ QualType Sema::CheckNonTypeTemplateParameterType(TypeSourceInfo *&TSI,
// - an identifier associated by name lookup with a non-type
// template-parameter declared with a type that contains a
// placeholder type (7.1.7.4),
- TSI = SubstAutoTypeSourceInfoDependent(TSI);
+ TypeSourceInfo *NewTSI = SubstAutoTypeSourceInfoDependent(TSI);
+ if (!NewTSI)
+ return QualType();
+ TSI = NewTSI;
}
return CheckNonTypeTemplateParameterType(TSI->getType(), Loc);
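As a minimal sketch (hypothetical example, not from this patch) of the construct guarded by the new null check: a non-type template parameter whose type contains a placeholder is substituted via SubstAutoTypeSourceInfoDependent, and a failed substitution now propagates an error instead of dereferencing a null TypeSourceInfo.

// The parameter type contains a placeholder ('auto'), so it is substituted
// before the non-type template parameter type is checked.
template <auto N> struct Holder {};
Holder<42> h; // 'auto' is deduced as 'int' from the template argument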
diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp
index 40892f232e603..477af31def50e 100644
--- a/clang/lib/Sema/SemaTemplateDeduction.cpp
+++ b/clang/lib/Sema/SemaTemplateDeduction.cpp
@@ -3972,12 +3972,11 @@ TemplateDeductionResult Sema::FinishTemplateArgumentDeduction(
if (CheckFunctionTemplateConstraints(
Info.getLocation(),
FunctionTemplate->getCanonicalDecl()->getTemplatedDecl(),
- CTAI.SugaredConverted, Info.AssociatedConstraintsSatisfaction))
+ CTAI.CanonicalConverted, Info.AssociatedConstraintsSatisfaction))
return TemplateDeductionResult::MiscellaneousDeductionFailure;
if (!Info.AssociatedConstraintsSatisfaction.IsSatisfied) {
- Info.reset(
- TemplateArgumentList::CreateCopy(Context, CTAI.SugaredConverted),
- Info.takeCanonical());
+ Info.reset(Info.takeSugared(), TemplateArgumentList::CreateCopy(
+ Context, CTAI.CanonicalConverted));
return TemplateDeductionResult::ConstraintsNotSatisfied;
}
}
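A minimal sketch (hypothetical, assuming C++20) of the kind of case FinishTemplateArgumentDeduction handles here: after deduction succeeds, the associated constraints are checked against the converted template arguments, and when they are not satisfied the deduction info records the arguments used for the check.

template <class T>
concept Small = sizeof(T) <= 4;

template <Small T> void takes_small(T) {}

void use() {
  takes_small(0);        // OK: constraints satisfied for T = int
  // takes_small(0.0L);  // would fail: 'Small<long double>' is not satisfied
}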
diff --git a/clang/lib/StaticAnalyzer/Core/BugSuppression.cpp b/clang/lib/StaticAnalyzer/Core/BugSuppression.cpp
index f3e9b0929f9c4..0e7b5b77a7e54 100644
--- a/clang/lib/StaticAnalyzer/Core/BugSuppression.cpp
+++ b/clang/lib/StaticAnalyzer/Core/BugSuppression.cpp
@@ -9,6 +9,7 @@
#include "clang/StaticAnalyzer/Core/BugReporter/BugSuppression.h"
#include "clang/AST/DynamicRecursiveASTVisitor.h"
#include "clang/StaticAnalyzer/Core/BugReporter/BugReporter.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/TimeProfiler.h"
@@ -165,6 +166,55 @@ bool BugSuppression::isSuppressed(const BugReport &R) {
isSuppressed(UniqueingLocation, DeclWithIssue, {});
}
+static const ClassTemplateDecl *
+walkInstantiatedFromChain(const ClassTemplateDecl *Tmpl) {
+ // For nested member templates (e.g., S2 inside S1<T>), getInstantiatedFrom
+ // may return the member template as instantiated within an outer
+ // specialization (e.g., S2 as it appears in S1<int>). That instantiated
+ // member template has no definition redeclaration itself; we need to walk
+ // up the member template chain to reach the primary template definition.
+ // \code
+ // template <class> struct S1 {
+ // template <class> struct S2 {
+ // int i;
+ // template <class T> int m(const S2<T>& s2) {
+ // return s2.i;
+ // }
+ // };
+ //   };
+ // \endcode
+ const ClassTemplateDecl *MemberTmpl;
+ while ((MemberTmpl = Tmpl->getInstantiatedFromMemberTemplate())) {
+ if (Tmpl->isMemberSpecialization())
+ break;
+ Tmpl = MemberTmpl;
+ }
+ return Tmpl;
+}
+
+static const ClassTemplatePartialSpecializationDecl *walkInstantiatedFromChain(
+ const ClassTemplatePartialSpecializationDecl *PartialSpec) {
+ const ClassTemplatePartialSpecializationDecl *MemberPS;
+ while ((MemberPS = PartialSpec->getInstantiatedFromMember())) {
+ if (PartialSpec->isMemberSpecialization())
+ break;
+ PartialSpec = MemberPS;
+ }
+ return PartialSpec;
+}
+
+template <class T> static const T *chooseDefinitionRedecl(const T *Tmpl) {
+ static_assert(llvm::is_one_of<T, ClassTemplateDecl,
+ ClassTemplatePartialSpecializationDecl>::value);
+ for (const auto *Redecl : Tmpl->redecls()) {
+ if (const T *D = cast<T>(Redecl); D->isThisDeclarationADefinition()) {
+ return D;
+ }
+ }
+ assert(false && "This template must have a redecl that is a definition");
+ return Tmpl;
+}
+
// For template specializations, returns the primary template definition or
// partial specialization that was used to instantiate the specialization.
// This ensures suppression attributes on templates apply to their
@@ -178,12 +228,8 @@ bool BugSuppression::isSuppressed(const BugReport &R) {
// back to the primary template definition, allowing us to find the suppression
// attribute.
//
-// The function handles two cases:
-// 1. Instantiation from a class template - searches redeclarations to find
-// the definition (not just a forward declaration).
-// 2. Instantiation from a partial specialization - returns it directly.
-//
-// For non-template-specialization decls, returns the input unchanged.
+// The function handles specializations (and partial specializations) of
+// class templates. For any other decl, it returns the input unchanged.
static const Decl *
preferTemplateDefinitionForTemplateSpecializations(const Decl *D) {
const auto *SpecializationDecl = dyn_cast<ClassTemplateSpecializationDecl>(D);
@@ -194,27 +240,13 @@ preferTemplateDefinitionForTemplateSpecializations(const Decl *D) {
if (!InstantiatedFrom)
return D;
- // This might be a class template.
if (const auto *Tmpl = InstantiatedFrom.dyn_cast<ClassTemplateDecl *>()) {
// Interestingly, the source template might be a forward declaration, so we
// need to find the definition redeclaration.
- for (const auto *Redecl : Tmpl->redecls()) {
- if (cast<ClassTemplateDecl>(Redecl)->isThisDeclarationADefinition()) {
- return Redecl;
- }
- }
- assert(false &&
- "This class template must have a redecl that is a definition");
- return D;
+ return chooseDefinitionRedecl(walkInstantiatedFromChain(Tmpl));
}
-
- // It might be a partial specialization.
- const auto *PartialSpecialization =
- InstantiatedFrom.dyn_cast<ClassTemplatePartialSpecializationDecl *>();
-
- // The partial specialization should be a definition.
- assert(PartialSpecialization->isThisDeclarationADefinition());
- return PartialSpecialization;
+ return chooseDefinitionRedecl(walkInstantiatedFromChain(
+ cast<ClassTemplatePartialSpecializationDecl *>(InstantiatedFrom)));
}
bool BugSuppression::isSuppressed(const PathDiagnosticLocation &Location,
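For illustration, a minimal sketch (simplified from the in-code comment above; hypothetical names) of why walkInstantiatedFromChain is needed: a suppression attribute on the primary definition of a nested member template is only reachable by walking up from the member template as it appears inside the outer specialization.

template <class> struct Outer {
  template <class> struct [[clang::suppress]] Inner {
    void method() {
      // Reports in Outer<int>::Inner<int>::method() should be suppressed,
      // because the primary definition of Inner carries the attribute and the
      // chain walk above resolves the instantiation back to it.
    }
  };
};
void use() { Outer<int>::Inner<int>().method(); }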
diff --git a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
index f6c3a8e3266da..86ffd92cdf6f5 100644
--- a/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
+++ b/clang/lib/StaticAnalyzer/Core/CallEvent.cpp
@@ -562,8 +562,12 @@ std::optional<SVal> CallEvent::getReturnValueUnderConstruction() const {
EvalCallOptions CallOpts;
ExprEngine &Engine = getState()->getStateManager().getOwningEngine();
+ // FIXME: This code assumes that the _current_ location context and block
+ // are the location and block where this `CallExpr` is called. For a more
+ // stable solution, `Engine.getNumVisitedCurrent()` should be replaced with a
+ // call to `Engine.getNumVisited(<CallerLCtx>, <CallerBlock>)`.
SVal RetVal = Engine.computeObjectUnderConstruction(
- getOriginExpr(), getState(), &Engine.getBuilderContext(),
+ getOriginExpr(), getState(), Engine.getNumVisitedCurrent(),
getLocationContext(), CC, CallOpts);
return RetVal;
}
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index ad419bbca50e6..30aee25d35dea 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -2562,7 +2562,7 @@ void ExprEngine::processCFGBlockEntrance(const BlockEdge &L,
// other constraints) then consider completely unrolling it.
if(AMgr.options.ShouldUnrollLoops) {
unsigned maxBlockVisitOnPath = AMgr.options.maxBlockVisitOnPath;
- const Stmt *Term = Builder.getContext().getBlock()->getTerminatorStmt();
+ const Stmt *Term = getCurrBlock()->getTerminatorStmt();
if (Term) {
ProgramStateRef NewState = updateLoopStack(Term, AMgr.getASTContext(),
Pred, maxBlockVisitOnPath);
@@ -2580,10 +2580,10 @@ void ExprEngine::processCFGBlockEntrance(const BlockEdge &L,
// If this block is terminated by a loop and it has already been visited the
// maximum number of times, widen the loop.
- unsigned int BlockCount = Builder.getContext().blockCount();
+ unsigned int BlockCount = getNumVisitedCurrent();
if (BlockCount == AMgr.options.maxBlockVisitOnPath - 1 &&
AMgr.options.ShouldWidenLoops) {
- const Stmt *Term = Builder.getContext().getBlock()->getTerminatorStmt();
+ const Stmt *Term = getCurrBlock()->getTerminatorStmt();
if (!isa_and_nonnull<ForStmt, WhileStmt, DoStmt, CXXForRangeStmt>(Term))
return;
@@ -2596,9 +2596,8 @@ void ExprEngine::processCFGBlockEntrance(const BlockEdge &L,
// would be stale. Ideally, we should pass on the terminator of the CFG
// block, but the terminator cannot be referred as a CFG element.
// Here we just pass the first CFG element in the block.
- ProgramStateRef WidenedState =
- getWidenedLoopState(Pred->getState(), LCtx, BlockCount,
- *Builder.getContext().getBlock()->ref_begin());
+ ProgramStateRef WidenedState = getWidenedLoopState(
+ Pred->getState(), LCtx, BlockCount, *getCurrBlock()->ref_begin());
Builder.generateNode(BE, WidenedState, Pred);
return;
}
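A minimal sketch (hypothetical source, assuming the widen-loops analyzer config is enabled) of the shape of code the widening path above applies to: a loop with an unconstrained bound is revisited until the per-path block-visit budget runs out, after which the loop is widened rather than the path being dropped.

void widen_example(int n) {
  int i = 0;
  // With 'n' unconstrained, the loop block is revisited up to
  // maxBlockVisitOnPath times; the last visit triggers getWidenedLoopState().
  while (i < n)
    ++i;
}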
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
index 0866dda766667..cf22b62225f2f 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngineCXX.cpp
@@ -127,9 +127,8 @@ SVal ExprEngine::makeElementRegion(ProgramStateRef State, SVal LValue,
// In case when the prvalue is returned from the function (kind is one of
// SimpleReturnedValueKind, CXX17ElidedCopyReturnedValueKind), then
// its materialization happens in the context of the caller.
-// We pass BldrCtx explicitly, as currBldrCtx always refers to callee's context.
SVal ExprEngine::computeObjectUnderConstruction(
- const Expr *E, ProgramStateRef State, const NodeBuilderContext *BldrCtx,
+ const Expr *E, ProgramStateRef State, unsigned NumVisitedCaller,
const LocationContext *LCtx, const ConstructionContext *CC,
EvalCallOptions &CallOpts, unsigned Idx) {
@@ -230,10 +229,9 @@ SVal ExprEngine::computeObjectUnderConstruction(
assert(!isa<BlockInvocationContext>(CallerLCtx));
}
- NodeBuilderContext CallerBldrCtx(getCoreEngine(),
- SFC->getCallSiteBlock(), CallerLCtx);
+ unsigned NVCaller = getNumVisited(CallerLCtx, SFC->getCallSiteBlock());
return computeObjectUnderConstruction(
- cast<Expr>(SFC->getCallSite()), State, &CallerBldrCtx, CallerLCtx,
+ cast<Expr>(SFC->getCallSite()), State, NVCaller, CallerLCtx,
RTC->getConstructionContext(), CallOpts);
} else {
// We are on the top frame of the analysis. We do not know where is the
@@ -273,7 +271,7 @@ SVal ExprEngine::computeObjectUnderConstruction(
EvalCallOptions PreElideCallOpts = CallOpts;
SVal V = computeObjectUnderConstruction(
- TCC->getConstructorAfterElision(), State, BldrCtx, LCtx,
+ TCC->getConstructorAfterElision(), State, NumVisitedCaller, LCtx,
TCC->getConstructionContextAfterElision(), CallOpts);
// FIXME: This definition of "copy elision has not failed" is unreliable.
@@ -346,7 +344,7 @@ SVal ExprEngine::computeObjectUnderConstruction(
CallEventManager &CEMgr = getStateManager().getCallEventManager();
auto getArgLoc = [&](CallEventRef<> Caller) -> std::optional<SVal> {
const LocationContext *FutureSFC =
- Caller->getCalleeStackFrame(BldrCtx->blockCount());
+ Caller->getCalleeStackFrame(NumVisitedCaller);
// Return early if we are unable to reliably foresee
// the future stack frame.
if (!FutureSFC)
@@ -365,7 +363,7 @@ SVal ExprEngine::computeObjectUnderConstruction(
// because this-argument is implemented as a normal argument in
// operator call expressions but not in operator declarations.
const TypedValueRegion *TVR = Caller->getParameterLocation(
- *Caller->getAdjustedParameterIndex(Idx), BldrCtx->blockCount());
+ *Caller->getAdjustedParameterIndex(Idx), NumVisitedCaller);
if (!TVR)
return std::nullopt;
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp
index a4a22ce10952c..f6ba3699312ec 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngineCallAndReturn.cpp
@@ -247,30 +247,34 @@ ProgramStateRef ExprEngine::removeStateTraitsUsedForArrayEvaluation(
/// 1. CallExitBegin (triggers the start of call exit sequence)
/// 2. Bind the return value
/// 3. Run Remove dead bindings to clean up the dead symbols from the callee.
-/// 4. CallExitEnd (switch to the caller context)
+/// 4. CallExitEnd
/// 5. PostStmt<CallExpr>
+/// Steps 1-3 happen in the callee context; then there is a context switch,
+/// and steps 4-5 happen in the caller context.
void ExprEngine::processCallExit(ExplodedNode *CEBNode) {
// Step 1 CEBNode was generated before the call.
- const StackFrameContext *calleeCtx = CEBNode->getStackFrame();
+ const StackFrameContext *CalleeCtx = CEBNode->getStackFrame();
// The parent context might not be a stack frame, so make sure we
// look up the first enclosing stack frame.
- const StackFrameContext *callerCtx =
- calleeCtx->getParent()->getStackFrame();
+ const StackFrameContext *CallerCtx = CalleeCtx->getParent()->getStackFrame();
- const Stmt *CE = calleeCtx->getCallSite();
- ProgramStateRef state = CEBNode->getState();
+ const Stmt *CE = CalleeCtx->getCallSite();
+ ProgramStateRef State = CEBNode->getState();
// Find the last statement in the function and the corresponding basic block.
- const Stmt *LastSt = nullptr;
- const CFGBlock *Blk = nullptr;
- std::tie(LastSt, Blk) = getLastStmt(CEBNode);
+ auto [LastSt, Blk] = getLastStmt(CEBNode);
+
+ const CFGBlock *PrePurgeBlock =
+ isa_and_nonnull<ReturnStmt>(LastSt) ? Blk : &CEBNode->getCFG().getExit();
+ // The first half of this process happens in the callee context:
+ setCurrLocationContextAndBlock(CalleeCtx, PrePurgeBlock);
- // Generate a CallEvent /before/ cleaning the state, so that we can get the
+ // Generate a CallEvent /before/ cleaning the State, so that we can get the
// correct value for 'this' (if necessary).
CallEventManager &CEMgr = getStateManager().getCallEventManager();
- CallEventRef<> Call = CEMgr.getCaller(calleeCtx, state);
+ CallEventRef<> Call = CEMgr.getCaller(CalleeCtx, State);
- // Step 2: generate node with bound return value: CEBNode -> BindedRetNode.
+ // Step 2: generate node with bound return value: CEBNode -> BoundRetNode.
// If this variable is set to 'true' the analyzer will evaluate the call
// statement we are about to exit again, instead of continuing the execution
@@ -281,11 +285,11 @@ void ExprEngine::processCallExit(ExplodedNode *CEBNode) {
if (const auto *DtorDecl =
dyn_cast_or_null<CXXDestructorDecl>(Call->getDecl())) {
- if (auto Idx = getPendingArrayDestruction(state, callerCtx)) {
+ if (auto Idx = getPendingArrayDestruction(State, CallerCtx)) {
ShouldRepeatCall = *Idx > 0;
- auto ThisVal = svalBuilder.getCXXThis(DtorDecl->getParent(), calleeCtx);
- state = state->killBinding(ThisVal);
+ auto ThisVal = svalBuilder.getCXXThis(DtorDecl->getParent(), CalleeCtx);
+ State = State->killBinding(ThisVal);
}
}
@@ -293,12 +297,12 @@ void ExprEngine::processCallExit(ExplodedNode *CEBNode) {
if (CE) {
if (const ReturnStmt *RS = dyn_cast_or_null<ReturnStmt>(LastSt)) {
const LocationContext *LCtx = CEBNode->getLocationContext();
- SVal V = state->getSVal(RS, LCtx);
+ SVal V = State->getSVal(RS, LCtx);
// Ensure that the return type matches the type of the returned Expr.
- if (wasDifferentDeclUsedForInlining(Call, calleeCtx)) {
+ if (wasDifferentDeclUsedForInlining(Call, CalleeCtx)) {
QualType ReturnedTy =
- CallEvent::getDeclaredResultType(calleeCtx->getDecl());
+ CallEvent::getDeclaredResultType(CalleeCtx->getDecl());
if (!ReturnedTy.isNull()) {
if (const Expr *Ex = dyn_cast<Expr>(CE)) {
V = adjustReturnValue(V, Ex->getType(), ReturnedTy,
@@ -307,18 +311,18 @@ void ExprEngine::processCallExit(ExplodedNode *CEBNode) {
}
}
- state = state->BindExpr(CE, callerCtx, V);
+ State = State->BindExpr(CE, CallerCtx, V);
}
// Bind the constructed object value to CXXConstructExpr.
if (const CXXConstructExpr *CCE = dyn_cast<CXXConstructExpr>(CE)) {
loc::MemRegionVal This =
- svalBuilder.getCXXThis(CCE->getConstructor()->getParent(), calleeCtx);
- SVal ThisV = state->getSVal(This);
- ThisV = state->getSVal(ThisV.castAs<Loc>());
- state = state->BindExpr(CCE, callerCtx, ThisV);
+ svalBuilder.getCXXThis(CCE->getConstructor()->getParent(), CalleeCtx);
+ SVal ThisV = State->getSVal(This);
+ ThisV = State->getSVal(ThisV.castAs<Loc>());
+ State = State->BindExpr(CCE, CallerCtx, ThisV);
- ShouldRepeatCall = shouldRepeatCtorCall(state, CCE, callerCtx);
+ ShouldRepeatCall = shouldRepeatCtorCall(State, CCE, CallerCtx);
}
if (const auto *CNE = dyn_cast<CXXNewExpr>(CE)) {
@@ -327,92 +331,85 @@ void ExprEngine::processCallExit(ExplodedNode *CEBNode) {
// region for later use.
// Additionally cast the return value of the inlined operator new
// (which is of type 'void *') to the correct object type.
- SVal AllocV = state->getSVal(CNE, callerCtx);
+ SVal AllocV = State->getSVal(CNE, CallerCtx);
AllocV = svalBuilder.evalCast(
AllocV, CNE->getType(),
getContext().getPointerType(getContext().VoidTy));
- state = addObjectUnderConstruction(state, CNE, calleeCtx->getParent(),
+ State = addObjectUnderConstruction(State, CNE, CalleeCtx->getParent(),
AllocV);
}
}
if (!ShouldRepeatCall) {
- state = removeStateTraitsUsedForArrayEvaluation(
- state, dyn_cast_or_null<CXXConstructExpr>(CE), callerCtx);
+ State = removeStateTraitsUsedForArrayEvaluation(
+ State, dyn_cast_or_null<CXXConstructExpr>(CE), CallerCtx);
}
- // Step 3: BindedRetNode -> CleanedNodes
+ // Step 3: BoundRetNode -> CleanedNodes
// If we can find a statement and a block in the inlined function, run remove
// dead bindings before returning from the call. This is important to ensure
// that we report the issues such as leaks in the stack contexts in which
// they occurred.
ExplodedNodeSet CleanedNodes;
if (LastSt && Blk && AMgr.options.AnalysisPurgeOpt != PurgeNone) {
- static SimpleProgramPointTag retValBind("ExprEngine", "Bind Return Value");
+ static SimpleProgramPointTag RetValBind("ExprEngine", "Bind Return Value");
auto Loc = isa<ReturnStmt>(LastSt)
- ? ProgramPoint{PostStmt(LastSt, calleeCtx, &retValBind)}
- : ProgramPoint{EpsilonPoint(calleeCtx, /*Data1=*/nullptr,
- /*Data2=*/nullptr, &retValBind)};
- const CFGBlock *PrePurgeBlock =
- isa<ReturnStmt>(LastSt) ? Blk : &CEBNode->getCFG().getExit();
- bool isNew;
- ExplodedNode *BindedRetNode = G.getNode(Loc, state, false, &isNew);
- BindedRetNode->addPredecessor(CEBNode, G);
- if (!isNew)
+ ? ProgramPoint{PostStmt(LastSt, CalleeCtx, &RetValBind)}
+ : ProgramPoint{EpsilonPoint(CalleeCtx, /*Data1=*/nullptr,
+ /*Data2=*/nullptr, &RetValBind)};
+
+ ExplodedNode *BoundRetNode = Engine.makeNode(Loc, State, CEBNode);
+ if (!BoundRetNode)
return;
- NodeBuilderContext Ctx(getCoreEngine(), PrePurgeBlock, BindedRetNode);
- currBldrCtx = &Ctx;
- // Here, we call the Symbol Reaper with 0 statement and callee location
- // context, telling it to clean up everything in the callee's context
- // (and its children). We use the callee's function body as a diagnostic
- // statement, with which the program point will be associated.
- removeDead(BindedRetNode, CleanedNodes, nullptr, calleeCtx,
- calleeCtx->getAnalysisDeclContext()->getBody(),
- ProgramPoint::PostStmtPurgeDeadSymbolsKind);
- currBldrCtx = nullptr;
+ // We call removeDead in the context of the callee.
+ removeDead(
+ BoundRetNode, CleanedNodes, /*ReferenceStmt=*/nullptr, CalleeCtx,
+ /*DiagnosticStmt=*/CalleeCtx->getAnalysisDeclContext()->getBody(),
+ ProgramPoint::PostStmtPurgeDeadSymbolsKind);
} else {
CleanedNodes.Add(CEBNode);
}
+ // The second half of this process happens in the caller context. This is an
+ // exception to the general rule that the current LocationContext and Block
+ // stay the same within a single call to dispatchWorkItem.
+ resetCurrLocationContextAndBlock();
+ setCurrLocationContextAndBlock(CallerCtx, CalleeCtx->getCallSiteBlock());
+ SaveAndRestore CBISave(currStmtIdx, CalleeCtx->getIndex());
+
for (ExplodedNode *N : CleanedNodes) {
- // Step 4: Generate the CallExit and leave the callee's context.
+ // Step 4: Generate the CallExitEnd node.
// CleanedNodes -> CEENode
- CallExitEnd Loc(calleeCtx, callerCtx);
- bool isNew;
- ProgramStateRef CEEState = (N == CEBNode) ? state : N->getState();
+ CallExitEnd Loc(CalleeCtx, CallerCtx);
+ ProgramStateRef CEEState = (N == CEBNode) ? State : N->getState();
- ExplodedNode *CEENode = G.getNode(Loc, CEEState, false, &isNew);
- CEENode->addPredecessor(N, G);
- if (!isNew)
+ ExplodedNode *CEENode = Engine.makeNode(Loc, CEEState, N);
+ if (!CEENode)
return;
// Step 5: Perform the post-condition check of the CallExpr and enqueue the
// result onto the work list.
// CEENode -> Dst -> WorkList
- NodeBuilderContext Ctx(Engine, calleeCtx->getCallSiteBlock(), CEENode);
- SaveAndRestore<const NodeBuilderContext *> NBCSave(currBldrCtx, &Ctx);
- SaveAndRestore CBISave(currStmtIdx, calleeCtx->getIndex());
CallEventRef<> UpdatedCall = Call.cloneWithState(CEEState);
+ ExplodedNodeSet DstPostPostCallCallback;
+ getCheckerManager().runCheckersForPostCall(DstPostPostCallCallback, CEENode,
+ *UpdatedCall, *this,
+ /*wasInlined=*/true);
ExplodedNodeSet DstPostCall;
if (llvm::isa_and_nonnull<CXXNewExpr>(CE)) {
- ExplodedNodeSet DstPostPostCallCallback;
- getCheckerManager().runCheckersForPostCall(DstPostPostCallCallback,
- CEENode, *UpdatedCall, *this,
- /*wasInlined=*/true);
for (ExplodedNode *I : DstPostPostCallCallback) {
getCheckerManager().runCheckersForNewAllocator(
cast<CXXAllocatorCall>(*UpdatedCall), DstPostCall, I, *this,
/*wasInlined=*/true);
}
} else {
- getCheckerManager().runCheckersForPostCall(DstPostCall, CEENode,
- *UpdatedCall, *this,
- /*wasInlined=*/true);
+ DstPostCall.insert(DstPostPostCallCallback);
}
+
ExplodedNodeSet Dst;
if (const ObjCMethodCall *Msg = dyn_cast<ObjCMethodCall>(Call)) {
getCheckerManager().runCheckersForPostObjCMessage(Dst, DstPostCall, *Msg,
@@ -428,11 +425,11 @@ void ExprEngine::processCallExit(ExplodedNode *CEBNode) {
}
// Enqueue the next element in the block.
- for (ExplodedNodeSet::iterator PSI = Dst.begin(), PSE = Dst.end();
- PSI != PSE; ++PSI) {
- unsigned Idx = calleeCtx->getIndex() + (ShouldRepeatCall ? 0 : 1);
+ for (ExplodedNode *DstNode : Dst) {
+ unsigned Idx = CalleeCtx->getIndex() + (ShouldRepeatCall ? 0 : 1);
- Engine.getWorkList()->enqueue(*PSI, calleeCtx->getCallSiteBlock(), Idx);
+ Engine.getWorkList()->enqueue(DstNode, CalleeCtx->getCallSiteBlock(),
+ Idx);
}
}
}
diff --git a/clang/test/AST/ByteCode/cxx20.cpp b/clang/test/AST/ByteCode/cxx20.cpp
index 139b6c873adce..9800fe01fcaf5 100644
--- a/clang/test/AST/ByteCode/cxx20.cpp
+++ b/clang/test/AST/ByteCode/cxx20.cpp
@@ -1304,3 +1304,30 @@ namespace PointerCmp {
static_assert((void*)(&a.i + 1) != (void*)(&a.i[1])); // expected-error {{static assertion failed}}
static_assert((void*)(&a.i[2] + 1) == (void*)(&a.i[3]));
}
+
+namespace ExpandOnOPTEPointers {
+
+ template <class _BidirectionalIterator>
+ constexpr void inplace_merge(_BidirectionalIterator __first,
+ _BidirectionalIterator __middle) {
+
+ if (__first != __middle)
+ ++__first;
+ }
+ template <class> struct bidirectional_iterator {
+ int *it_;
+ constexpr void operator++() { ++it_; }
+
+ friend constexpr bool operator!=(bidirectional_iterator x,
+ bidirectional_iterator y) {
+ return x.it_ != y.it_;
+ }
+ };
+ constexpr bool test() {
+ int *ia = new int[0];
+ inplace_merge(bidirectional_iterator<int *>(ia), bidirectional_iterator<int *>(ia + 0));
+ delete[] ia;
+ return true;
+ }
+ static_assert(test());
+}
diff --git a/clang/test/AST/ByteCode/libcxx/end-primitive-array-root-lifetime.cpp b/clang/test/AST/ByteCode/libcxx/end-primitive-array-root-lifetime.cpp
index 8cf50abcddec1..f5ba94d429960 100644
--- a/clang/test/AST/ByteCode/libcxx/end-primitive-array-root-lifetime.cpp
+++ b/clang/test/AST/ByteCode/libcxx/end-primitive-array-root-lifetime.cpp
@@ -15,7 +15,7 @@ concept range = requires { end; };
template <class _Tp>
concept input_range = input_iterator<_Tp>;
template <class>
-concept forward_range = true;
+concept forward_range = false;
template <range _Rp> struct owning_view {
_Rp __r_;
};
diff --git a/clang/test/AST/ByteCode/loops.cpp b/clang/test/AST/ByteCode/loops.cpp
index 38ab5613e1cbd..ff80ef5c6e2ed 100644
--- a/clang/test/AST/ByteCode/loops.cpp
+++ b/clang/test/AST/ByteCode/loops.cpp
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++14 -verify %s
-// RUN: %clang_cc1 -std=c++14 -verify=ref %s
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++14 -verify %s
+// RUN: %clang_cc1 -std=c++14 -verify=ref %s
// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++20 -verify=expected-cpp20 %s
-// RUN: %clang_cc1 -std=c++20 -verify=ref %s
+// RUN: %clang_cc1 -std=c++20 -verify=ref %s
namespace WhileLoop {
constexpr int f() {
@@ -351,4 +351,14 @@ namespace Scopes {
return n;
}
static_assert(foo() == 14, "");
+
+ constexpr bool WhileConditionDecl() {
+ bool b = true;
+ for (int i = 0; i < 3; ++i) {
+ while (int x = 0) {
+ }
+ }
+ return true;
+ }
+ static_assert(WhileConditionDecl(), "");
}
diff --git a/clang/test/AST/dump.cpp b/clang/test/AST/dump.cpp
index bbd388cbf0957..15c655388fac4 100644
--- a/clang/test/AST/dump.cpp
+++ b/clang/test/AST/dump.cpp
@@ -1,5 +1,6 @@
// RUN: %clang_cc1 -verify -fopenmp -ast-dump %s | FileCheck %s -implicit-check-not=openmp_structured_block
// RUN: %clang_cc1 -verify -fopenmp-simd -ast-dump %s | FileCheck %s -implicit-check-not=openmp_structured_block
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=60 -ast-dump %s | FileCheck %s --check-prefix=OMP60
// expected-no-diagnostics
int ga, gb;
@@ -72,7 +73,13 @@ void foo();
// CHECK-NEXT: |-OMPDeclareSimdDeclAttr {{.+}} <line:[[@LINE-4]]:1, col:34> Implicit BS_Inbranch
// CHECK: `-OMPDeclareSimdDeclAttr {{.+}} <line:[[@LINE-6]]:1, col:25> Implicit BS_Undefined
+// Beginning in OpenMP 5.2, this form is deprecated and generates a warning
+// suggesting the 'begin declare target' form instead.
+#if _OPENMP <= 202011
#pragma omp declare target
+#else
+#pragma omp begin declare target
+#endif // _OPENMP
int bar() {
int f;
return f;
@@ -86,4 +93,22 @@ int bar() {
// CHECK-NEXT: | `-ReturnStmt {{.+}} <line:[[@LINE-8]]:3, col:10>
// CHECK-NEXT: | `-ImplicitCastExpr {{.+}} <col:10> 'int' <LValueToRValue>
// CHECK-NEXT: | `-DeclRefExpr {{.+}} <col:10> 'int' lvalue Var {{.+}} 'f' 'int'
-// CHECK-NEXT: `-OMPDeclareTargetDeclAttr {{.+}} <line:75:21> Implicit MT_To DT_Any 1
+// CHECK-NEXT: `-OMPDeclareTargetDeclAttr {{.+}} Implicit MT_To DT_Any 1
+
+#if _OPENMP >= 202411
+int v_enter;
+#pragma omp declare target enter(v_enter) device_type(nohost)
+int v_link;
+#pragma omp declare target link(v_link) device_type(host)
+int v_local;
+#pragma omp declare target local(v_local)
+#endif
+
+// OMP60: FunctionDecl {{.+}} bar 'int ()'
+// OMP60: `-OMPDeclareTargetDeclAttr {{.+}} Implicit MT_Enter DT_Any 1
+// OMP60: VarDecl {{.+}} v_enter 'int'
+// OMP60-NEXT: `-OMPDeclareTargetDeclAttr {{.+}} Implicit MT_Enter DT_NoHost 4294967295
+// OMP60: VarDecl {{.+}} v_link 'int'
+// OMP60-NEXT: `-OMPDeclareTargetDeclAttr {{.+}} Implicit MT_Link DT_Host 4294967295
+// OMP60: VarDecl {{.+}} v_local 'int'
+// OMP60-NEXT: `-OMPDeclareTargetDeclAttr {{.+}} Implicit MT_Local DT_Any 4294967295
diff --git a/clang/test/Analysis/Scalable/command-line-interface.cpp b/clang/test/Analysis/Scalable/command-line-interface.cpp
new file mode 100644
index 0000000000000..a632f487f2bb7
--- /dev/null
+++ b/clang/test/Analysis/Scalable/command-line-interface.cpp
@@ -0,0 +1,22 @@
+// DEFINE: %{filecheck} = FileCheck %s --match-full-lines --check-prefix
+
+// The flags should behave the same way on the clang driver and also on CC1.
+
+// RUN: not %clang -fsyntax-only %s --ssaf-tu-summary-file=foobar 2>&1 | %{filecheck}=NOT-MATCHING-THE-PATTERN
+// RUN: not %clang_cc1 -fsyntax-only %s --ssaf-tu-summary-file=foobar 2>&1 | %{filecheck}=NOT-MATCHING-THE-PATTERN
+// RUN: not %clang -fsyntax-only %s --ssaf-tu-summary-file=%t.ssaf.unknownfmt 2>&1 | %{filecheck}=UNKNOWN-FILE-FORMAT
+// RUN: not %clang_cc1 -fsyntax-only %s --ssaf-tu-summary-file=%t.ssaf.unknownfmt 2>&1 | %{filecheck}=UNKNOWN-FILE-FORMAT
+// RUN: not %clang -fsyntax-only %s --ssaf-tu-summary-file=%t.ssaf.json 2>&1 | %{filecheck}=NO-EXTRACTORS-ENABLED
+// RUN: not %clang_cc1 -fsyntax-only %s --ssaf-tu-summary-file=%t.ssaf.json 2>&1 | %{filecheck}=NO-EXTRACTORS-ENABLED
+// RUN: not %clang -fsyntax-only %s --ssaf-tu-summary-file=%t.ssaf.json --ssaf-extract-summaries=extractor1 2>&1 | %{filecheck}=NO-EXTRACTOR-WITH-NAME
+// RUN: not %clang_cc1 -fsyntax-only %s --ssaf-tu-summary-file=%t.ssaf.json --ssaf-extract-summaries=extractor1 2>&1 | %{filecheck}=NO-EXTRACTOR-WITH-NAME
+// RUN: not %clang -fsyntax-only %s --ssaf-tu-summary-file=%t.ssaf.json --ssaf-extract-summaries=extractor1,extractor2 2>&1 | %{filecheck}=NO-EXTRACTORS-WITH-NAME
+// RUN: not %clang_cc1 -fsyntax-only %s --ssaf-tu-summary-file=%t.ssaf.json --ssaf-extract-summaries=extractor1,extractor2 2>&1 | %{filecheck}=NO-EXTRACTORS-WITH-NAME
+
+void empty() {}
+
+// NOT-MATCHING-THE-PATTERN: error: failed to parse the value of '--ssaf-tu-summary-file=foobar' the value must follow the '<path>.<format>' pattern [-Wscalable-static-analysis-framework]
+// UNKNOWN-FILE-FORMAT: error: unknown output summary file format 'unknownfmt' specified by '--ssaf-tu-summary-file={{.+}}.ssaf.unknownfmt' [-Wscalable-static-analysis-framework]
+// NO-EXTRACTORS-ENABLED: error: must enable some summary extractors using the '--ssaf-extract-summaries=' option [-Wscalable-static-analysis-framework]
+// NO-EXTRACTOR-WITH-NAME: error: no summary extractor was registered with name: extractor1 [-Wscalable-static-analysis-framework]
+// NO-EXTRACTORS-WITH-NAME: error: no summary extractors were registered with name: extractor1, extractor2 [-Wscalable-static-analysis-framework]
diff --git a/clang/test/Analysis/Scalable/downgradable-errors.cpp b/clang/test/Analysis/Scalable/downgradable-errors.cpp
new file mode 100644
index 0000000000000..494e3e71092ac
--- /dev/null
+++ b/clang/test/Analysis/Scalable/downgradable-errors.cpp
@@ -0,0 +1,15 @@
+// DEFINE: %{filecheck} = FileCheck %s --match-full-lines --check-prefix
+
+// RUN: not %clang -fsyntax-only %s --ssaf-tu-summary-file=foobar 2>&1 | %{filecheck}=DEFAULT-ERROR
+// RUN: %clang -fsyntax-only %s --ssaf-tu-summary-file=foobar -Wno-error=scalable-static-analysis-framework 2>&1 | %{filecheck}=DEMOTED-TO-WARNING
+// RUN: %clang -fsyntax-only %s --ssaf-tu-summary-file=foobar -Wno-scalable-static-analysis-framework 2>&1 | count 0
+
+// This test demonstrates that the "scalable-static-analysis-framework" diagnostics can be downgraded or completely silenced with the right flags.
+
+void empty() {}
+
+// DEFAULT-ERROR: error: failed to parse the value of '--ssaf-tu-summary-file=foobar' the value must follow the '<path>.<format>' pattern [-Wscalable-static-analysis-framework]
+// DEFAULT-ERROR: 1 error generated.
+
+// DEMOTED-TO-WARNING: warning: failed to parse the value of '--ssaf-tu-summary-file=foobar' the value must follow the '<path>.<format>' pattern [-Wscalable-static-analysis-framework]
+// DEMOTED-TO-WARNING: 1 warning generated.
diff --git a/clang/test/Analysis/Scalable/help.cpp b/clang/test/Analysis/Scalable/help.cpp
new file mode 100644
index 0000000000000..a2e72cd198af7
--- /dev/null
+++ b/clang/test/Analysis/Scalable/help.cpp
@@ -0,0 +1,21 @@
+// DEFINE: %{filecheck} = FileCheck %s --match-full-lines --check-prefix
+
+// RUN: %clang --help 2>&1 | %{filecheck}=HELP
+// RUN: %clang_cc1 --help 2>&1 | %{filecheck}=HELP
+
+// HELP: --ssaf-extract-summaries=<summary-names>
+// HELP-NEXT: Comma-separated list of summary names to extract
+// HELP-NEXT: --ssaf-list-extractors Display the list of available SSAF summary extractors
+// HELP-NEXT: --ssaf-list-formats Display the list of available SSAF serialization formats
+// HELP-NEXT: --ssaf-tu-summary-file=<path>.<format>
+// HELP-NEXT: The output file for the extracted summaries. The extension selects which file format to use.
+
+// FIXME: --ssaf-list-{extractors,formats} only work with the `clang` driver.
+// RUN: %clang --ssaf-list-extractors 2>&1 | %{filecheck}=LIST-EXTRACTORS
+// LIST-EXTRACTORS: OVERVIEW: Available SSAF summary extractors:
+
+// RUN: %clang --ssaf-list-formats 2>&1 | %{filecheck}=LIST-FORMATS
+// LIST-FORMATS: OVERVIEW: Available SSAF serialization formats:
+// LIST-FORMATS: json - JSON serialization format
+
+// RUN: %clang --ssaf-list-extractors --ssaf-list-formats 2>&1 | %{filecheck}=LIST-EXTRACTORS,LIST-FORMATS
diff --git a/clang/test/Analysis/Scalable/ssaf-format/list.test b/clang/test/Analysis/Scalable/ssaf-format/list.test
index 47a678766aed1..4d389d78543ef 100644
--- a/clang/test/Analysis/Scalable/ssaf-format/list.test
+++ b/clang/test/Analysis/Scalable/ssaf-format/list.test
@@ -5,5 +5,5 @@
// CHECK: Registered serialization formats:
// CHECK-EMPTY:
-// CHECK-NEXT: 1. JSON JSON serialization format
+// CHECK-NEXT: 1. json JSON serialization format
// CHECK-NEXT: Analyses: (none)
diff --git a/clang/test/Analysis/clang-suppress/class-template-specializations.cpp b/clang/test/Analysis/clang-suppress/class-template-specializations.cpp
new file mode 100644
index 0000000000000..94f18a41164d6
--- /dev/null
+++ b/clang/test/Analysis/clang-suppress/class-template-specializations.cpp
@@ -0,0 +1,348 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify %s
+
+void clang_analyzer_warnIfReached();
+
+// ============================================================================
+// Group A: Basic class template — attribute on primary
+// ============================================================================
+
+// Placeholder types for triggering instantiations.
+// - Type{A,B} should match an unconstrained template type parameter.
+// - Specialized{A,B} should match some specialization pattern.
+struct TypeA{};
+struct TypeB{};
+struct SpecializedA{};
+struct SpecializedB{};
+
+template <typename T>
+class [[clang::suppress]] A_Primary {
+public:
+ void inline_method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ void outline_method();
+ static void static_inline() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ static void static_outline();
+};
+
+template <typename T>
+void A_Primary<T>::outline_method() {
+ // Out-of-line: lexical context is the translation unit.
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+template <typename T>
+void A_Primary<T>::static_outline() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+void test_A() {
+ A_Primary<TypeA>().inline_method();
+ A_Primary<TypeA>().outline_method();
+ A_Primary<TypeA>::static_inline();
+ A_Primary<TypeA>::static_outline();
+ // Different instantiation.
+ A_Primary<TypeB>().inline_method();
+}
+
+// ============================================================================
+// Group B: Explicit full specialization — attribute isolation
+// ============================================================================
+
+// --- B1: attribute on primary only ---
+
+template <typename T>
+class [[clang::suppress]] B1_AttrOnPrimary {
+public:
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+
+// Explicit specialization is independent — NOT suppressed.
+template <>
+struct B1_AttrOnPrimary<SpecializedA> {
+ void method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+void test_B1() {
+ B1_AttrOnPrimary<TypeA>().method(); // suppressed (primary)
+ B1_AttrOnPrimary<SpecializedA>().method(); // warns (spec, no attr)
+}
+
+// --- B2: attribute on specialization only ---
+
+template <typename T>
+struct B2_AttrOnSpec {
+ void method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+template <>
+class [[clang::suppress]] B2_AttrOnSpec<SpecializedA> {
+public:
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+
+void test_B2() {
+ B2_AttrOnSpec<TypeA>().method(); // warns (primary, no attr)
+ B2_AttrOnSpec<SpecializedA>().method(); // suppressed (spec has attr)
+}
+
+// --- B3: attribute on both primary and specialization ---
+
+template <typename T>
+class [[clang::suppress]] B3_AttrOnBoth {
+public:
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+
+template <>
+class [[clang::suppress]] B3_AttrOnBoth<SpecializedA> {
+public:
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+
+void test_B3() {
+ B3_AttrOnBoth<TypeA>().method(); // suppressed
+ B3_AttrOnBoth<SpecializedA>().method(); // suppressed
+}
+
+// ============================================================================
+// Group C: Partial specializations
+// ============================================================================
+
+// --- C1: attribute on partial specialization only ---
+
+template <typename T, typename U>
+struct C1_AttrOnPartial {
+ void method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+template <typename T>
+class [[clang::suppress]] C1_AttrOnPartial<T, SpecializedA> {
+public:
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+
+void test_C1() {
+ C1_AttrOnPartial<TypeA, TypeA>().method(); // warns (primary, no attr)
+ C1_AttrOnPartial<TypeA, SpecializedA>().method(); // suppressed (partial spec)
+}
+
+// --- C2: attribute on primary, partial spec has none ---
+
+template <typename T, typename U>
+class [[clang::suppress]] C2_AttrOnPrimary {
+public:
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+
+template <typename T>
+struct C2_AttrOnPrimary<T, SpecializedA> {
+ void method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+void test_C2() {
+ C2_AttrOnPrimary<TypeA, TypeA>().method(); // suppressed (primary)
+ C2_AttrOnPrimary<TypeA, SpecializedA>().method(); // warns (partial spec, no attr)
+}
+
+// --- C3: two partial specializations, only one suppressed ---
+
+template <typename T, typename U>
+struct C3_TwoPartials {
+ void method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+template <typename T>
+class [[clang::suppress]] C3_TwoPartials<T, SpecializedA> {
+public:
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+
+template <typename T>
+struct C3_TwoPartials<T, SpecializedB> {
+ void method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+void test_C3() {
+ C3_TwoPartials<TypeA, TypeA>().method(); // warns (primary)
+ C3_TwoPartials<TypeA, SpecializedA>().method(); // suppressed (first partial)
+ C3_TwoPartials<TypeA, SpecializedB>().method(); // warns (second partial, no attr)
+}
+
+// ============================================================================
+// Group D: Forward-declared class template (chooseDefinitionRedecl path)
+// ============================================================================
+
+// The template is forward-declared, then defined. chooseDefinitionRedecl()
+// must find the definition among the redeclarations.
+
+// --- D1: Forward-declared without attribute, defined with attribute ---
+template <typename T>
+class D1_ForwardDeclared;
+
+template <typename T>
+class [[clang::suppress]] D1_ForwardDeclared {
+public:
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+
+void test_D1() {
+ D1_ForwardDeclared<TypeA>().method();
+}
+
+// --- D2: Forward-declared without attribute, defined without attribute ---
+template <typename T>
+struct D2_ForwardDeclared_NoAttr;
+
+template <typename T>
+struct D2_ForwardDeclared_NoAttr {
+ void method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+void test_D2() {
+ D2_ForwardDeclared_NoAttr<TypeA>().method();
+}
+
+// ============================================================================
+// Group E: Specialization with out-of-line (OOL) methods
+// ============================================================================
+
+template <typename T>
+class [[clang::suppress]] E_SpecWithOOL {
+public:
+ void inline_method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ void outline_method();
+};
+
+template <typename T>
+void E_SpecWithOOL<T>::outline_method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+// Explicit specialization with attribute and out-of-line method.
+template <>
+class [[clang::suppress]] E_SpecWithOOL<SpecializedA> {
+public:
+ void inline_method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ void outline_method();
+};
+
+// Out-of-line for the specialization — not suppressed.
+void E_SpecWithOOL<SpecializedA>::outline_method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+void test_E() {
+ E_SpecWithOOL<TypeA>().inline_method();
+ E_SpecWithOOL<TypeA>().outline_method();
+ E_SpecWithOOL<SpecializedA>().inline_method();
+ E_SpecWithOOL<SpecializedA>().outline_method();
+}
+
+// ============================================================================
+// Group F: Nested class inside class template specialization
+// ============================================================================
+
+template <typename T>
+class [[clang::suppress]] F_Outer {
+public:
+ struct Inner {
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ };
+};
+
+template <typename T>
+struct F_Outer_NoAttr {
+ struct Inner {
+ void method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+ };
+};
+
+void test_F() {
+ F_Outer<TypeA>::Inner().method();
+ F_Outer_NoAttr<TypeA>::Inner().method();
+}
+
+// ============================================================================
+// Group G: Class template with default template arguments
+// ============================================================================
+
+template <typename T, typename U = TypeA>
+class [[clang::suppress]] G_WithDefault {
+public:
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+
+template <typename T, typename U = TypeA>
+struct G_WithDefault_NoAttr {
+ void method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+void test_G() {
+ G_WithDefault<TypeA>().method(); // uses default U=TypeA
+ G_WithDefault<TypeA, TypeB>().method(); // explicit U=TypeB
+ G_WithDefault_NoAttr<TypeA>().method(); // uses default U=TypeA
+}
+
+// ============================================================================
+// Group H: Explicit instantiation directive
+// ============================================================================
+
+template <typename T>
+class [[clang::suppress]] H_ExplicitInst {
+public:
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+
+// Explicit instantiation.
+template class H_ExplicitInst<SpecializedA>;
+
+void test_H() {
+ H_ExplicitInst<SpecializedA>().method();
+}
diff --git a/clang/test/Analysis/clang-suppress/classes.cpp b/clang/test/Analysis/clang-suppress/classes.cpp
new file mode 100644
index 0000000000000..4c285880c86ea
--- /dev/null
+++ b/clang/test/Analysis/clang-suppress/classes.cpp
@@ -0,0 +1,75 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify %s
+
+void clang_analyzer_warnIfReached();
+
+// Systematic tests for [[clang::suppress]] on non-template classes and methods.
+
+// ============================================================================
+// Group A: Attribute on class — inline method suppressed, out-of-line not
+// ============================================================================
+
+// Placeholder type for triggering instantiations.
+struct Type{};
+
+class [[clang::suppress]] SuppressedClass {
+ void foo() {
+ clang_analyzer_warnIfReached(); // no-warning: inline method in suppressed class
+ }
+
+ void bar();
+};
+
+void SuppressedClass::bar() {
+ clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+}
+
+// ============================================================================
+// Group B: Attribute on method declaration vs definition
+// ============================================================================
+
+class SuppressedMethodClass {
+ // Attribute on the inline definition — suppressed.
+ [[clang::suppress]] void foo() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+
+ // Attribute on the in-class declaration only — NOT honored at out-of-line def.
+ [[clang::suppress]] void bar1();
+
+ // No attribute on the in-class declaration.
+ void bar2();
+};
+
+void SuppressedMethodClass::bar1() {
+ clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}}
+}
+
+// Attribute on the out-of-line definition — suppressed.
+[[clang::suppress]]
+void SuppressedMethodClass::bar2() {
+ clang_analyzer_warnIfReached(); // no-warning
+}
+
+// ============================================================================
+// Group C: Template member function with early instantiation
+// ============================================================================
+
+// The suppression mechanism walks the lexical DeclContext chain to find
+// suppression attributes. This test verifies that the walk follows template
+// instantiation patterns (not just primary templates) when the instantiation
+// point precedes the definition.
+
+struct Clazz {
+ template <typename T>
+ static void templated_memfn();
+};
+
+// This must come before the 'templated_memfn' is defined!
+void instantiate() {
+ Clazz::templated_memfn<Type>();
+}
+
+template <typename T>
+void Clazz::templated_memfn() {
+ [[clang::suppress]] clang_analyzer_warnIfReached(); // no-warning
+}
diff --git a/clang/test/Analysis/clang-suppress/diagnostic-identifiers.cpp b/clang/test/Analysis/clang-suppress/diagnostic-identifiers.cpp
new file mode 100644
index 0000000000000..293f6fe12021c
--- /dev/null
+++ b/clang/test/Analysis/clang-suppress/diagnostic-identifiers.cpp
@@ -0,0 +1,115 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify %s
+
+void clang_analyzer_warnIfReached();
+
+// Tests for [[clang::suppress]] with diagnostic identifier arguments.
+
+// ============================================================================
+// Group A: Bare [[clang::suppress]] vs. with identifier
+// ============================================================================
+
+void bare_suppress() {
+ [[clang::suppress]] {
+ clang_analyzer_warnIfReached(); // no-warning: bare suppress works
+ }
+}
+
+void suppress_with_identifier() {
+ // FIXME: This should suppress debug.ExprInspection warnings, but currently
+ // any identifier makes the suppression a no-op.
+ [[clang::suppress("debug.ExprInspection")]] {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+}
+
+void suppress_with_wrong_identifier() {
+ // Even with the wrong checker name, the current behavior is the same:
+ // any identifier makes the suppression a no-op.
+ [[clang::suppress("alpha.SomeOtherChecker")]] {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+}
+
+// ============================================================================
+// Group B: Identifier on declarations
+// ============================================================================
+
+[[clang::suppress]] void decl_bare_suppress() {
+ clang_analyzer_warnIfReached(); // no-warning
+}
+
+// FIXME: Should suppress, but currently identifiers disable suppression.
+[[clang::suppress("debug.ExprInspection")]] void decl_with_identifier() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+// ============================================================================
+// Group C: Identifier on class
+// ============================================================================
+
+struct [[clang::suppress]] C_BareSuppressedClass {
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+
+// FIXME: Should suppress, but identifiers disable suppression.
+struct [[clang::suppress("core")]] C_IdentifierSuppressedClass {
+ void method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+// ============================================================================
+// Group D: Multiple identifiers
+// ============================================================================
+
+void multiple_identifiers() {
+ // FIXME: Multiple identifiers — currently treated as a no-op.
+ [[clang::suppress("core.NullDereference", "core.DivideZero")]] {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+}
+
+// ============================================================================
+// Group E: Empty string identifier
+// ============================================================================
+
+void empty_string_identifier() {
+ // An empty string is still a non-empty identifier list.
+ [[clang::suppress("")]] {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+}
+
+// ============================================================================
+// Group F: Mixed — bare suppress and identifier suppress in same function
+// ============================================================================
+
+void mixed_suppressions() {
+ [[clang::suppress]] {
+ clang_analyzer_warnIfReached(); // no-warning: bare suppress works
+ }
+
+ // FIXME: Should suppress too, but identifiers disable it.
+ [[clang::suppress("debug.ExprInspection")]] {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+}
+
+// ============================================================================
+// Group G: Identifier on namespace
+// ============================================================================
+
+namespace [[clang::suppress]] G_BareNS {
+ void func() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+} // namespace G_BareNS
+
+// FIXME: Should suppress, but identifiers disable it.
+namespace [[clang::suppress("core")]] G_IdentifierNS {
+ void func() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+} // namespace G_IdentifierNS
diff --git a/clang/test/Analysis/clang-suppress/friends.cpp b/clang/test/Analysis/clang-suppress/friends.cpp
new file mode 100644
index 0000000000000..acd3193eca0e2
--- /dev/null
+++ b/clang/test/Analysis/clang-suppress/friends.cpp
@@ -0,0 +1,366 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify %s
+
+void clang_analyzer_warnIfReached();
+
+// Systematic tests for [[clang::suppress]] on classes with friend declarations.
+//
+// Pruned matrix of valid combinations:
+// Axis 1: fwd-decl at namespace scope (yes / no)
+// Axis 2: body location (inline / out-of-line)
+// Axis 3: template (yes / no)
+//
+// Each case has a suppressed variant (class has [[clang::suppress]])
+// and an unsuppressed variant (class without it).
+
+// Placeholder types for triggering instantiations.
+// - Type{A,B} should match an unconstrained template type parameter.
+// - Specialized should match some specialization pattern.
+struct TypeA{};
+struct TypeB{};
+struct Specialized{};
+
+// ============================================================================
+// Group A: Non-template friend functions
+// ============================================================================
+
+// --- A1: no fwd-decl, inline body ---
+
+struct [[clang::suppress]] A1_Suppressed {
+ friend void a1_suppressed(A1_Suppressed) {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+struct A1_Unsuppressed {
+ friend void a1_unsuppressed(A1_Unsuppressed) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+void test_A1() {
+ a1_suppressed(A1_Suppressed{});
+ a1_unsuppressed(A1_Unsuppressed{});
+}
+
+// --- A2: no fwd-decl, out-of-line body ---
+
+struct [[clang::suppress]] A2_Suppressed {
+ friend void a2_suppressed(A2_Suppressed);
+};
+void a2_suppressed(A2_Suppressed) {
+ // Out-of-line: lexical parent is the translation unit, NOT the class.
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+struct A2_Unsuppressed {
+ friend void a2_unsuppressed(A2_Unsuppressed);
+};
+void a2_unsuppressed(A2_Unsuppressed) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+void test_A2() {
+ a2_suppressed(A2_Suppressed{});
+ a2_unsuppressed(A2_Unsuppressed{});
+}
+
+// --- A3: fwd-decl, inline body ---
+
+extern void a3_suppressed();
+struct [[clang::suppress]] A3_Suppressed {
+ friend void a3_suppressed() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+extern void a3_unsuppressed();
+struct A3_Unsuppressed {
+ friend void a3_unsuppressed() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+void test_A3() {
+ a3_suppressed();
+ a3_unsuppressed();
+}
+
+// --- A4: fwd-decl, out-of-line body ---
+
+extern void a4_suppressed();
+struct [[clang::suppress]] A4_Suppressed {
+ friend void a4_suppressed();
+};
+void a4_suppressed() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+extern void a4_unsuppressed();
+struct A4_Unsuppressed {
+ friend void a4_unsuppressed();
+};
+void a4_unsuppressed() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+void test_A4() {
+ a4_suppressed();
+ a4_unsuppressed();
+}
+
+// ============================================================================
+// Group B: Friend function templates (primary template)
+// ============================================================================
+
+// --- B1: no fwd-decl, inline body ---
+
+struct [[clang::suppress]] B1_Suppressed {
+ template <typename T>
+ friend void b1_suppressed(B1_Suppressed, T) {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+struct B1_Unsuppressed {
+ template <typename T>
+ friend void b1_unsuppressed(B1_Unsuppressed, T) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+void test_B1() {
+ b1_suppressed(B1_Suppressed{}, TypeA{});
+ b1_unsuppressed(B1_Unsuppressed{}, TypeA{});
+}
+
+// --- B2: no fwd-decl, out-of-line body ---
+
+struct [[clang::suppress]] B2_Suppressed {
+ template <typename T>
+ friend void b2_suppressed(B2_Suppressed, T);
+};
+template <typename T>
+void b2_suppressed(B2_Suppressed, T) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+struct B2_Unsuppressed {
+ template <typename T>
+ friend void b2_unsuppressed(B2_Unsuppressed, T);
+};
+template <typename T>
+void b2_unsuppressed(B2_Unsuppressed, T) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+void test_B2() {
+ b2_suppressed(B2_Suppressed{}, TypeA{});
+ b2_unsuppressed(B2_Unsuppressed{}, TypeA{});
+}
+
+// --- B3: fwd-decl, inline body ---
+
+template <typename T>
+extern void b3_suppressed(T);
+struct [[clang::suppress]] B3_Suppressed {
+ template <typename T>
+ friend void b3_suppressed(T) {
+ // FIXME: This should be suppressed.
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+template <typename T>
+extern void b3_unsuppressed(T);
+struct B3_Unsuppressed {
+ template <typename T>
+ friend void b3_unsuppressed(T) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+void test_B3() {
+ b3_suppressed(TypeA{});
+ b3_unsuppressed(TypeA{});
+}
+
+// --- B4: fwd-decl, out-of-line body ---
+
+template <typename T>
+extern void b4_suppressed(T);
+struct [[clang::suppress]] B4_Suppressed {
+ template <typename T>
+ friend void b4_suppressed(T);
+};
+template <typename T>
+void b4_suppressed(T) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+template <typename T>
+extern void b4_unsuppressed(T);
+struct B4_Unsuppressed {
+ template <typename T>
+ friend void b4_unsuppressed(T);
+};
+template <typename T>
+void b4_unsuppressed(T) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+void test_B4() {
+ b4_suppressed(TypeA{});
+ b4_unsuppressed(TypeA{});
+}
+
+// ============================================================================
+// Group C: Friend function template explicit specializations
+// ============================================================================
+
+// --- C1: primary inline in suppressed class, explicit spec defined out-of-line ---
+// The explicit specialization is NOT defined inside the class, so it should
+// NOT be suppressed.
+
+struct [[clang::suppress]] C1_Suppressed {
+ template <typename T>
+ friend void c1_suppressed(C1_Suppressed, T) {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+template <>
+void c1_suppressed(C1_Suppressed, Specialized) {
+ // Explicit specialization defined outside the class — not suppressed.
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+void test_C1() {
+ c1_suppressed(C1_Suppressed{}, TypeA{}); // uses primary (suppressed)
+ c1_suppressed(C1_Suppressed{}, Specialized{}); // uses explicit spec (not suppressed)
+}
+
+// ============================================================================
+// Group D: Friend classes (declared, not defined inline — C++ forbids
+// defining a type in a friend declaration)
+// ============================================================================
+
+// --- D1: friend class only declared, defined outside ---
+
+struct [[clang::suppress]] D1_Suppressed {
+ friend struct D1_FriendOuter;
+};
+struct D1_FriendOuter {
+ void method() {
+ // Defined outside the suppressed class — not suppressed.
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+void test_D1() {
+ D1_FriendOuter{}.method();
+}
+
+// ============================================================================
+// Group E: Edge cases
+// ============================================================================
+
+// --- E1: friend function in suppressed CLASS TEMPLATE (not just suppressed class) ---
+
+template <typename U>
+struct [[clang::suppress]] E1_SuppressedTmpl {
+ friend void e1_friend(E1_SuppressedTmpl) {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+void test_E1() {
+ e1_friend(E1_SuppressedTmpl<TypeA>{});
+}
+
+// --- E2: friend function template in a nested suppressed class ---
+// The friend needs a parameter of the nested class type for ADL lookup.
+
+struct Outer_E2 {
+ struct [[clang::suppress]] Inner_E2 {
+ template <typename T>
+ friend void e2_inner_friend(Inner_E2, T) {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ };
+};
+void test_E2() {
+ e2_inner_friend(Outer_E2::Inner_E2{}, TypeA{});
+}
+
+// --- E3: multiple redeclarations at namespace scope before friend decl ---
+
+template <typename T> void e3_multi_redecl(T);
+template <typename T> void e3_multi_redecl(T);
+template <typename T> void e3_multi_redecl(T);
+struct [[clang::suppress]] E3_Suppressed {
+ template <typename T>
+ friend void e3_multi_redecl(T) {
+ // FIXME: This should be suppressed.
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+void test_E3() {
+ e3_multi_redecl(TypeA{});
+}
+
+// --- E4: friend in anonymous namespace ---
+
+namespace {
+struct [[clang::suppress]] E4_AnonSuppressed {
+ friend void e4_anon_friend(E4_AnonSuppressed) {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+} // namespace
+void test_E4() {
+ e4_anon_friend(E4_AnonSuppressed{});
+}
+
+// --- E5: suppression on the friend declaration itself, not on the class ---
+// Friend functions need a parameter for ADL visibility.
+
+struct E5_ClassNotSuppressed {
+ [[clang::suppress]] friend void e5_suppressed(E5_ClassNotSuppressed) {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ friend void e5_unsuppressed(E5_ClassNotSuppressed) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+void test_E5() {
+ e5_suppressed(E5_ClassNotSuppressed{});
+ e5_unsuppressed(E5_ClassNotSuppressed{});
+}
+
+// --- E6: friend function template in suppressed class template, with fwd-decl ---
+// Combines class template + function template + fwd-decl.
+
+template <typename T> void e6_combined(T);
+template <typename U>
+struct [[clang::suppress]] E6_SuppressedTmpl {
+ template <typename T>
+ friend void e6_combined(T) {
+ // FIXME: This should be suppressed.
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+void test_E6() {
+ // Instantiate the class template to make the friend visible.
+ E6_SuppressedTmpl<TypeA> e6; // This line is IMPORTANT!
+ (void)e6;
+ e6_combined(TypeA{});
+}
+
+// --- E7: friend function template instantiated with multiple different types ---
+// Ensure suppression applies to ALL instantiations, not just one.
+
+struct [[clang::suppress]] E7_Suppressed {
+ template <typename T>
+ friend void e7_multi_inst(E7_Suppressed, T) {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+void test_E7() {
+ e7_multi_inst(E7_Suppressed{}, TypeA{});
+ e7_multi_inst(E7_Suppressed{}, TypeB{});
+}
+
+// --- E8: friend function template with fwd-decl, instantiated with multiple types ---
+
+template <typename T> void e8_fwd_multi(T);
+struct [[clang::suppress]] E8_Suppressed {
+ template <typename T>
+ friend void e8_fwd_multi(T) {
+ // FIXME: This should be suppressed.
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+void test_E8() {
+ e8_fwd_multi(TypeA{});
+ e8_fwd_multi(TypeB{});
+}
diff --git a/clang/test/Analysis/clang-suppress/function-templates.cpp b/clang/test/Analysis/clang-suppress/function-templates.cpp
new file mode 100644
index 0000000000000..dda7700fe7ae1
--- /dev/null
+++ b/clang/test/Analysis/clang-suppress/function-templates.cpp
@@ -0,0 +1,93 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify %s
+
+void clang_analyzer_warnIfReached();
+
+// Systematic tests for [[clang::suppress]] on function templates and their
+// explicit specializations.
+
+// Placeholder types for triggering instantiations.
+// - Type should match an unconstrained template type parameter.
+// - Specialized should match a specialization pattern.
+struct Type{};
+struct Specialized{};
+
+// ============================================================================
+// Group A: Attribute on forward declaration only — NOT honored at definition
+// ============================================================================
+
+template <typename T> [[clang::suppress]] void FunctionTemplateSuppressed(T);
+template <typename T>
+void FunctionTemplateSuppressed(T) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+template <typename T>
+void FunctionTemplateUnsuppressed(T) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+void test_fwd_decl_attr() {
+ FunctionTemplateSuppressed(Type{});
+ FunctionTemplateUnsuppressed(Type{});
+}
+
+// ============================================================================
+// Group B: Explicit full function specialization — attribute on specialization
+// ============================================================================
+
+// Only the Specialized specialization is suppressed.
+template <typename T>
+void ExplicitSpecAttrOnSpec(T) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+template <>
+[[clang::suppress]] void ExplicitSpecAttrOnSpec(Specialized) {
+ clang_analyzer_warnIfReached(); // no-warning
+}
+
+void test_attr_on_spec() {
+ ExplicitSpecAttrOnSpec(Type{}); // warns (primary)
+ ExplicitSpecAttrOnSpec(Specialized{}); // suppressed (explicit specialization)
+}
+
+// ============================================================================
+// Group C: Explicit full function specialization — attribute on primary
+// ============================================================================
+
+// Only the primary template is suppressed.
+template <typename T>
+[[clang::suppress]] void ExplicitSpecAttrOnPrimary(T) {
+ clang_analyzer_warnIfReached(); // no-warning
+}
+
+template <>
+void ExplicitSpecAttrOnPrimary(Specialized) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+void test_attr_on_primary() {
+ ExplicitSpecAttrOnPrimary(Type{}); // suppressed (primary)
+ ExplicitSpecAttrOnPrimary(Specialized{}); // warns (explicit specialization)
+}
+
+// ============================================================================
+// Group D: Variadic template with suppress + explicit specialization override
+// ============================================================================
+
+template <typename... Args>
+[[clang::suppress]] void Variadic_Suppressed(Args...) {
+ clang_analyzer_warnIfReached(); // no-warning
+}
+
+// Variadic template function specialization — NOT suppressed.
+template <>
+void Variadic_Suppressed(Type, Specialized) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+void test_variadic() {
+ Variadic_Suppressed();
+ Variadic_Suppressed(Type{});
+ Variadic_Suppressed(Type{}, Specialized{});
+}
diff --git a/clang/test/Analysis/clang-suppress/lambdas.cpp b/clang/test/Analysis/clang-suppress/lambdas.cpp
new file mode 100644
index 0000000000000..864069f957487
--- /dev/null
+++ b/clang/test/Analysis/clang-suppress/lambdas.cpp
@@ -0,0 +1,238 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify %s
+
+void clang_analyzer_warnIfReached();
+
+// Systematic tests for [[clang::suppress]] interaction with lambdas.
+
+// Placeholder type for triggering instantiations.
+struct Type{};
+
+// ============================================================================
+// Group A: Lambda in suppressed statement block
+// ============================================================================
+
+void lambda_in_suppressed_block() {
+ [[clang::suppress]] {
+ auto lam = []() {
+ clang_analyzer_warnIfReached(); // no-warning
+ };
+ lam();
+ }
+}
+
+void lambda_in_unsuppressed_block() {
+ auto lam = []() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ };
+ lam();
+}
+
+// ============================================================================
+// Group B: Lambda in suppressed class method
+// ============================================================================
+
+struct [[clang::suppress]] B_SuppressedClass {
+ void method_with_lambda() {
+ auto lam = []() {
+ clang_analyzer_warnIfReached(); // no-warning
+ };
+ lam();
+ }
+};
+
+struct B_UnsuppressedClass {
+ void method_with_lambda() {
+ auto lam = []() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ };
+ lam();
+ }
+};
+
+void test_B() {
+ B_SuppressedClass().method_with_lambda();
+ B_UnsuppressedClass().method_with_lambda();
+}
+
+// ============================================================================
+// Group C: Nested lambdas
+// ============================================================================
+
+void nested_lambda_suppressed() {
+ [[clang::suppress]] {
+ auto outer = []() {
+ auto inner = []() {
+ clang_analyzer_warnIfReached(); // no-warning
+ };
+ return inner();
+ };
+ return outer(); // no-warning
+ }
+}
+
+void nested_lambda_unsuppressed() {
+ auto outer = []() {
+ auto inner = []() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ };
+ inner();
+ };
+ outer();
+}
+
+// ============================================================================
+// Group D: Lambda with captures
+// ============================================================================
+
+int lambda_with_ref_capture_suppressed() {
+ int *x = 0;
+ [[clang::suppress]] {
+ auto lam = [&x]() {
+ return *x;
+ };
+ return lam(); // no-warning
+ }
+}
+
+int lambda_with_ref_capture_unsuppressed() {
+ int *x = 0;
+ auto lam = [&x]() {
+ return *x; // expected-warning{{Dereference of null pointer (loaded from variable 'x')}}
+ };
+ return lam();
+}
+
+int lambda_capture_by_value_suppressed() {
+ int *x = 0;
+ [[clang::suppress]] {
+ auto lam = [x]() {
+ return *x;
+ };
+ return lam(); // no-warning
+ }
+}
+
+// ============================================================================
+// Group E: Lambda in suppressed namespace
+// ============================================================================
+
+namespace [[clang::suppress]] SuppressedNS {
+ void func_with_lambda() {
+ auto lam = []() {
+ clang_analyzer_warnIfReached(); // no-warning
+ };
+ lam();
+ }
+} // namespace SuppressedNS
+
+// ============================================================================
+// Group F: Suppressed lambda in an unsuppressed enclosing function
+// ============================================================================
+
+void selective_suppression_unsup() {
+ auto unsuppressed_lam = []() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ };
+ unsuppressed_lam();
+}
+
+void selective_suppression_sup() {
+ [[clang::suppress]] auto suppressed_lam = []() {
+ clang_analyzer_warnIfReached(); // no-warning
+ };
+ suppressed_lam();
+}
+
+// ============================================================================
+// Group G: Lambda in template function
+// ============================================================================
+
+template <typename T>
+[[clang::suppress]] void tmpl_func_with_lambda(T) {
+ auto lam = []() {
+ clang_analyzer_warnIfReached(); // no-warning
+ };
+ lam();
+}
+
+template <typename T>
+void tmpl_func_with_lambda_unsuppressed(T) {
+ auto lam = []() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ };
+ lam();
+}
+
+void test_G() {
+ tmpl_func_with_lambda(Type{});
+ tmpl_func_with_lambda_unsuppressed(Type{});
+}
+
+// ============================================================================
+// Group H: Lambda in suppressed class template
+// ============================================================================
+
+template <typename T>
+struct [[clang::suppress]] H_SuppressedTmpl {
+ void method() {
+ auto lam = []() {
+ clang_analyzer_warnIfReached(); // no-warning
+ };
+ lam();
+ }
+};
+
+template <typename T>
+struct H_UnsuppressedTmpl {
+ void method() {
+ auto lam = []() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ };
+ lam();
+ }
+};
+
+void test_H() {
+ H_SuppressedTmpl<Type>().method();
+ H_UnsuppressedTmpl<Type>().method();
+}
+
+// ============================================================================
+// Group I: Immediately-invoked lambda expression
+// ============================================================================
+
+int iile_suppressed() {
+ [[clang::suppress]] {
+ return []() {
+ int *x = 0;
+ return *x;
+ }(); // no-warning
+ }
+}
+
+int iile_unsuppressed() {
+ return []() {
+ int *x = 0;
+ return *x; // expected-warning{{Dereference of null pointer (loaded from variable 'x')}}
+ }();
+}
+
+// ============================================================================
+// Group J: Generic lambda (C++14)
+// ============================================================================
+
+void generic_lambda_suppressed() {
+ [[clang::suppress]] {
+ auto lam = [](auto) {
+ clang_analyzer_warnIfReached(); // no-warning
+ };
+ lam(Type{});
+ }
+}
+
+void generic_lambda_unsuppressed() {
+ auto lam = [](auto) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ };
+ lam(Type{});
+}
diff --git a/clang/test/Analysis/clang-suppress/macros.cpp b/clang/test/Analysis/clang-suppress/macros.cpp
new file mode 100644
index 0000000000000..7458484a33b0a
--- /dev/null
+++ b/clang/test/Analysis/clang-suppress/macros.cpp
@@ -0,0 +1,186 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify %s
+
+void clang_analyzer_warnIfReached();
+
+// Systematic tests for [[clang::suppress]] interaction with macros.
+//
+// The fullyContains() function compares source ranges using
+// SourceManager::isBeforeInTranslationUnit, which handles macro
+// expansion locations. These tests verify that suppression works
+// correctly when bugs are reported inside macro expansions.
+
+// Placeholder type for triggering instantiations.
+struct Type{};
+
+// ============================================================================
+// Group A: Bug inside macro, suppression outside (using warnIfReached)
+// ============================================================================
+
+#define WARN clang_analyzer_warnIfReached()
+
+void macro_in_suppressed_block() {
+ [[clang::suppress]] {
+ WARN; // no-warning
+ }
+}
+
+void macro_in_unsuppressed_block() {
+ WARN; // expected-warning{{REACHABLE}}
+}
+
+// ============================================================================
+// Group B: Function-like macro with expression
+// ============================================================================
+
+#define DO_WARN() clang_analyzer_warnIfReached()
+
+void funclike_macro_suppressed() {
+ [[clang::suppress]] {
+ DO_WARN(); // no-warning
+ }
+}
+
+void funclike_macro_unsuppressed() {
+ DO_WARN(); // expected-warning{{REACHABLE}}
+}
+
+// ============================================================================
+// Group C: Nested macros
+// ============================================================================
+
+#define INNER_WARN() clang_analyzer_warnIfReached()
+#define OUTER_WARN() INNER_WARN()
+
+void nested_macro_suppressed() {
+ [[clang::suppress]] {
+ OUTER_WARN(); // no-warning
+ }
+}
+
+void nested_macro_unsuppressed() {
+ OUTER_WARN(); // expected-warning{{REACHABLE}}
+}
+
+// ============================================================================
+// Group D: Macro defining entire function body
+// ============================================================================
+
+#define BUGGY_BODY { clang_analyzer_warnIfReached(); }
+
+[[clang::suppress]] void func_with_macro_body()
+ BUGGY_BODY // no-warning
+
+void func_with_macro_body_unsuppressed()
+ BUGGY_BODY // expected-warning{{REACHABLE}}
+
+// ============================================================================
+// Group E: Macro in suppressed class method
+// ============================================================================
+
+struct [[clang::suppress]] MacroInSuppressedClass {
+ void method() {
+ WARN; // no-warning
+ }
+};
+
+struct MacroInUnsuppressedClass {
+ void method() {
+ WARN; // expected-warning{{REACHABLE}}
+ }
+};
+
+void test_E() {
+ MacroInSuppressedClass().method();
+ MacroInUnsuppressedClass().method();
+}
+
+// ============================================================================
+// Group F: Macro expanding to suppression attribute + code
+// ============================================================================
+
+#define SUPPRESS_AND_WARN [[clang::suppress]] clang_analyzer_warnIfReached()
+
+void macro_suppression_wrapper() {
+ SUPPRESS_AND_WARN; // no-warning
+}
+
+// ============================================================================
+// Group G: Macro in template context
+// ============================================================================
+
+template <typename T>
+struct [[clang::suppress]] MacroInTemplate {
+ void method() {
+ WARN; // no-warning
+ }
+};
+
+template <typename T>
+struct MacroInTemplate_NoAttr {
+ void method() {
+ WARN; // expected-warning{{REACHABLE}}
+ }
+};
+
+void test_G() {
+ MacroInTemplate<Type>().method();
+ MacroInTemplate_NoAttr<Type>().method();
+}
+
+// ============================================================================
+// Group H: Null dereference of a directly-assigned null pointer, suppressed at statement level
+// ============================================================================
+
+int macro_deref_suppressed() {
+ int *p = 0;
+ [[clang::suppress]] return *p; // no-warning
+}
+
+int macro_deref_unsuppressed() {
+ int *p = 0;
+ return *p; // expected-warning{{Dereference of null pointer (loaded from variable 'p')}}
+}
+
+// ============================================================================
+// Group I: Stringification and token pasting (shouldn't affect suppression)
+// ============================================================================
+
+#define STRINGIFY(x) #x
+#define CONCAT(a, b) a##b
+
+void stringify_suppressed() {
+ [[clang::suppress]] {
+ const char *s = STRINGIFY(hello);
+ (void)s;
+ int CONCAT(var, 1) = 0;
+ clang_analyzer_warnIfReached(); // no-warning
+ (void)var1;
+ }
+}
+
+void stringify_unsuppressed() {
+ const char *s = STRINGIFY(hello);
+ (void)s;
+ int CONCAT(var, 1) = 0;
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ (void)var1;
+}
+
+// ============================================================================
+// Group J: Multi-line macro with warnIfReached
+// ============================================================================
+
+#define MULTI_LINE_WARN \
+ do { \
+ clang_analyzer_warnIfReached(); \
+ } while (0)
+
+void multiline_macro_suppressed() {
+ [[clang::suppress]] {
+ MULTI_LINE_WARN; // no-warning
+ }
+}
+
+void multiline_macro_unsuppressed() {
+ MULTI_LINE_WARN; // expected-warning{{REACHABLE}}
+}
diff --git a/clang/test/Analysis/clang-suppress/namespaces.cpp b/clang/test/Analysis/clang-suppress/namespaces.cpp
new file mode 100644
index 0000000000000..aa1d270ad405a
--- /dev/null
+++ b/clang/test/Analysis/clang-suppress/namespaces.cpp
@@ -0,0 +1,35 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core -verify %s
+
+// Systematic tests for [[clang::suppress]] on namespaces.
+
+// ============================================================================
+// Group A: Attributed namespace suppresses inline definitions
+// ============================================================================
+
+namespace [[clang::suppress]]
+suppressed_namespace {
+ int foo() {
+ int *x = 0;
+ return *x; // no-warning: inside attributed namespace
+ }
+
+ int ool_foo();
+}
+
+// Out-of-line definition in an attributed namespace is NOT suppressed.
+int suppressed_namespace::ool_foo() {
+ int *x = 0;
+ return *x; // expected-warning{{Dereference of null pointer (loaded from variable 'x')}}
+}
+
+// ============================================================================
+// Group B: Reopened namespace (without attribute) is NOT suppressed
+// ============================================================================
+
+// Another instance of the same namespace — the attribute does not carry over.
+namespace suppressed_namespace {
+ int bar() {
+ int *x = 0;
+ return *x; // expected-warning{{Dereference of null pointer (loaded from variable 'x')}}
+ }
+}
diff --git a/clang/test/Analysis/clang-suppress/nested-templates.cpp b/clang/test/Analysis/clang-suppress/nested-templates.cpp
new file mode 100644
index 0000000000000..0b5d1d5d98878
--- /dev/null
+++ b/clang/test/Analysis/clang-suppress/nested-templates.cpp
@@ -0,0 +1,340 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify %s
+
+void clang_analyzer_warnIfReached();
+
+// Placeholder types for triggering instantiations.
+// - Type{A,B,C,D} should match an unconstrained template type parameter.
+struct TypeA{};
+struct TypeB{};
+struct TypeC{};
+struct TypeD{};
+
+// ============================================================================
+// Group A: 2-level nesting — attribute on outer
+// ============================================================================
+
+template <typename A>
+struct [[clang::suppress]] TwoLevel_AttrOuter {
+ template <typename B>
+ struct Inner {
+ void inline_method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ void outline_method();
+ };
+ void outer_inline() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ void outer_outline();
+};
+
+template <typename A>
+template <typename B>
+void TwoLevel_AttrOuter<A>::Inner<B>::outline_method() {
+ // Out-of-line: lexical context is namespace, not the class.
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+template <typename A>
+void TwoLevel_AttrOuter<A>::outer_outline() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+void test_two_level_outer() {
+ TwoLevel_AttrOuter<TypeA>::Inner<TypeB>().inline_method();
+ TwoLevel_AttrOuter<TypeA>::Inner<TypeB>().outline_method();
+ TwoLevel_AttrOuter<TypeA>().outer_inline();
+ TwoLevel_AttrOuter<TypeA>().outer_outline();
+}
+
+// ============================================================================
+// Group B: 2-level nesting — attribute on inner
+// ============================================================================
+
+template <typename A>
+struct TwoLevel_AttrInner {
+ template <typename B>
+ struct [[clang::suppress]] Inner {
+ void inline_method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ void outline_method();
+ };
+ void outer_inline() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+template <typename A>
+template <typename B>
+void TwoLevel_AttrInner<A>::Inner<B>::outline_method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+void test_two_level_inner() {
+ TwoLevel_AttrInner<TypeA>::Inner<TypeB>().inline_method();
+ TwoLevel_AttrInner<TypeA>::Inner<TypeB>().outline_method();
+ TwoLevel_AttrInner<TypeA>().outer_inline();
+}
+
+// ============================================================================
+// Group C: 3-level nesting — attribute at each level
+// ============================================================================
+
+// --- C1: attribute on outermost ---
+
+template <typename A>
+struct [[clang::suppress]] ThreeLevel_AttrOuter {
+ template <typename B>
+ struct Mid {
+ template <typename C>
+ struct Inner {
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ };
+ void mid_method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ };
+};
+
+void test_three_level_outer() {
+ ThreeLevel_AttrOuter<TypeA>::Mid<TypeB>::Inner<TypeC>().method();
+ ThreeLevel_AttrOuter<TypeA>::Mid<TypeB>().mid_method();
+}
+
+// --- C2: attribute on middle ---
+
+template <typename A>
+struct ThreeLevel_AttrMid {
+ template <typename B>
+ struct [[clang::suppress]] Mid {
+ template <typename C>
+ struct Inner {
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ };
+ void mid_method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ };
+ void outer_method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+void test_three_level_mid() {
+ ThreeLevel_AttrMid<TypeA>::Mid<TypeB>::Inner<TypeC>().method();
+ ThreeLevel_AttrMid<TypeA>::Mid<TypeB>().mid_method();
+ ThreeLevel_AttrMid<TypeA>().outer_method();
+}
+
+// --- C3: attribute on innermost ---
+
+template <typename A>
+struct ThreeLevel_AttrInner {
+ template <typename B>
+ struct Mid {
+ template <typename C>
+ struct [[clang::suppress]] Inner {
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ };
+ void mid_method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+ };
+ void outer_method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+void test_three_level_inner() {
+ ThreeLevel_AttrInner<TypeA>::Mid<TypeB>::Inner<TypeC>().method();
+ ThreeLevel_AttrInner<TypeA>::Mid<TypeB>().mid_method();
+ ThreeLevel_AttrInner<TypeA>().outer_method();
+}
+
+// --- C4: no attribute at any level (negative test) ---
+
+template <typename A>
+struct ThreeLevel_NoAttr {
+ template <typename B>
+ struct Mid {
+ template <typename C>
+ struct Inner {
+ void method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+ };
+ void mid_method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+ };
+ void outer_method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+void test_three_level_none() {
+ ThreeLevel_NoAttr<TypeA>::Mid<TypeB>::Inner<TypeC>().method();
+ ThreeLevel_NoAttr<TypeA>::Mid<TypeB>().mid_method();
+ ThreeLevel_NoAttr<TypeA>().outer_method();
+}
+
+// ============================================================================
+// Group D: Mixed template / non-template nesting
+// ============================================================================
+
+// --- D1: non-template outer, template inner ---
+
+struct [[clang::suppress]] NonTmplOuter_TmplInner {
+ template <typename T>
+ struct Inner {
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ };
+};
+
+struct NonTmplOuter_TmplInner_NoAttr {
+ template <typename T>
+ struct Inner {
+ void method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+ };
+};
+
+void test_mixed_nontmpl_outer() {
+ NonTmplOuter_TmplInner::Inner<TypeA>().method();
+ NonTmplOuter_TmplInner_NoAttr::Inner<TypeA>().method();
+}
+
+// --- D2: template outer, non-template inner ---
+
+template <typename T>
+struct [[clang::suppress]] TmplOuter_NonTmplInner {
+ struct Inner {
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ };
+};
+
+template <typename T>
+struct TmplOuter_NonTmplInner_NoAttr {
+ struct Inner {
+ void method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+ };
+};
+
+void test_mixed_tmpl_outer() {
+ TmplOuter_NonTmplInner<TypeA>::Inner().method();
+ TmplOuter_NonTmplInner_NoAttr<TypeA>::Inner().method();
+}
+
+// ============================================================================
+// Group E: Multiple instantiations of the same nested template
+// ============================================================================
+
+// Ensure suppression applies across different instantiation parameters.
+
+template <typename A>
+struct [[clang::suppress]] MultiInst {
+ template <typename B>
+ struct Inner {
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ };
+};
+
+void test_multi_inst() {
+ MultiInst<TypeA>::Inner<TypeA>().method();
+ MultiInst<TypeA>::Inner<TypeB>().method();
+ MultiInst<TypeB>::Inner<TypeA>().method();
+ MultiInst<TypeC>::Inner<TypeD>().method();
+}
+
+// ============================================================================
+// Group F: Nested template with methods that have their own template params
+// ============================================================================
+
+template <typename A>
+struct [[clang::suppress]] NestedWithTemplateMethods {
+ template <typename B>
+ struct Inner {
+ template <typename C>
+ void tmpl_method(C) {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ };
+};
+
+template <typename A>
+struct NestedWithTemplateMethods_NoAttr {
+ template <typename B>
+ struct Inner {
+ template <typename C>
+ void tmpl_method(C) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+ };
+};
+
+void test_nested_tmpl_methods() {
+ NestedWithTemplateMethods<TypeA>::Inner<TypeB>().tmpl_method(TypeC{});
+ NestedWithTemplateMethods<TypeA>::Inner<TypeB>().tmpl_method(TypeD{});
+ NestedWithTemplateMethods_NoAttr<TypeA>::Inner<TypeB>().tmpl_method(TypeC{});
+}
+
+// ============================================================================
+// Group G: Attribute on both outer and inner (redundant but should work)
+// ============================================================================
+
+template <typename A>
+struct [[clang::suppress]] BothSuppressed {
+ template <typename B>
+ struct [[clang::suppress]] Inner {
+ void method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ };
+ void outer_method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+
+void test_both_suppressed() {
+ BothSuppressed<TypeA>::Inner<TypeB>().method();
+ BothSuppressed<TypeA>().outer_method();
+}
+
+// ============================================================================
+// Regression test for gh#182659
+// ============================================================================
+
+// Nested template structs where the inner method accesses a member of a
+// different specialization — verifies that the suppression mechanism does not
+// accidentally suppress legitimate warnings when walking instantiation chains.
+
+template <class> struct gh_182659_s1 {
+ template <class> struct gh_182659_s2 {
+ int i;
+ template <class T> int m(const gh_182659_s2<T>& s2) {
+ return s2.i; // expected-warning{{Undefined or garbage value returned to caller}}
+ }
+ };
+};
+
+void gh_182659() {
+ gh_182659_s1<TypeA>::gh_182659_s2<TypeA> s1;
+ gh_182659_s1<TypeA>::gh_182659_s2<TypeB> s2;
+ s1.m(s2);
+}
diff --git a/clang/test/Analysis/clang-suppress/statements.cpp b/clang/test/Analysis/clang-suppress/statements.cpp
new file mode 100644
index 0000000000000..be3e1fd0f1832
--- /dev/null
+++ b/clang/test/Analysis/clang-suppress/statements.cpp
@@ -0,0 +1,158 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify %s
+
+void clang_analyzer_warnIfReached();
+
+// ============================================================================
+// Group A: Compound statement (block)
+// ============================================================================
+
+void suppress_compound_suppressed() {
+ [[clang::suppress]] {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+}
+
+void suppress_compound_unsuppressed() {
+ {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+}
+
+// ============================================================================
+// Group B: If statement
+// ============================================================================
+
+void suppress_if(bool coin) {
+ [[clang::suppress]] if (coin) {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+void suppress_if_else(bool coin) {
+ [[clang::suppress]] if (coin) {
+ clang_analyzer_warnIfReached(); // no-warning
+ } else {
+ clang_analyzer_warnIfReached(); // no-warning: entire if-else is suppressed
+ }
+}
+
+void unsuppressed_if_else(bool coin) {
+ if (coin) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ } else {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+}
+
+// ============================================================================
+// Group C: Loop statements
+// ============================================================================
+
+void suppress_for(int n) {
+ [[clang::suppress]] for (int i = 0; i < n; ++i) {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+}
+
+void suppress_while(int n) {
+ [[clang::suppress]] while (--n) {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+}
+
+void suppress_do_while(int n) {
+ [[clang::suppress]] do {
+ clang_analyzer_warnIfReached(); // no-warning
+ } while (--n);
+}
+
+void unsuppressed_for(int n) {
+ for (int i = 0; i < n; ++i) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+}
+
+void suppress_range_for() {
+ int arr[] = {1, 2, 3};
+ [[clang::suppress]] for (int x : arr) {
+ clang_analyzer_warnIfReached(); // no-warning
+ (void)x;
+ }
+}
+
+// ============================================================================
+// Group D: Switch statement
+// ============================================================================
+
+int suppress_switch(int n) {
+ [[clang::suppress]] switch (n) {
+ case 1:
+ return clang_analyzer_warnIfReached(), 1; // no-warning
+ default:
+ break;
+ }
+ return 0;
+}
+
+int unsuppressed_switch(int n) {
+ switch (n) {
+ case 1:
+ return clang_analyzer_warnIfReached(), 1; // expected-warning{{REACHABLE}}
+ default:
+ break;
+ }
+ return 0;
+}
+
+// ============================================================================
+// Group E: Return statement
+// ============================================================================
+
+int suppress_return() {
+ [[clang::suppress]] return clang_analyzer_warnIfReached(), 1; // no-warning
+}
+
+int unsuppressed_return() {
+ return clang_analyzer_warnIfReached(), 1; // expected-warning{{REACHABLE}}
+}
+
+// ============================================================================
+// Group F: Expression statement
+// ============================================================================
+
+void suppress_expr_stmt() {
+ [[clang::suppress]] clang_analyzer_warnIfReached(); // no-warning
+}
+
+void unsuppressed_expr_stmt() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+// ============================================================================
+// Group G: Nested suppressed blocks
+// ============================================================================
+
+void nested_suppression() {
+ [[clang::suppress]] {
+ [[clang::suppress]] {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+ }
+}
+
+// ============================================================================
+// Group H: Suppression on single statement within method
+// ============================================================================
+
+struct H_ClassWithMethods {
+ void method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ [[clang::suppress]] clang_analyzer_warnIfReached(); // no-warning
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+void test_H() {
+ H_ClassWithMethods().method();
+}
diff --git a/clang/test/Analysis/clang-suppress/template-methods.cpp b/clang/test/Analysis/clang-suppress/template-methods.cpp
new file mode 100644
index 0000000000000..3e013b11ab928
--- /dev/null
+++ b/clang/test/Analysis/clang-suppress/template-methods.cpp
@@ -0,0 +1,138 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify %s
+
+void clang_analyzer_warnIfReached();
+
+// Systematic tests for [[clang::suppress]] on template methods inside
+// non-template and template classes.
+
+// Placeholder types for triggering instantiations.
+// - Type{A,B} should match an unconstrained template type parameter.
+struct TypeA{};
+struct TypeB{};
+
+// ============================================================================
+// Group A: Non-template class with suppressed/unsuppressed template methods
+// ============================================================================
+
+struct NonTemplateClassWithTemplatedMethod {
+ template <typename T>
+ [[clang::suppress]] void suppressed(T) {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+
+ template <typename T>
+ void unsuppressed(T) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+void test_nontpl_class() {
+ NonTemplateClassWithTemplatedMethod().suppressed(TypeA{});
+ NonTemplateClassWithTemplatedMethod().unsuppressed(TypeA{});
+}
+
+// ============================================================================
+// Group B: Template class with template methods — inline
+// ============================================================================
+
+template <typename T>
+struct TemplateClassWithTemplateInlineMethod {
+ template <typename U>
+ [[clang::suppress]] void suppressed(U) {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+
+ template <typename U>
+ void unsuppressed(U) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+void test_tpl_class_tpl_inline_method() {
+ TemplateClassWithTemplateInlineMethod<TypeA>().suppressed(TypeB{});
+ TemplateClassWithTemplateInlineMethod<TypeA>().unsuppressed(TypeB{});
+}
+
+// ============================================================================
+// Group C: Template class with template methods — out-of-line
+// ============================================================================
+
+template <typename T>
+struct TemplateClassWithTemplateOOLMethod {
+ template <typename U>
+ [[clang::suppress]] void suppress_at_decl_outline(U);
+
+ template <typename U>
+ void suppress_at_def_outline(U);
+};
+
+// Attribute on declaration only — NOT honored at out-of-line definition.
+template <typename T>
+template <typename U>
+void TemplateClassWithTemplateOOLMethod<T>::suppress_at_decl_outline(U) {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+}
+
+// Attribute on out-of-line definition — suppressed.
+template <typename T>
+template <typename U>
+[[clang::suppress]] void TemplateClassWithTemplateOOLMethod<T>::suppress_at_def_outline(U) {
+ clang_analyzer_warnIfReached(); // no-warning
+}
+
+void test_tpl_class_tpl_ool_method() {
+ TemplateClassWithTemplateOOLMethod<TypeA>().suppress_at_decl_outline(TypeB{});
+ TemplateClassWithTemplateOOLMethod<TypeA>().suppress_at_def_outline(TypeB{});
+}
+
+// ============================================================================
+// Group D: Template-template parameters
+// ============================================================================
+
+// A simple "box" template used as a template-template argument.
+template <typename T>
+struct Box {
+ void get() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+// A version of Box that suppresses its own methods.
+template <typename T>
+class [[clang::suppress]] SuppressedBox {
+public:
+ void get() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+
+// Adaptor whose own methods are suppressed; the contained Box's methods are not.
+template <typename T, template <typename> class Container>
+class [[clang::suppress]] SuppressedAdaptor {
+public:
+ Container<T> data;
+
+ void adaptor_method() {
+ clang_analyzer_warnIfReached(); // no-warning
+ }
+};
+
+// Adaptor with no suppression; the container's own suppression is independent.
+template <typename T, template <typename> class Container>
+struct UnsuppressedAdaptor {
+ Container<T> data;
+
+ void adaptor_method() {
+ clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}}
+ }
+};
+
+void test_template_template() {
+ // SuppressedAdaptor<Box>: adaptor method suppressed; Box::get not affected.
+ SuppressedAdaptor<TypeA, Box>().adaptor_method(); // suppressed by adaptor's attr
+ SuppressedAdaptor<TypeA, Box>().data.get(); // warns — Box has no attr, different lexical context
+
+ // UnsuppressedAdaptor<SuppressedBox>: adaptor warns; SuppressedBox::get suppressed.
+ UnsuppressedAdaptor<TypeA, SuppressedBox>().adaptor_method(); // warns — adaptor has no attr
+ UnsuppressedAdaptor<TypeA, SuppressedBox>().data.get(); // suppressed by SuppressedBox's attr
+}
diff --git a/clang/test/Analysis/suppression-attr.cpp b/clang/test/Analysis/suppression-attr.cpp
deleted file mode 100644
index 9ba56d976fddb..0000000000000
--- a/clang/test/Analysis/suppression-attr.cpp
+++ /dev/null
@@ -1,91 +0,0 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -verify %s
-
-void clang_analyzer_warnIfReached();
-
-struct Clazz {
- template <typename T>
- static void templated_memfn();
-};
-
-// This must come before the 'templated_memfn' is defined!
-static void instantiate() {
- Clazz::templated_memfn<int>();
-}
-
-template <typename T>
-void Clazz::templated_memfn() {
- // When we report a bug in a function, we traverse the lexical decl context
- // of it while looking for suppression attributes to record what source
- // ranges should the suppression apply to.
- // In the past, that traversal didn't follow template instantiations, only
- // primary templates.
- [[clang::suppress]] clang_analyzer_warnIfReached(); // no-warning
-
-}
-
-namespace [[clang::suppress]]
-suppressed_namespace {
- int foo() {
- int *x = 0;
- return *x;
- }
-
- int foo_forward();
-}
-
-int suppressed_namespace::foo_forward() {
- int *x = 0;
- return *x; // expected-warning{{Dereference of null pointer (loaded from variable 'x')}}
-}
-
-// Another instance of the same namespace.
-namespace suppressed_namespace {
- int bar() {
- int *x = 0;
- return *x; // expected-warning{{Dereference of null pointer (loaded from variable 'x')}}
- }
-}
-
-void lambda() {
- [[clang::suppress]] {
- auto lam = []() {
- int *x = 0;
- return *x;
- };
- }
-}
-
-class [[clang::suppress]] SuppressedClass {
- int foo() {
- int *x = 0;
- return *x;
- }
-
- int bar();
-};
-
-int SuppressedClass::bar() {
- int *x = 0;
- return *x; // expected-warning{{Dereference of null pointer (loaded from variable 'x')}}
-}
-
-class SuppressedMethodClass {
- [[clang::suppress]] int foo() {
- int *x = 0;
- return *x;
- }
-
- [[clang::suppress]] int bar1();
- int bar2();
-};
-
-int SuppressedMethodClass::bar1() {
- int *x = 0;
- return *x; // expected-warning{{Dereference of null pointer (loaded from variable 'x')}}
-}
-
-[[clang::suppress]]
-int SuppressedMethodClass::bar2() {
- int *x = 0;
- return *x; // no-warning
-}
diff --git a/clang/test/C/C2y/n3517.c b/clang/test/C/C2y/n3517.c
new file mode 100644
index 0000000000000..bfa5d85bb58e8
--- /dev/null
+++ b/clang/test/C/C2y/n3517.c
@@ -0,0 +1,46 @@
+// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic -Wno-unused %s
+
+/* WG14 N3517: No
+ * Array subscripting without decay
+ *
+ * 1. Unconventional subscripting like 0[a] is made obsolescent.
+ * 2. A negative index when subscripting an array violates constraints.
+ * 3. Subscripting a non-lvalue array member results in a non-lvalue.
+ *
+ * FIXME: Clang doesn't yet implement this paper.
+ */
+
+struct S {
+ int a[1];
+};
+
+struct S get_value();
+
+// FIXME: Should diagnose these.
+void test_constraint_violation() {
+ &(get_value().a[0]);
+ get_value().a[0] = 42;
+
+ struct S s = {{0}};
+ &((0, s).a[0]);
+ (0, s).a[0] = 42;
+
+ int arr[1] = {0}; // expected-note {{declared here}}
+
+ s.a[-1];
+ (0, s).a[-1];
+ get_value().a[-1];
+ arr[-1]; // expected-warning {{before the beginning of the array}}
+}
+
+// FIXME: Should diagnose these.
+void test_deprecation() {
+ int arr[1] = {0};
+ 0[arr];
+
+ int* ptr = arr;
+ 0[ptr];
+
+ struct S s = {{0}};
+ 0[s.a];
+}
diff --git a/clang/test/C/C2y/n3652.c b/clang/test/C/C2y/n3652.c
new file mode 100644
index 0000000000000..70385e0fcd5b2
--- /dev/null
+++ b/clang/test/C/C2y/n3652.c
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic -Wno-unused %s
+
+/* WG14 N3652: No
+ * Composite types, v1.3
+ *
+ * For the conditional operator, constraints involving nullptr_t and pointers
+ * to variably modified types are added.
+ *
+ * FIXME: Clang doesn't yet implement this paper.
+ */
+
+// expected-no-diagnostics
+
+// FIXME: Should diagnose these.
+void test(bool cond, void* p1, void* p2) {
+ int n = 2;
+ auto a = cond ? nullptr : (char(*)[n])p1;
+ auto b = cond ? (char(*)[])p1 : (char(*)[n])p2;
+}
diff --git a/clang/test/C/C2y/n3715.c b/clang/test/C/C2y/n3715.c
new file mode 100644
index 0000000000000..85cdbfa2abfe3
--- /dev/null
+++ b/clang/test/C/C2y/n3715.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic -Wno-unused -Wno-deprecated %s
+
+/* WG14 N3715: No
+ * static_assert expressions
+ *
+ * A successful static_assert is also an expression of type void.
+ *
+ * FIXME: Clang doesn't yet implement this paper.
+ */
+
+// FIXME: Should accept these.
+void test() {
+ _Generic(static_assert(true, ""), void: (void)0); // expected-error {{expected expression}}
+ _Generic(static_assert(true), void: (void)0); // expected-error {{expected expression}}
+ _Generic(_Static_assert(true, ""), void: (void)0); // expected-error {{expected expression}}
+ _Generic(_Static_assert(true), void: (void)0); // expected-error {{expected expression}}
+}
diff --git a/clang/test/CIR/CodeGen/atomic.c b/clang/test/CIR/CodeGen/atomic.c
index 200ebc549ef2b..6476f35009e3e 100644
--- a/clang/test/CIR/CodeGen/atomic.c
+++ b/clang/test/CIR/CodeGen/atomic.c
@@ -266,7 +266,7 @@ void c11_atomic_cmpxchg_strong(_Atomic(int) *ptr, int *expected, int desired, in
__c11_atomic_compare_exchange_strong(ptr, expected, desired,
__ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
// CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg success(seq_cst) failure(acquire) syncscope(system) %{{.+}}, %{{.+}}, %{{.+}} align(4) : (!cir.ptr<!s32i>, !s32i, !s32i) -> (!s32i, !cir.bool)
- // CIR-NEXT: %[[FAILED:.+]] = cir.unary(not, %[[SUCCESS]]) : !cir.bool, !cir.bool
+ // CIR-NEXT: %[[FAILED:.+]] = cir.not %[[SUCCESS]] : !cir.bool
// CIR-NEXT: cir.if %[[FAILED]] {
// CIR-NEXT: cir.store align(4) %[[OLD]], %{{.+}} : !s32i, !cir.ptr<!s32i>
// CIR-NEXT: }
@@ -348,7 +348,7 @@ void c11_atomic_cmpxchg_weak(_Atomic(int) *ptr, int *expected, int desired, int
__c11_atomic_compare_exchange_weak(ptr, expected, desired,
__ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
// CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg weak success(seq_cst) failure(acquire) syncscope(system) %{{.+}}, %{{.+}}, %{{.+}} align(4) : (!cir.ptr<!s32i>, !s32i, !s32i) -> (!s32i, !cir.bool)
- // CIR-NEXT: %[[FAILED:.+]] = cir.unary(not, %[[SUCCESS]]) : !cir.bool, !cir.bool
+ // CIR-NEXT: %[[FAILED:.+]] = cir.not %[[SUCCESS]] : !cir.bool
// CIR-NEXT: cir.if %[[FAILED]] {
// CIR-NEXT: cir.store align(4) %[[OLD]], %{{.+}} : !s32i, !cir.ptr<!s32i>
// CIR-NEXT: }
@@ -429,7 +429,7 @@ void atomic_cmpxchg(int *ptr, int *expected, int *desired, int failure) {
__atomic_compare_exchange(ptr, expected, desired, /*weak=*/0, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
// CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg success(seq_cst) failure(acquire) syncscope(system) %{{.+}}, %{{.+}}, %{{.+}} align(4) : (!cir.ptr<!s32i>, !s32i, !s32i) -> (!s32i, !cir.bool)
- // CIR-NEXT: %[[FAILED:.+]] = cir.unary(not, %[[SUCCESS]]) : !cir.bool, !cir.bool
+ // CIR-NEXT: %[[FAILED:.+]] = cir.not %[[SUCCESS]] : !cir.bool
// CIR-NEXT: cir.if %[[FAILED]] {
// CIR-NEXT: cir.store align(4) %[[OLD]], %{{.+}} : !s32i, !cir.ptr<!s32i>
// CIR-NEXT: }
@@ -460,7 +460,7 @@ void atomic_cmpxchg(int *ptr, int *expected, int *desired, int failure) {
__atomic_compare_exchange(ptr, expected, desired, /*weak=*/1, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
// CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg weak success(seq_cst) failure(acquire) syncscope(system) %{{.+}}, %{{.+}}, %{{.+}} align(4) : (!cir.ptr<!s32i>, !s32i, !s32i) -> (!s32i, !cir.bool)
- // CIR-NEXT: %[[FAILED:.+]] = cir.unary(not, %[[SUCCESS]]) : !cir.bool, !cir.bool
+ // CIR-NEXT: %[[FAILED:.+]] = cir.not %[[SUCCESS]] : !cir.bool
// CIR-NEXT: cir.if %[[FAILED]] {
// CIR-NEXT: cir.store align(4) %[[OLD]], %{{.+}} : !s32i, !cir.ptr<!s32i>
// CIR-NEXT: }
@@ -581,7 +581,7 @@ void atomic_cmpxchg_n(int *ptr, int *expected, int desired, int failure) {
__atomic_compare_exchange_n(ptr, expected, desired, /*weak=*/0, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
// CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg success(seq_cst) failure(acquire) syncscope(system) %{{.+}}, %{{.+}}, %{{.+}} align(4) : (!cir.ptr<!s32i>, !s32i, !s32i) -> (!s32i, !cir.bool)
- // CIR-NEXT: %[[FAILED:.+]] = cir.unary(not, %[[SUCCESS]]) : !cir.bool, !cir.bool
+ // CIR-NEXT: %[[FAILED:.+]] = cir.not %[[SUCCESS]] : !cir.bool
// CIR-NEXT: cir.if %[[FAILED]] {
// CIR-NEXT: cir.store align(4) %[[OLD]], %{{.+}} : !s32i, !cir.ptr<!s32i>
// CIR-NEXT: }
@@ -612,7 +612,7 @@ void atomic_cmpxchg_n(int *ptr, int *expected, int desired, int failure) {
__atomic_compare_exchange_n(ptr, expected, desired, /*weak=*/1, __ATOMIC_SEQ_CST, __ATOMIC_ACQUIRE);
// CIR: %[[OLD:.+]], %[[SUCCESS:.+]] = cir.atomic.cmpxchg weak success(seq_cst) failure(acquire) syncscope(system) %{{.+}}, %{{.+}}, %{{.+}} align(4) : (!cir.ptr<!s32i>, !s32i, !s32i) -> (!s32i, !cir.bool)
- // CIR-NEXT: %[[FAILED:.+]] = cir.unary(not, %[[SUCCESS]]) : !cir.bool, !cir.bool
+ // CIR-NEXT: %[[FAILED:.+]] = cir.not %[[SUCCESS]] : !cir.bool
// CIR-NEXT: cir.if %[[FAILED]] {
// CIR-NEXT: cir.store align(4) %[[OLD]], %{{.+}} : !s32i, !cir.ptr<!s32i>
// CIR-NEXT: }
@@ -2547,7 +2547,7 @@ void test_op_and_fetch() {
// CIR: [[VAL0:%.*]] = cir.cast integral {{%.*}} : !u8i -> !s8i
// CIR: [[RES0:%.*]] = cir.atomic.fetch nand seq_cst syncscope(system) fetch_first {{%.*}}, [[VAL0]] : (!cir.ptr<!s8i>, !s8i) -> !s8i
// CIR: [[INTERM0:%.*]] = cir.and [[RES0]], [[VAL0]] : !s8i
- // CIR: [[RET0:%.*]] = cir.unary(not, [[INTERM0]]) : !s8i, !s8i
+ // CIR: [[RET0:%.*]] = cir.not [[INTERM0]] : !s8i
// LLVM: [[VAL0:%.*]] = load i8, ptr %{{.*}}, align 1
// LLVM: [[RES0:%.*]] = atomicrmw nand ptr %{{.*}}, i8 [[VAL0]] seq_cst, align 1
// LLVM: [[INTERM0:%.*]] = and i8 [[RES0]], [[VAL0]]
@@ -2562,7 +2562,7 @@ void test_op_and_fetch() {
// CIR: [[RES1:%.*]] = cir.atomic.fetch nand seq_cst syncscope(system) fetch_first {{%.*}}, [[VAL1:%.*]] : (!cir.ptr<!u8i>, !u8i) -> !u8i
// CIR: [[INTERM1:%.*]] = cir.and [[RES1]], [[VAL1]] : !u8i
- // CIR: [[RET1:%.*]] = cir.unary(not, [[INTERM1]]) : !u8i, !u8i
+ // CIR: [[RET1:%.*]] = cir.not [[INTERM1]] : !u8i
// LLVM: [[VAL1:%.*]] = load i8, ptr %{{.*}}, align 1
// LLVM: [[RES1:%.*]] = atomicrmw nand ptr %{{.*}}, i8 [[VAL1]] seq_cst, align 1
// LLVM: [[INTERM1:%.*]] = and i8 [[RES1]], [[VAL1]]
@@ -2578,7 +2578,7 @@ void test_op_and_fetch() {
// CIR: [[VAL2:%.*]] = cir.cast integral {{%.*}} : !u8i -> !s16i
// CIR: [[RES2:%.*]] = cir.atomic.fetch nand seq_cst syncscope(system) fetch_first {{%.*}}, [[VAL2]] : (!cir.ptr<!s16i>, !s16i) -> !s16i
// CIR: [[INTERM2:%.*]] = cir.and [[RES2]], [[VAL2]] : !s16i
- // CIR: [[RET2:%.*]] = cir.unary(not, [[INTERM2]]) : !s16i, !s16i
+ // CIR: [[RET2:%.*]] = cir.not [[INTERM2]] : !s16i
// LLVM: [[VAL2:%.*]] = load i8, ptr %{{.*}}, align 1
// LLVM: [[CONV2:%.*]] = zext i8 [[VAL2]] to i16
// LLVM: [[RES2:%.*]] = atomicrmw nand ptr %{{.*}}, i16 [[CONV2]] seq_cst, align 2
@@ -2596,7 +2596,7 @@ void test_op_and_fetch() {
// CIR: [[VAL3:%.*]] = cir.cast integral {{%.*}} : !u8i -> !u16i
// CIR: [[RES3:%.*]] = cir.atomic.fetch nand seq_cst syncscope(system) fetch_first {{%.*}}, [[VAL3]] : (!cir.ptr<!u16i>, !u16i) -> !u16i
// CIR: [[INTERM3:%.*]] = cir.and [[RES3]], [[VAL3]] : !u16i
- // CIR: [[RET3:%.*]] = cir.unary(not, [[INTERM3]]) : !u16i, !u16i
+ // CIR: [[RET3:%.*]] = cir.not [[INTERM3]] : !u16i
// LLVM: [[VAL3:%.*]] = load i8, ptr %{{.*}}, align 1
// LLVM: [[CONV3:%.*]] = zext i8 [[VAL3]] to i16
// LLVM: [[RES3:%.*]] = atomicrmw nand ptr %{{.*}}, i16 [[CONV3]] seq_cst, align 2
@@ -2614,7 +2614,7 @@ void test_op_and_fetch() {
// CIR: [[VAL4:%.*]] = cir.cast integral {{%.*}} : !u8i -> !s32i
// CIR: [[RES4:%.*]] = cir.atomic.fetch nand seq_cst syncscope(system) fetch_first {{%.*}}, [[VAL4]] : (!cir.ptr<!s32i>, !s32i) -> !s32i
// CIR: [[INTERM4:%.*]] = cir.and [[RES4]], [[VAL4]] : !s32i
- // CIR: [[RET4:%.*]] = cir.unary(not, [[INTERM4]]) : !s32i, !s32i
+ // CIR: [[RET4:%.*]] = cir.not [[INTERM4]] : !s32i
// LLVM: [[VAL4:%.*]] = load i8, ptr %{{.*}}, align 1
// LLVM: [[CONV4:%.*]] = zext i8 [[VAL4]] to i32
// LLVM: [[RES4:%.*]] = atomicrmw nand ptr %{{.*}}, i32 [[CONV4]] seq_cst, align 4
@@ -2632,7 +2632,7 @@ void test_op_and_fetch() {
// CIR: [[VAL5:%.*]] = cir.cast integral {{%.*}} : !u8i -> !u32i
// CIR: [[RES5:%.*]] = cir.atomic.fetch nand seq_cst syncscope(system) fetch_first {{%.*}}, [[VAL5]] : (!cir.ptr<!u32i>, !u32i) -> !u32i
// CIR: [[INTERM5:%.*]] = cir.and [[RES5]], [[VAL5]] : !u32i
- // CIR: [[RET5:%.*]] = cir.unary(not, [[INTERM5]]) : !u32i, !u32i
+ // CIR: [[RET5:%.*]] = cir.not [[INTERM5]] : !u32i
// LLVM: [[VAL5:%.*]] = load i8, ptr %{{.*}}, align 1
// LLVM: [[CONV5:%.*]] = zext i8 [[VAL5]] to i32
// LLVM: [[RES5:%.*]] = atomicrmw nand ptr %{{.*}}, i32 [[CONV5]] seq_cst, align 4
@@ -2650,7 +2650,7 @@ void test_op_and_fetch() {
// CIR: [[VAL6:%.*]] = cir.cast integral {{%.*}} : !u8i -> !s64i
// CIR: [[RES6:%.*]] = cir.atomic.fetch nand seq_cst syncscope(system) fetch_first {{%.*}}, [[VAL6]] : (!cir.ptr<!s64i>, !s64i) -> !s64i
// CIR: [[INTERM6:%.*]] = cir.and [[RES6]], [[VAL6]] : !s64i
- // CIR: [[RET6:%.*]] = cir.unary(not, [[INTERM6]]) : !s64i, !s64i
+ // CIR: [[RET6:%.*]] = cir.not [[INTERM6]] : !s64i
// LLVM: [[VAL6:%.*]] = load i8, ptr %{{.*}}, align 1
// LLVM: [[CONV6:%.*]] = zext i8 [[VAL6]] to i64
// LLVM: [[RES6:%.*]] = atomicrmw nand ptr %{{.*}}, i64 [[CONV6]] seq_cst, align 8
@@ -2668,7 +2668,7 @@ void test_op_and_fetch() {
// CIR: [[VAL7:%.*]] = cir.cast integral {{%.*}} : !u8i -> !u64i
// CIR: [[RES7:%.*]] = cir.atomic.fetch nand seq_cst syncscope(system) fetch_first {{%.*}}, [[VAL7]] : (!cir.ptr<!u64i>, !u64i) -> !u64i
// CIR: [[INTERM7:%.*]] = cir.and [[RES7]], [[VAL7]] : !u64i
- // CIR: [[RET7:%.*]] = cir.unary(not, [[INTERM7]]) : !u64i, !u64i
+ // CIR: [[RET7:%.*]] = cir.not [[INTERM7]] : !u64i
// LLVM: [[VAL7:%.*]] = load i8, ptr %{{.*}}, align 1
// LLVM: [[CONV7:%.*]] = zext i8 [[VAL7]] to i64
// LLVM: [[RES7:%.*]] = atomicrmw nand ptr %{{.*}}, i64 [[CONV7]] seq_cst, align 8
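
A note on the pattern these nand checks encode: the builtin under test stores the nand result and returns the new value, which the IR reconstructs as an and of the fetched value with the operand followed by a not. A minimal scalar sketch of that semantics, with an illustrative helper name and no atomicity, would be:

static inline unsigned char nand_and_fetch_model(unsigned char *ptr,
                                                 unsigned char val) {
  unsigned char old = *ptr;                    /* value the atomicrmw fetches */
  unsigned char res = (unsigned char)~(old & val);
  *ptr = res;                                  /* nand result written back */
  return res;                                  /* *_and_fetch returns the new value */
}
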
diff --git a/clang/test/CIR/CodeGen/binop.cpp b/clang/test/CIR/CodeGen/binop.cpp
index 0a62ab87cf6fa..f55ff858bf79e 100644
--- a/clang/test/CIR/CodeGen/binop.cpp
+++ b/clang/test/CIR/CodeGen/binop.cpp
@@ -127,10 +127,10 @@ void b0(int a, int b) {
// OGCG: ret void
void testFloatingPointBinOps(float a, float b) {
- a * b;
- a / b;
- a + b;
- a - b;
+ float x = a * b;
+ x = x / b;
+ x = x + b;
+ x = x - b;
}
// CIR-LABEL: cir.func{{.*}} @_Z23testFloatingPointBinOpsff(
@@ -144,48 +144,58 @@ void testFloatingPointBinOps(float a, float b) {
// LLVM-SAME: float {{.*}} %[[A:.*]], float {{.*}} %[[B:.*]])
// LLVM: %[[A_ADDR:.*]] = alloca float, i64 1
// LLVM: %[[B_ADDR:.*]] = alloca float, i64 1
+// LLVM: %[[X_ADDR:.*]] = alloca float, i64 1
// LLVM: store float %[[A]], ptr %[[A_ADDR]]
// LLVM: store float %[[B]], ptr %[[B_ADDR]]
// LLVM: %[[A1:.*]] = load float, ptr %[[A_ADDR]]
// LLVM: %[[B1:.*]] = load float, ptr %[[B_ADDR]]
-// LLVM: fmul float %[[A1]], %[[B1]]
+// LLVM: %[[MUL:.*]] = fmul float %[[A1]], %[[B1]]
+// LLVM: store float %[[MUL]], ptr %[[X_ADDR]]
-// LLVM: %[[A2:.*]] = load float, ptr %[[A_ADDR]]
+// LLVM: %[[X1:.*]] = load float, ptr %[[X_ADDR]]
// LLVM: %[[B2:.*]] = load float, ptr %[[B_ADDR]]
-// LLVM: fdiv float %[[A2]], %[[B2]]
+// LLVM: %[[DIV:.*]] = fdiv float %[[X1]], %[[B2]]
+// LLVM: store float %[[DIV]], ptr %[[X_ADDR]]
-// LLVM: %[[A3:.*]] = load float, ptr %[[A_ADDR]]
+// LLVM: %[[X2:.*]] = load float, ptr %[[X_ADDR]]
// LLVM: %[[B3:.*]] = load float, ptr %[[B_ADDR]]
-// LLVM: fadd float %[[A3]], %[[B3]]
+// LLVM: %[[ADD:.*]] = fadd float %[[X2]], %[[B3]]
+// LLVM: store float %[[ADD]], ptr %[[X_ADDR]]
-// LLVM: %[[A4:.*]] = load float, ptr %[[A_ADDR]]
+// LLVM: %[[X3:.*]] = load float, ptr %[[X_ADDR]]
// LLVM: %[[B4:.*]] = load float, ptr %[[B_ADDR]]
-// LLVM: fsub float %[[A4]], %[[B4]]
+// LLVM: %[[SUB:.*]] = fsub float %[[X3]], %[[B4]]
+// LLVM: store float %[[SUB]], ptr %[[X_ADDR]]
// LLVM: ret void
// OGCG-LABEL: define{{.*}} void @_Z23testFloatingPointBinOpsff(float {{.*}} %a, float {{.*}} %b)
// OGCG: %a.addr = alloca float
// OGCG: %b.addr = alloca float
+// OGCG: %x = alloca float
// OGCG: store float %a, ptr %a.addr
// OGCG: store float %b, ptr %b.addr
// OGCG: %[[A1:.*]] = load float, ptr %a.addr
// OGCG: %[[B1:.*]] = load float, ptr %b.addr
-// OGCG: fmul float %[[A1]], %[[B1]]
+// OGCG: %[[MUL:.*]] = fmul float %[[A1]], %[[B1]]
+// OGCG: store float %[[MUL]], ptr %x
-// OGCG: %[[A2:.*]] = load float, ptr %a.addr
+// OGCG: %[[X1:.*]] = load float, ptr %x
// OGCG: %[[B2:.*]] = load float, ptr %b.addr
-// OGCG: fdiv float %[[A2]], %[[B2]]
+// OGCG: %[[DIV:.*]] = fdiv float %[[X1]], %[[B2]]
+// OGCG: store float %[[DIV]], ptr %x
-// OGCG: %[[A3:.*]] = load float, ptr %a.addr
+// OGCG: %[[X2:.*]] = load float, ptr %x
// OGCG: %[[B3:.*]] = load float, ptr %b.addr
-// OGCG: fadd float %[[A3]], %[[B3]]
+// OGCG: %[[ADD:.*]] = fadd float %[[X2]], %[[B3]]
+// OGCG: store float %[[ADD]], ptr %x
-// OGCG: %[[A4:.*]] = load float, ptr %a.addr
+// OGCG: %[[X3:.*]] = load float, ptr %x
// OGCG: %[[B4:.*]] = load float, ptr %b.addr
-// OGCG: fsub float %[[A4]], %[[B4]]
+// OGCG: %[[SUB:.*]] = fsub float %[[X3]], %[[B4]]
+// OGCG: store float %[[SUB]], ptr %x
// OGCG: ret void
diff --git a/clang/test/CIR/CodeGen/bitfields.c b/clang/test/CIR/CodeGen/bitfields.c
index 10af7ad1ef5ee..7d86bc36a31e6 100644
--- a/clang/test/CIR/CodeGen/bitfields.c
+++ b/clang/test/CIR/CodeGen/bitfields.c
@@ -288,7 +288,7 @@ void unOp(S* s) {
// CIR: [[TMP1:%.*]] = cir.load align(8) [[TMP0]] : !cir.ptr<!cir.ptr<!rec_S>>, !cir.ptr<!rec_S>
// CIR: [[TMP2:%.*]] = cir.get_member [[TMP1]][0] {name = "d"} : !cir.ptr<!rec_S> -> !cir.ptr<!u64i>
// CIR: [[TMP3:%.*]] = cir.get_bitfield align(4) (#bfi_d, [[TMP2]] : !cir.ptr<!u64i>) -> !s32i
-// CIR: [[TMP4:%.*]] = cir.unary(inc, [[TMP3]]) nsw : !s32i, !s32i
+// CIR: [[TMP4:%.*]] = cir.inc nsw [[TMP3]] : !s32i
// CIR: cir.set_bitfield align(4) (#bfi_d, [[TMP2]] : !cir.ptr<!u64i>, [[TMP4]] : !s32i)
// LLVM: define {{.*@unOp}}
diff --git a/clang/test/CIR/CodeGen/complex-builtins.cpp b/clang/test/CIR/CodeGen/complex-builtins.cpp
index 811af47a704f5..9a6ca7562ecf7 100644
--- a/clang/test/CIR/CodeGen/complex-builtins.cpp
+++ b/clang/test/CIR/CodeGen/complex-builtins.cpp
@@ -94,7 +94,7 @@ void foo4() {
// CIR: %[[TMP:.*]] = cir.load{{.*}} %[[COMPLEX]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
// CIR: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
// CIR: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
-// CIR: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !cir.float, !cir.float
+// CIR: %[[IMAG_MINUS:.*]] = cir.minus %[[IMAG]] : !cir.float
// CIR: %[[RESULT_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG_MINUS]] : !cir.float -> !cir.complex<!cir.float>
// CIR: cir.store{{.*}} %[[RESULT_VAL]], %[[RESULT]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
diff --git a/clang/test/CIR/CodeGen/complex-unary.cpp b/clang/test/CIR/CodeGen/complex-unary.cpp
index a8e434b903763..753b322c6597a 100644
--- a/clang/test/CIR/CodeGen/complex-unary.cpp
+++ b/clang/test/CIR/CodeGen/complex-unary.cpp
@@ -13,7 +13,7 @@ void foo() {
// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["a"]
// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["b", init]
// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!s32i>>, !cir.complex<!s32i>
-// CIR-BEFORE: %[[COMPLEX_NOT:.*]] = cir.unary(not, %[[TMP]]) : !cir.complex<!s32i>, !cir.complex<!s32i>
+// CIR-BEFORE: %[[COMPLEX_NOT:.*]] = cir.not %[[TMP]] : !cir.complex<!s32i>
// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_NOT]], %[[B_ADDR]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>, ["a"]
@@ -21,7 +21,7 @@ void foo() {
// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!s32i>>, !cir.complex<!s32i>
// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!s32i> -> !s32i
// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!s32i> -> !s32i
-// CIR-AFTER: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !s32i, !s32i
+// CIR-AFTER: %[[IMAG_MINUS:.*]] = cir.minus %[[IMAG]] : !s32i
// CIR-AFTER: %[[RESULT_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG_MINUS]] : !s32i -> !cir.complex<!s32i>
// CIR-AFTER: cir.store{{.*}} %[[RESULT_VAL]], %[[B_ADDR]] : !cir.complex<!s32i>, !cir.ptr<!cir.complex<!s32i>>
@@ -55,7 +55,7 @@ void foo2() {
// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init]
// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
-// CIR-BEFORE: %[[COMPLEX_NOT:.*]] = cir.unary(not, %[[TMP]]) : !cir.complex<!cir.float>, !cir.complex<!cir.float>
+// CIR-BEFORE: %[[COMPLEX_NOT:.*]] = cir.not %[[TMP]] : !cir.complex<!cir.float>
// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_NOT]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
@@ -63,7 +63,7 @@ void foo2() {
// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
-// CIR-AFTER: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !cir.float, !cir.float
+// CIR-AFTER: %[[IMAG_MINUS:.*]] = cir.minus %[[IMAG]] : !cir.float
// CIR-AFTER: %[[RESULT_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG_MINUS]] : !cir.float -> !cir.complex<!cir.float>
// CIR-AFTER: cir.store{{.*}} %[[RESULT_VAL]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
@@ -97,7 +97,7 @@ void foo3() {
// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init]
// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
-// CIR-BEFORE: %[[COMPLEX_INC:.*]] = cir.unary(inc, %[[TMP]]) : !cir.complex<!cir.float>, !cir.complex<!cir.float>
+// CIR-BEFORE: %[[COMPLEX_INC:.*]] = cir.inc %[[TMP]] : !cir.complex<!cir.float>
// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_INC]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
// CIR-BEFORE: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
@@ -106,7 +106,7 @@ void foo3() {
// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
-// CIR-AFTER: %[[REAL_INC:.*]] = cir.unary(inc, %[[REAL]]) : !cir.float, !cir.float
+// CIR-AFTER: %[[REAL_INC:.*]] = cir.inc %[[REAL]] : !cir.float
// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_INC]], %[[IMAG]] : !cir.float -> !cir.complex<!cir.float>
// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
// CIR-AFTER: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
@@ -146,7 +146,7 @@ void foo4() {
// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init]
// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
-// CIR-BEFORE: %[[COMPLEX_INC:.*]] = cir.unary(inc, %[[TMP]]) : !cir.complex<!cir.float>, !cir.complex<!cir.float>
+// CIR-BEFORE: %[[COMPLEX_INC:.*]] = cir.inc %[[TMP]] : !cir.complex<!cir.float>
// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_INC]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_INC]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
@@ -155,7 +155,7 @@ void foo4() {
// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
-// CIR-AFTER: %[[REAL_INC:.*]] = cir.unary(inc, %[[REAL]]) : !cir.float, !cir.float
+// CIR-AFTER: %[[REAL_INC:.*]] = cir.inc %[[REAL]] : !cir.float
// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_INC]], %[[IMAG]] : !cir.float -> !cir.complex<!cir.float>
// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
@@ -195,7 +195,7 @@ void foo5() {
// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init]
// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
-// CIR-BEFORE: %[[COMPLEX_DEC:.*]] = cir.unary(dec, %[[TMP]]) : !cir.complex<!cir.float>, !cir.complex<!cir.float>
+// CIR-BEFORE: %[[COMPLEX_DEC:.*]] = cir.dec %[[TMP]] : !cir.complex<!cir.float>
// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_DEC]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
// CIR-BEFORE: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
@@ -204,7 +204,7 @@ void foo5() {
// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
-// CIR-AFTER: %[[REAL_DEC:.*]] = cir.unary(dec, %[[REAL]]) : !cir.float, !cir.float
+// CIR-AFTER: %[[REAL_DEC:.*]] = cir.dec %[[REAL]] : !cir.float
// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_DEC]], %[[IMAG]] : !cir.float -> !cir.complex<!cir.float>
// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
// CIR-AFTER: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
@@ -244,7 +244,7 @@ void foo6() {
// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init]
// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
-// CIR-BEFORE: %[[COMPLEX_DEC:.*]] = cir.unary(dec, %[[TMP]]) : !cir.complex<!cir.float>, !cir.complex<!cir.float>
+// CIR-BEFORE: %[[COMPLEX_DEC:.*]] = cir.dec %[[TMP]] : !cir.complex<!cir.float>
// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_DEC]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_DEC]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
@@ -253,7 +253,7 @@ void foo6() {
// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
-// CIR-AFTER: %[[REAL_DEC:.*]] = cir.unary(dec, %[[REAL]]) : !cir.float, !cir.float
+// CIR-AFTER: %[[REAL_DEC:.*]] = cir.dec %[[REAL]] : !cir.float
// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_DEC]], %[[IMAG]] : !cir.float -> !cir.complex<!cir.float>
// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
@@ -293,27 +293,17 @@ void foo7() {
// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init]
// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
-// CIR-BEFORE: %[[COMPLEX_PLUS:.*]] = cir.unary(plus, %[[TMP]]) : !cir.complex<!cir.float>, !cir.complex<!cir.float>
-// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_PLUS]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
+// CIR-BEFORE: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
// CIR-AFTER: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init]
// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
-// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
-// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
-// CIR-AFTER: %[[REAL_PLUS:.*]] = cir.unary(plus, %[[REAL]]) : !cir.float, !cir.float
-// CIR-AFTER: %[[IMAG_PLUS:.*]] = cir.unary(plus, %[[IMAG]]) : !cir.float, !cir.float
-// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_PLUS]], %[[IMAG_PLUS]] : !cir.float -> !cir.complex<!cir.float>
-// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
+// CIR-AFTER: cir.store{{.*}} %[[TMP]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4
// LLVM: %[[B_ADDR:.*]] = alloca { float, float }, i64 1, align 4
// LLVM: %[[TMP:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4
-// LLVM: %[[REAL:.*]] = extractvalue { float, float } %[[TMP]], 0
-// LLVM: %[[IMAG:.*]] = extractvalue { float, float } %[[TMP]], 1
-// LLVM: %[[RESULT_TMP:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL]], 0
-// LLVM: %[[RESULT_VAL:.*]] = insertvalue { float, float } %[[RESULT_TMP]], float %[[IMAG]], 1
-// LLVM: store { float, float } %[[RESULT_VAL]], ptr %[[B_ADDR]], align 4
+// LLVM: store { float, float } %[[TMP]], ptr %[[B_ADDR]], align 4
// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4
// OGCG: %[[B_ADDR:.*]] = alloca { float, float }, align 4
@@ -334,7 +324,7 @@ void foo8() {
// CIR-BEFORE: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["b", init]
// CIR-BEFORE: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
-// CIR-BEFORE: %[[COMPLEX_MINUS:.*]] = cir.unary(minus, %[[TMP]]) : !cir.complex<!cir.float>, !cir.complex<!cir.float>
+// CIR-BEFORE: %[[COMPLEX_MINUS:.*]] = cir.minus %[[TMP]] : !cir.complex<!cir.float>
// CIR-BEFORE: cir.store{{.*}} %[[COMPLEX_MINUS]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
@@ -342,8 +332,8 @@ void foo8() {
// CIR-AFTER: %[[TMP:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
// CIR-AFTER: %[[REAL:.*]] = cir.complex.real %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
// CIR-AFTER: %[[IMAG:.*]] = cir.complex.imag %[[TMP]] : !cir.complex<!cir.float> -> !cir.float
-// CIR-AFTER: %[[REAL_MINUS:.*]] = cir.unary(minus, %[[REAL]]) : !cir.float, !cir.float
-// CIR-AFTER: %[[IMAG_MINUS:.*]] = cir.unary(minus, %[[IMAG]]) : !cir.float, !cir.float
+// CIR-AFTER: %[[REAL_MINUS:.*]] = cir.minus %[[REAL]] : !cir.float
+// CIR-AFTER: %[[IMAG_MINUS:.*]] = cir.minus %[[IMAG]] : !cir.float
// CIR-AFTER: %[[NEW_COMPLEX:.*]] = cir.complex.create %[[REAL_MINUS]], %[[IMAG_MINUS]] : !cir.float -> !cir.complex<!cir.float>
// CIR-AFTER: cir.store{{.*}} %[[NEW_COMPLEX]], %[[B_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
@@ -381,8 +371,7 @@ void foo9() {
// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.f16>, !cir.ptr<!cir.complex<!cir.f16>>, ["b", init]
// CIR-BEFORE: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.f16>>, !cir.complex<!cir.f16>
// CIR-BEFORE: %[[A_COMPLEX_F32:.*]] = cir.cast float_complex %[[TMP_A]] : !cir.complex<!cir.f16> -> !cir.complex<!cir.float>
-// CIR-BEFORE: %[[RESULT:.*]] = cir.unary(plus, %[[A_COMPLEX_F32]]) : !cir.complex<!cir.float>, !cir.complex<!cir.float>
-// CIR-BEFORE: %[[A_COMPLEX_F16:.*]] = cir.cast float_complex %[[RESULT]] : !cir.complex<!cir.float> -> !cir.complex<!cir.f16>
+// CIR-BEFORE: %[[A_COMPLEX_F16:.*]] = cir.cast float_complex %[[A_COMPLEX_F32]] : !cir.complex<!cir.float> -> !cir.complex<!cir.f16>
// CIR-BEFORE: cir.store{{.*}} %[[A_COMPLEX_F16]], %[[B_ADDR]] : !cir.complex<!cir.f16>, !cir.ptr<!cir.complex<!cir.f16>>
// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.f16>, !cir.ptr<!cir.complex<!cir.f16>>, ["a"]
@@ -393,13 +382,8 @@ void foo9() {
// CIR-AFTER: %[[A_REAL_F32:.*]] = cir.cast floating %[[A_REAL]] : !cir.f16 -> !cir.float
// CIR-AFTER: %[[A_IMAG_F32:.*]] = cir.cast floating %[[A_IMAG]] : !cir.f16 -> !cir.float
// CIR-AFTER: %[[A_COMPLEX_F32:.*]] = cir.complex.create %[[A_REAL_F32]], %[[A_IMAG_F32]] : !cir.float -> !cir.complex<!cir.float>
-// CIR-AFTER: %[[A_REAL_F32:.*]] = cir.complex.real %[[A_COMPLEX_F32]] : !cir.complex<!cir.float> -> !cir.float
-// CIR-AFTER: %[[A_IMAG_F32:.*]] = cir.complex.imag %[[A_COMPLEX_F32]] : !cir.complex<!cir.float> -> !cir.float
-// CIR-AFTER: %[[RESULT_REAL_F32:.*]] = cir.unary(plus, %[[A_REAL_F32]]) : !cir.float, !cir.float
-// CIR-AFTER: %[[RESULT_IMAG_F32:.*]] = cir.unary(plus, %[[A_IMAG_F32]]) : !cir.float, !cir.float
-// CIR-AFTER: %[[RESULT_COMPLEX_F32:.*]] = cir.complex.create %[[RESULT_REAL_F32]], %[[RESULT_IMAG_F32]] : !cir.float -> !cir.complex<!cir.float>
-// CIR-AFTER: %[[RESULT_REAL_F32:.*]] = cir.complex.real %[[RESULT_COMPLEX_F32]] : !cir.complex<!cir.float> -> !cir.float
-// CIR-AFTER: %[[RESULT_IMAG_F32:.*]] = cir.complex.imag %[[RESULT_COMPLEX_F32]] : !cir.complex<!cir.float> -> !cir.float
+// CIR-AFTER: %[[RESULT_REAL_F32:.*]] = cir.complex.real %[[A_COMPLEX_F32]] : !cir.complex<!cir.float> -> !cir.float
+// CIR-AFTER: %[[RESULT_IMAG_F32:.*]] = cir.complex.imag %[[A_COMPLEX_F32]] : !cir.complex<!cir.float> -> !cir.float
// CIR-AFTER: %[[RESULT_REAL_F16:.*]] = cir.cast floating %[[RESULT_REAL_F32]] : !cir.float -> !cir.f16
// CIR-AFTER: %[[RESULT_IMAG_F16:.*]] = cir.cast floating %[[RESULT_IMAG_F32]] : !cir.float -> !cir.f16
// CIR-AFTER: %[[RESULT_COMPLEX_F16:.*]] = cir.complex.create %[[RESULT_REAL_F16]], %[[RESULT_IMAG_F16]] : !cir.f16 -> !cir.complex<!cir.f16>
@@ -412,10 +396,6 @@ void foo9() {
// LLVM: %[[A_IMAG:.*]] = extractvalue { half, half } %[[TMP_A]], 1
// LLVM: %[[A_REAL_F32:.*]] = fpext half %[[A_REAL]] to float
// LLVM: %[[A_IMAG_F32:.*]] = fpext half %[[A_IMAG]] to float
-// LLVM: %[[TMP_A_COMPLEX_F32:.*]] = insertvalue { float, float } {{.*}}, float %[[A_REAL_F32]], 0
-// LLVM: %[[A_COMPLEX_F32:.*]] = insertvalue { float, float } %[[TMP_A_COMPLEX_F32]], float %[[A_IMAG_F32]], 1
-// LLVM: %[[TMP_A_COMPLEX_F32:.*]] = insertvalue { float, float } {{.*}}, float %[[A_REAL_F32]], 0
-// LLVM: %[[A_COMPLEX_F32:.*]] = insertvalue { float, float } %[[TMP_A_COMPLEX_F32]], float %[[A_IMAG_F32]], 1
// LLVM: %[[A_REAL_F16:.*]] = fptrunc float %[[A_REAL_F32]] to half
// LLVM: %[[A_IMAG_F16:.*]] = fptrunc float %[[A_IMAG_F32]] to half
// LLVM: %[[TMP_RESULT_COMPLEX_F16:.*]] = insertvalue { half, half } {{.*}}, half %[[A_REAL_F16]], 0
@@ -446,7 +426,7 @@ void foo10() {
// CIR-BEFORE: %[[B_ADDR:.*]] = cir.alloca !cir.complex<!cir.f16>, !cir.ptr<!cir.complex<!cir.f16>>, ["b", init]
// CIR-BEFORE: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.f16>>, !cir.complex<!cir.f16>
// CIR-BEFORE: %[[A_COMPLEX_F32:.*]] = cir.cast float_complex %[[TMP_A]] : !cir.complex<!cir.f16> -> !cir.complex<!cir.float>
-// CIR-BEFORE: %[[RESULT:.*]] = cir.unary(minus, %[[A_COMPLEX_F32]]) : !cir.complex<!cir.float>, !cir.complex<!cir.float>
+// CIR-BEFORE: %[[RESULT:.*]] = cir.minus %[[A_COMPLEX_F32]] : !cir.complex<!cir.float>
// CIR-BEFORE: %[[A_COMPLEX_F16:.*]] = cir.cast float_complex %[[RESULT]] : !cir.complex<!cir.float> -> !cir.complex<!cir.f16>
// CIR-BEFORE: cir.store{{.*}} %[[A_COMPLEX_F16]], %[[B_ADDR]] : !cir.complex<!cir.f16>, !cir.ptr<!cir.complex<!cir.f16>>
@@ -460,8 +440,8 @@ void foo10() {
// CIR-AFTER: %[[A_COMPLEX_F32:.*]] = cir.complex.create %[[A_REAL_F32]], %[[A_IMAG_F32]] : !cir.float -> !cir.complex<!cir.float>
// CIR-AFTER: %[[A_REAL_F32:.*]] = cir.complex.real %[[A_COMPLEX_F32]] : !cir.complex<!cir.float> -> !cir.float
// CIR-AFTER: %[[A_IMAG_F32:.*]] = cir.complex.imag %[[A_COMPLEX_F32]] : !cir.complex<!cir.float> -> !cir.float
-// CIR-AFTER: %[[RESULT_REAL_F32:.*]] = cir.unary(minus, %[[A_REAL_F32]]) : !cir.float, !cir.float
-// CIR-AFTER: %[[RESULT_IMAG_F32:.*]] = cir.unary(minus, %[[A_IMAG_F32]]) : !cir.float, !cir.float
+// CIR-AFTER: %[[RESULT_REAL_F32:.*]] = cir.minus %[[A_REAL_F32]] : !cir.float
+// CIR-AFTER: %[[RESULT_IMAG_F32:.*]] = cir.minus %[[A_IMAG_F32]] : !cir.float
// CIR-AFTER: %[[RESULT_COMPLEX_F32:.*]] = cir.complex.create %[[RESULT_REAL_F32]], %[[RESULT_IMAG_F32]] : !cir.float -> !cir.complex<!cir.float>
// CIR-AFTER: %[[RESULT_REAL_F32:.*]] = cir.complex.real %[[RESULT_COMPLEX_F32]] : !cir.complex<!cir.float> -> !cir.float
// CIR-AFTER: %[[RESULT_IMAG_F32:.*]] = cir.complex.imag %[[RESULT_COMPLEX_F32]] : !cir.complex<!cir.float> -> !cir.float
@@ -514,14 +494,14 @@ void complex_unary_inc_lvalue() {
// CIR-BEFORE: %[[A_ADDR]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
// CIR-BEFORE: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
-// CIR-BEFORE: %[[RESULT:.*]] = cir.unary(inc, %[[TMP_A]]) : !cir.complex<!cir.float>, !cir.complex<!cir.float>
+// CIR-BEFORE: %[[RESULT:.*]] = cir.inc %[[TMP_A]] : !cir.complex<!cir.float>
// CIR-BEFORE: cir.store{{.*}} %[[RESULT]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
// CIR-AFTER: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
// CIR-AFTER: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex<!cir.float> -> !cir.float
// CIR-AFTER: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex<!cir.float> -> !cir.float
-// CIR-AFTER: %[[RESULT_REAL:.*]] = cir.unary(inc, %2) : !cir.float, !cir.float
+// CIR-AFTER: %[[RESULT_REAL:.*]] = cir.inc %2 : !cir.float
// CIR-AFTER: %[[RESULT:.*]] = cir.complex.create %[[RESULT_REAL]], %[[A_IMAG]] : !cir.float -> !cir.complex<!cir.float>
// CIR-AFTER: cir.store{{.*}} %[[RESULT]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
@@ -552,14 +532,14 @@ void complex_unary_dec_lvalue() {
// CIR-BEFORE: %[[A_ADDR]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
// CIR-BEFORE: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
-// CIR-BEFORE: %[[RESULT:.*]] = cir.unary(dec, %[[TMP_A]]) : !cir.complex<!cir.float>, !cir.complex<!cir.float>
+// CIR-BEFORE: %[[RESULT:.*]] = cir.dec %[[TMP_A]] : !cir.complex<!cir.float>
// CIR-BEFORE: cir.store{{.*}} %[[RESULT]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
// CIR-AFTER: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
// CIR-AFTER: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
// CIR-AFTER: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex<!cir.float> -> !cir.float
// CIR-AFTER: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex<!cir.float> -> !cir.float
-// CIR-AFTER: %[[RESULT_REAL:.*]] = cir.unary(dec, %2) : !cir.float, !cir.float
+// CIR-AFTER: %[[RESULT_REAL:.*]] = cir.dec %2 : !cir.float
// CIR-AFTER: %[[RESULT:.*]] = cir.complex.create %[[RESULT_REAL]], %[[A_IMAG]] : !cir.float -> !cir.complex<!cir.float>
// CIR-AFTER: cir.store{{.*}} %[[RESULT]], %[[A_ADDR]] : !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>
diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp
index d91083f0513cc..f65f0eea7fbaa 100644
--- a/clang/test/CIR/CodeGen/complex.cpp
+++ b/clang/test/CIR/CodeGen/complex.cpp
@@ -1034,21 +1034,13 @@ void real_on_non_glvalue() {
// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.float, !cir.ptr<!cir.float>, ["b", init]
// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
-// CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex<!cir.float> -> !cir.float
-// CIR: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex<!cir.float> -> !cir.float
-// CIR: %[[A_REAL_PLUS:.*]] = cir.unary(plus, %[[A_REAL]]) : !cir.float, !cir.float
-// CIR: %[[A_IMAG_PLUS:.*]] = cir.unary(plus, %[[A_IMAG]]) : !cir.float, !cir.float
-// CIR: %[[RESULT:.*]] = cir.complex.create %[[A_REAL_PLUS]], %[[A_IMAG_PLUS]] : !cir.float -> !cir.complex<!cir.float>
-// CIR: %[[RESULT_REAL:.*]] = cir.complex.real %[[RESULT]] : !cir.complex<!cir.float> -> !cir.float
+// CIR: %[[RESULT_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex<!cir.float> -> !cir.float
// CIR: cir.store{{.*}} %[[RESULT_REAL]], %[[B_ADDR]] : !cir.float, !cir.ptr<!cir.float>
// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4
// LLVM: %[[B_ADDR:.*]] = alloca float, i64 1, align 4
// LLVM: %[[TMP_A:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4
// LLVM: %[[A_REAL:.*]] = extractvalue { float, float } %[[TMP_A]], 0
-// LLVM: %[[A_IMAG:.*]] = extractvalue { float, float } %[[TMP_A]], 1
-// LLVM: %[[TMP_RESULT:.*]] = insertvalue { float, float } {{.*}}, float %[[A_REAL]], 0
-// LLVM: %[[RESULT:.*]] = insertvalue { float, float } %[[TMP_RESULT]], float %[[A_IMAG]], 1
// LLVM: store float %[[A_REAL]], ptr %[[B_ADDR]], align 4
// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4
@@ -1067,21 +1059,13 @@ void imag_on_non_glvalue() {
// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex<!cir.float>, !cir.ptr<!cir.complex<!cir.float>>, ["a"]
// CIR: %[[B_ADDR:.*]] = cir.alloca !cir.float, !cir.ptr<!cir.float>, ["b", init]
// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.complex<!cir.float>>, !cir.complex<!cir.float>
-// CIR: %[[A_REAL:.*]] = cir.complex.real %[[TMP_A]] : !cir.complex<!cir.float> -> !cir.float
-// CIR: %[[A_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex<!cir.float> -> !cir.float
-// CIR: %[[A_REAL_PLUS:.*]] = cir.unary(plus, %[[A_REAL]]) : !cir.float, !cir.float
-// CIR: %[[A_IMAG_PLUS:.*]] = cir.unary(plus, %[[A_IMAG]]) : !cir.float, !cir.float
-// CIR: %[[RESULT:.*]] = cir.complex.create %[[A_REAL_PLUS]], %[[A_IMAG_PLUS]] : !cir.float -> !cir.complex<!cir.float>
-// CIR: %[[RESULT_IMAG:.*]] = cir.complex.imag %[[RESULT]] : !cir.complex<!cir.float> -> !cir.float
+// CIR: %[[RESULT_IMAG:.*]] = cir.complex.imag %[[TMP_A]] : !cir.complex<!cir.float> -> !cir.float
// CIR: cir.store{{.*}} %[[RESULT_IMAG]], %[[B_ADDR]] : !cir.float, !cir.ptr<!cir.float>
// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4
// LLVM: %[[B_ADDR:.*]] = alloca float, i64 1, align 4
// LLVM: %[[TMP_A:.*]] = load { float, float }, ptr %[[A_ADDR]], align 4
-// LLVM: %[[A_REAL:.*]] = extractvalue { float, float } %[[TMP_A]], 0
// LLVM: %[[A_IMAG:.*]] = extractvalue { float, float } %[[TMP_A]], 1
-// LLVM: %[[TMP_RESULT:.*]] = insertvalue { float, float } {{.*}}, float %[[A_REAL]], 0
-// LLVM: %[[RESULT:.*]] = insertvalue { float, float } %[[TMP_RESULT]], float %[[A_IMAG]], 1
// LLVM: store float %[[A_IMAG]], ptr %[[B_ADDR]], align 4
// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4
diff --git a/clang/test/CIR/CodeGen/coro-task.cpp b/clang/test/CIR/CodeGen/coro-task.cpp
index 637b058443bc7..b52f0f1871079 100644
--- a/clang/test/CIR/CodeGen/coro-task.cpp
+++ b/clang/test/CIR/CodeGen/coro-task.cpp
@@ -569,8 +569,7 @@ folly::coro::Task<int> go4() {
// Get the lambda invoker ptr via `lambda operator folly::coro::Task<int> (*)(int const&)()`
// CIR: %[[INVOKER:.*]] = cir.call @_ZZ3go4vENK3$_0cvPFN5folly4coro4TaskIiEERKiEEv(%{{.*}}) nothrow : {{.*}} -> (!cir.ptr<!cir.func<(!cir.ptr<!s32i>) -> ![[IntTask]]>> {llvm.noundef})
-// CIR: %[[PLUS:.*]] = cir.unary(plus, %[[INVOKER]]) : !cir.ptr<!cir.func<(!cir.ptr<!s32i>) -> ![[IntTask]]>>, !cir.ptr<!cir.func<(!cir.ptr<!s32i>) -> ![[IntTask]]>>
-// CIR: cir.store{{.*}} %[[PLUS]], %[[FN_ADDR:.*]] : !cir.ptr<!cir.func<(!cir.ptr<!s32i>) -> ![[IntTask]]>>, !cir.ptr<!cir.ptr<!cir.func<(!cir.ptr<!s32i>) -> ![[IntTask]]>>>
+// CIR: cir.store{{.*}} %[[INVOKER]], %[[FN_ADDR:.*]] : !cir.ptr<!cir.func<(!cir.ptr<!s32i>) -> ![[IntTask]]>>, !cir.ptr<!cir.ptr<!cir.func<(!cir.ptr<!s32i>) -> ![[IntTask]]>>>
// CIR: cir.scope {
// CIR: %[[ARG:.*]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["ref.tmp2", init] {alignment = 4 : i64}
// CIR: %[[FN:.*]] = cir.load{{.*}} %[[FN_ADDR]] : !cir.ptr<!cir.ptr<!cir.func<(!cir.ptr<!s32i>) -> ![[IntTask]]>>>, !cir.ptr<!cir.func<(!cir.ptr<!s32i>) -> ![[IntTask]]>>
diff --git a/clang/test/CIR/CodeGen/cxx-rewritten-binary-operator.cpp b/clang/test/CIR/CodeGen/cxx-rewritten-binary-operator.cpp
index c743cdc4845a3..f8eece43a0509 100644
--- a/clang/test/CIR/CodeGen/cxx-rewritten-binary-operator.cpp
+++ b/clang/test/CIR/CodeGen/cxx-rewritten-binary-operator.cpp
@@ -19,7 +19,7 @@ void cxx_rewritten_binary_operator_scalar_expr() {
// CIR: %[[B_ADDR:.*]] = cir.alloca !rec_HasOpEq, !cir.ptr<!rec_HasOpEq>, ["b"]
// CIR: %[[NEQ_ADDR:.*]] = cir.alloca !cir.bool, !cir.ptr<!cir.bool>, ["neq", init]
// CIR: %[[EQ:.*]] = cir.call @_ZNK7HasOpEqeqERKS_(%[[A_ADDR]], %[[B_ADDR]]) : (!cir.ptr<!rec_HasOpEq> {{.*}}, !cir.ptr<!rec_HasOpEq> {{.*}}) -> (!cir.bool{{.*}})
-// CIR: %[[NEQ:.*]] = cir.unary(not, %[[EQ]]) : !cir.bool, !cir.bool
+// CIR: %[[NEQ:.*]] = cir.not %[[EQ]] : !cir.bool
// CIR: cir.store{{.*}} %[[NEQ]], %[[NEQ_ADDR]] : !cir.bool, !cir.ptr<!cir.bool>
// LLVM: %[[A_ADDR:.*]] = alloca %struct.HasOpEq, i64 1, align 1
diff --git a/clang/test/CIR/CodeGen/dynamic-cast-exact.cpp b/clang/test/CIR/CodeGen/dynamic-cast-exact.cpp
index 951e8cf8f331d..39f0dd45b9de6 100644
--- a/clang/test/CIR/CodeGen/dynamic-cast-exact.cpp
+++ b/clang/test/CIR/CodeGen/dynamic-cast-exact.cpp
@@ -83,7 +83,7 @@ Derived &ref_cast(Base1 &ref) {
// CIR-NEXT: %[[SRC_VPTR_PTR:.*]] = cir.cast bitcast %[[SRC]] : !cir.ptr<!rec_Base1> -> !cir.ptr<!cir.vptr>
// CIR-NEXT: %[[SRC_VPTR:.*]] = cir.load{{.*}} %[[SRC_VPTR_PTR]] : !cir.ptr<!cir.vptr>, !cir.vptr
// CIR-NEXT: %[[SUCCESS:.*]] = cir.cmp eq %[[SRC_VPTR]], %[[EXPECTED_VPTR]] : !cir.vptr
-// CIR-NEXT: %[[FAILED:.*]] = cir.unary(not, %[[SUCCESS]]) : !cir.bool, !cir.bool
+// CIR-NEXT: %[[FAILED:.*]] = cir.not %[[SUCCESS]] : !cir.bool
// CIR-NEXT: cir.if %[[FAILED]] {
// CIR-NEXT: cir.call @__cxa_bad_cast() : () -> ()
// CIR-NEXT: cir.unreachable
diff --git a/clang/test/CIR/CodeGen/fold-during-cg.c b/clang/test/CIR/CodeGen/fold-during-cg.c
index 166efe893d682..137d5bb3d4f20 100644
--- a/clang/test/CIR/CodeGen/fold-during-cg.c
+++ b/clang/test/CIR/CodeGen/fold-during-cg.c
@@ -18,19 +18,19 @@ void fold_int_not() {
n = ~0;
// CIR: %[[MINUS_ONE:.*]] = cir.const #cir.int<-1> : !s32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.not
// CIR: cir.store{{.*}} %[[MINUS_ONE]], %{{.*}}
// LLVM_OGCG: store i32 -1, ptr %{{.*}}
n = ~1;
// CIR: %[[MINUS_TWO:.*]] = cir.const #cir.int<-2> : !s32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.not
// CIR: cir.store{{.*}} %[[MINUS_TWO]], %{{.*}}
// LLVM_OGCG: store i32 -2, ptr %{{.*}}
n = ~0xFFFFFFFE;
// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.not
// CIR: cir.store{{.*}} %[[ONE]], %{{.*}}
// LLVM_OGCG: store i32 1, ptr %{{.*}}
}
@@ -43,73 +43,73 @@ void fold_int_plus() {
n = +1;
// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.plus
// CIR: cir.store{{.*}} %[[ONE]], %{{.*}}
// LLVM_OGCG: store i32 1, ptr %{{.*}}
n = +2;
// CIR: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.plus
// CIR: cir.store{{.*}} %[[TWO]], %{{.*}}
// LLVM_OGCG: store i32 2, ptr %{{.*}}
n = +(-3);
// CIR: %[[MINUS_THREE:.*]] = cir.const #cir.int<-3> : !s32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.plus
// CIR: cir.store{{.*}} %[[MINUS_THREE]], %{{.*}}
// LLVM_OGCG: store i32 -3, ptr %{{.*}}
n = +(0x1FFFFFFFF);
// CIR: %[[MINUS_ONE:.*]] = cir.const #cir.int<-1> : !s32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.plus
// CIR: cir.store{{.*}} %[[MINUS_ONE]], %{{.*}}
// LLVM_OGCG: store i32 -1, ptr %{{.*}}
s = +1;
// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s16i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.plus
// CIR: cir.store{{.*}} %[[ONE]], %{{.*}}
// LLVM_OGCG: store i16 1, ptr %{{.*}}
s = +2;
// CIR: %[[TWO:.*]] = cir.const #cir.int<2> : !s16i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.plus
// CIR: cir.store{{.*}} %[[TWO]], %{{.*}}
// LLVM_OGCG: store i16 2, ptr %{{.*}}
s = +(-3);
// CIR: %[[MINUS_THREE:.*]] = cir.const #cir.int<-3> : !s16i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.plus
// CIR: cir.store{{.*}} %[[MINUS_THREE]], %{{.*}}
// LLVM_OGCG: store i16 -3, ptr %{{.*}}
s = +(0x1FFFF);
// CIR: %[[MINUS_ONE:.*]] = cir.const #cir.int<-1> : !s16i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.plus
// CIR: cir.store{{.*}} %[[MINUS_ONE]], %{{.*}}
// LLVM_OGCG: store i16 -1, ptr %{{.*}}
u = +1;
// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.plus
// CIR: cir.store{{.*}} %[[ONE]], %{{.*}}
// LLVM_OGCG: store i32 1, ptr %{{.*}}
u = +2;
// CIR: %[[TWO:.*]] = cir.const #cir.int<2> : !u32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.plus
// CIR: cir.store{{.*}} %[[TWO]], %{{.*}}
// LLVM_OGCG: store i32 2, ptr %{{.*}}
u = +(-3);
// CIR: %[[MINUS_THREE:.*]] = cir.const #cir.int<4294967293> : !u32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.plus
// CIR: cir.store{{.*}} %[[MINUS_THREE]], %{{.*}}
// LLVM_OGCG: store i32 -3, ptr %{{.*}}
u = +(0x1FFFFFFFF);
// CIR: %[[MINUS_ONE:.*]] = cir.const #cir.int<4294967295> : !u32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.plus
// CIR: cir.store{{.*}} %[[MINUS_ONE]], %{{.*}}
// LLVM_OGCG: store i32 -1, ptr %{{.*}}
}
@@ -122,76 +122,76 @@ void fold_int_minus() {
n = -1;
// CIR: %[[MINUS_ONE:.*]] = cir.const #cir.int<-1> : !s32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.minus
// CIR: cir.store{{.*}} %[[MINUS_ONE]], %{{.*}}
// LLVM_OGCG: store i32 -1, ptr %{{.*}}
n = -2;
// CIR: %[[MINUS_TWO:.*]] = cir.const #cir.int<-2> : !s32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.minus
// CIR: cir.store{{.*}} %[[MINUS_TWO]], %{{.*}}
// LLVM_OGCG: store i32 -2, ptr %{{.*}}
n = -(-3);
// CIR-DUP: %[[UNUSED_THREE:.*]] = cir.const #cir.int<3> : !s32i
// CIR: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.minus
// CIR: cir.store{{.*}} %[[THREE]], %{{.*}}
// LLVM_OGCG: store i32 3, ptr %{{.*}}
n = -(0x1FFFFFFFF);
// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.minus
// CIR: cir.store{{.*}} %[[ONE]], %{{.*}}
// LLVM_OGCG: store i32 1, ptr %{{.*}}
s = -1;
// CIR: %[[MINUS_ONE:.*]] = cir.const #cir.int<-1> : !s16i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.minus
// CIR: cir.store{{.*}} %[[MINUS_ONE]], %{{.*}}
// LLVM_OGCG: store i16 -1, ptr %{{.*}}
s = -2;
// CIR: %[[MINUS_TWO:.*]] = cir.const #cir.int<-2> : !s16i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.minus
// CIR: cir.store{{.*}} %[[MINUS_TWO]], %{{.*}}
// LLVM_OGCG: store i16 -2, ptr %{{.*}}
s = -(-3);
// CIR-DUP: %[[UNUSED_THREE:.*]] = cir.const #cir.int<3> : !s32i
// CIR: %[[THREE:.*]] = cir.const #cir.int<3> : !s16i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.minus
// CIR: cir.store{{.*}} %[[THREE]], %{{.*}}
// LLVM_OGCG: store i16 3, ptr %{{.*}}
s = -(0x1FFFF);
// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s16i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.minus
// CIR: cir.store{{.*}} %[[ONE]], %{{.*}}
// LLVM_OGCG: store i16 1, ptr %{{.*}}
u = -1;
// CIR: %[[MINUS_ONE:.*]] = cir.const #cir.int<4294967295> : !u32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.minus
// CIR: cir.store{{.*}} %[[MINUS_ONE]], %{{.*}}
// LLVM_OGCG: store i32 -1, ptr %{{.*}}
u = -2;
// CIR: %[[MINUS_TWO:.*]] = cir.const #cir.int<4294967294> : !u32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.minus
// CIR: cir.store{{.*}} %[[MINUS_TWO]], %{{.*}}
// LLVM_OGCG: store i32 -2, ptr %{{.*}}
u = -(-3);
// CIR-DUP: %[[UNUSED_THREE:.*]] = cir.const #cir.int<3> : !s32i
// CIR: %[[THREE:.*]] = cir.const #cir.int<3> : !u32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.minus
// CIR: cir.store{{.*}} %[[THREE]], %{{.*}}
// LLVM_OGCG: store i32 3, ptr %{{.*}}
u = -(0x1FFFFFFFF);
// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !u32i
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.minus
// CIR: cir.store{{.*}} %[[ONE]], %{{.*}}
// LLVM_OGCG: store i32 1, ptr %{{.*}}
}
@@ -203,25 +203,25 @@ void fold_float_plus() {
f = +2.0f;
// CIR: %[[TWO:.*]] = cir.const #cir.fp<2.000000e+00> : !cir.float
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.plus
// CIR: cir.store{{.*}} %[[TWO]], %{{.*}}
// LLVM_OGCG: store float 2.000000e+00, ptr %{{.*}}
f = +(-3.0f);
// CIR: %[[MINUS_THREE:.*]] = cir.const #cir.fp<-3.000000e+00> : !cir.float
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.plus
// CIR: cir.store{{.*}} %[[MINUS_THREE]], %{{.*}}
// LLVM_OGCG: store float -3.000000e+00, ptr %{{.*}}
d = +2.0;
// CIR: %[[TWO:.*]] = cir.const #cir.fp<2.000000e+00> : !cir.double
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.plus
// CIR: cir.store{{.*}} %[[TWO]], %{{.*}}
// LLVM_OGCG: store double 2.000000e+00, ptr %{{.*}}
d = +(-3.0);
// CIR: %[[MINUS_THREE:.*]] = cir.const #cir.fp<-3.000000e+00> : !cir.double
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.plus
// CIR: cir.store{{.*}} %[[MINUS_THREE]], %{{.*}}
// LLVM_OGCG: store double -3.000000e+00, ptr %{{.*}}
}
@@ -233,27 +233,27 @@ void fold_float_minus() {
f = -2.0f;
// CIR: %[[MINUS_TWO:.*]] = cir.const #cir.fp<-2.000000e+00> : !cir.float
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.minus
// CIR: cir.store{{.*}} %[[MINUS_TWO]], %{{.*}}
// LLVM_OGCG: store float -2.000000e+00, ptr %{{.*}}
f = -(-3.0f);
// CIR-DUP: %[[UNUSED_THREE:.*]] = cir.const #cir.fp<3.000000e+00> : !cir.float
// CIR: %[[THREE:.*]] = cir.const #cir.fp<3.000000e+00> : !cir.float
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.minus
// CIR: cir.store{{.*}} %[[THREE]], %{{.*}}
// LLVM_OGCG: store float 3.000000e+00, ptr %{{.*}}
d = -2.0;
// CIR: %[[MINUS_TWO:.*]] = cir.const #cir.fp<-2.000000e+00> : !cir.double
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.minus
// CIR: cir.store{{.*}} %[[MINUS_TWO]], %{{.*}}
// LLVM_OGCG: store double -2.000000e+00, ptr %{{.*}}
d = -(-3.0);
// CIR-DUP: %[[UNUSED_THREE:.*]] = cir.const #cir.fp<3.000000e+00> : !cir.double
// CIR: %[[THREE:.*]] = cir.const #cir.fp<3.000000e+00> : !cir.double
- // CIR-NOT: cir.unary
+ // CIR-NOT: cir.minus
// CIR: cir.store{{.*}} %[[THREE]], %{{.*}}
// LLVM_OGCG: store double 3.000000e+00, ptr %{{.*}}
}
diff --git a/clang/test/CIR/CodeGen/integer-overflow.c b/clang/test/CIR/CodeGen/integer-overflow.c
index 119c5a917698a..ad4de2da7a769 100644
--- a/clang/test/CIR/CodeGen/integer-overflow.c
+++ b/clang/test/CIR/CodeGen/integer-overflow.c
@@ -43,28 +43,28 @@ void test1(void) {
// WRAPV-OGCG: mul i32
f11G = a * b;
- // DEFAULT-CIR: cir.unary(minus, {{.*}}) nsw : !s32i
+ // DEFAULT-CIR: cir.minus nsw {{.*}} : !s32i
// DEFAULT-LLVM: sub nsw i32 0,
// DEFAULT-OGCG: sub nsw i32 0,
- // WRAPV-CIR: cir.unary(minus, {{.*}}) : !s32i
+ // WRAPV-CIR: cir.minus {{.*}} : !s32i
// WRAPV-LLVM: sub i32 0,
// WRAPV-OGCG: sub i32 0,
f11G = -a;
// PR7426 - Overflow checking for increments.
- // DEFAULT-CIR: cir.unary(inc, {{.*}}) nsw : !s32i
+ // DEFAULT-CIR: cir.inc nsw {{.*}} : !s32i
// DEFAULT-LLVM: add nsw i32 {{.*}}, 1
// DEFAULT-OGCG: add nsw i32 {{.*}}, 1
- // WRAPV-CIR: cir.unary(inc, {{.*}}) : !s32i
+ // WRAPV-CIR: cir.inc {{.*}} : !s32i
// WRAPV-LLVM: add i32 {{.*}}, 1
// WRAPV-OGCG: add i32 {{.*}}, 1
++a;
- // DEFAULT-CIR: cir.unary(dec, {{.*}}) nsw : !s32i
+ // DEFAULT-CIR: cir.dec nsw {{.*}} : !s32i
// DEFAULT-LLVM: sub nsw i32 {{.*}}, 1
// DEFAULT-OGCG: add nsw i32 {{.*}}, -1
- // WRAPV-CIR: cir.unary(dec, {{.*}}) : !s32i
+ // WRAPV-CIR: cir.dec {{.*}} : !s32i
// WRAPV-LLVM: sub i32 {{.*}}, 1
// WRAPV-OGCG: add i32 {{.*}}, -1
--a;
@@ -84,40 +84,40 @@ void test1(void) {
// PR9350: char pre-increment never overflows.
extern volatile signed char PR9350_char_inc;
- // DEFAULT-CIR: cir.unary(inc, {{.*}}) : !s8i
+ // DEFAULT-CIR: cir.inc {{.*}} : !s8i
// DEFAULT-LLVM: add i8 {{.*}}, 1
// DEFAULT-OGCG: add i8 {{.*}}, 1
- // WRAPV-CIR: cir.unary(inc, {{.*}}) : !s8i
+ // WRAPV-CIR: cir.inc {{.*}} : !s8i
// WRAPV-LLVM: add i8 {{.*}}, 1
// WRAPV-OGCG: add i8 {{.*}}, 1
++PR9350_char_inc;
// PR9350: char pre-decrement never overflows.
extern volatile signed char PR9350_char_dec;
- // DEFAULT-CIR: cir.unary(dec, {{.*}}) : !s8i
+ // DEFAULT-CIR: cir.dec {{.*}} : !s8i
// DEFAULT-LLVM: sub i8 {{.*}}, 1
// DEFAULT-OGCG: add i8 {{.*}}, -1
- // WRAPV-CIR: cir.unary(dec, {{.*}}) : !s8i
+ // WRAPV-CIR: cir.dec {{.*}} : !s8i
// WRAPV-LLVM: sub i8 {{.*}}, 1
// WRAPV-OGCG: add i8 {{.*}}, -1
--PR9350_char_dec;
// PR9350: short pre-increment never overflows.
extern volatile signed short PR9350_short_inc;
- // DEFAULT-CIR: cir.unary(inc, {{.*}}) : !s16i
+ // DEFAULT-CIR: cir.inc {{.*}} : !s16i
// DEFAULT-LLVM: add i16 {{.*}}, 1
// DEFAULT-OGCG: add i16 {{.*}}, 1
- // WRAPV-CIR: cir.unary(inc, {{.*}}) : !s16i
+ // WRAPV-CIR: cir.inc {{.*}} : !s16i
// WRAPV-LLVM: add i16 {{.*}}, 1
// WRAPV-OGCG: add i16 {{.*}}, 1
++PR9350_short_inc;
// PR9350: short pre-decrement never overflows.
extern volatile signed short PR9350_short_dec;
- // DEFAULT-CIR: cir.unary(dec, {{.*}}) : !s16i
+ // DEFAULT-CIR: cir.dec {{.*}} : !s16i
// DEFAULT-LLVM: sub i16 {{.*}}, 1
// DEFAULT-OGCG: add i16 {{.*}}, -1
- // WRAPV-CIR: cir.unary(dec, {{.*}}) : !s16i
+ // WRAPV-CIR: cir.dec {{.*}} : !s16i
// WRAPV-LLVM: sub i16 {{.*}}, 1
// WRAPV-OGCG: add i16 {{.*}}, -1
--PR9350_short_dec;
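
The "never overflows" comments follow from C's integer promotions: the operand of ++/-- on a char or short is promoted to int, the add or subtract of 1 happens in int where it cannot overflow for any char or short value, and the result is converted back, which is why no nsw flag is checked for these cases. Roughly, and only as an illustration:

signed char inc_char_model(signed char c) {
  /* ++c effectively computes this under the usual promotions. */
  return (signed char)((int)c + 1);
}
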
diff --git a/clang/test/CIR/CodeGen/label.c b/clang/test/CIR/CodeGen/label.c
index 93ed6c583b98a..db35168f61ec1 100644
--- a/clang/test/CIR/CodeGen/label.c
+++ b/clang/test/CIR/CodeGen/label.c
@@ -68,7 +68,7 @@ void label_in_if(int cond) {
// CIR: ^bb1:
// CIR: cir.label "labelD"
// CIR: [[LOAD:%.*]] = cir.load align(4) [[COND:%.*]] : !cir.ptr<!s32i>, !s32i
-// CIR: [[INC:%.*]] = cir.unary(inc, %3) nsw : !s32i, !s32i
+// CIR: [[INC:%.*]] = cir.inc nsw %3 : !s32i
// CIR: cir.store align(4) [[INC]], [[COND]] : !s32i, !cir.ptr<!s32i>
// CIR: }
// CIR: cir.return
diff --git a/clang/test/CIR/CodeGen/lambda-static-invoker.cpp b/clang/test/CIR/CodeGen/lambda-static-invoker.cpp
index 2b00feccbc79a..711249496bbba 100644
--- a/clang/test/CIR/CodeGen/lambda-static-invoker.cpp
+++ b/clang/test/CIR/CodeGen/lambda-static-invoker.cpp
@@ -121,10 +121,8 @@ int g3() {
// 1. Use `operator int (*)(int const&)()` to retrieve the fnptr to `__invoke()`.
// CIR: %[[OPERATOR_RESULT:.*]] = cir.call @_ZZ2g3vENK3$_0cvPFiRKiEEv(%[[LAM_ALLOCA]]){{.*}}
-// CIR: %[[PLUS:.*]] = cir.unary(plus, %[[OPERATOR_RESULT]])
-
// 2. Load ptr to `__invoke()`.
-// CIR: cir.store{{.*}} %[[PLUS]], %[[FN_ADDR]]
+// CIR: cir.store{{.*}} %[[OPERATOR_RESULT]], %[[FN_ADDR]]
// CIR: %[[FN:.*]] = cir.load{{.*}} %[[FN_ADDR]]
// CIR: %[[THREE:.*]] = cir.const #cir.int<3> : !s32i
// CIR: cir.store{{.*}} %[[THREE]], %[[REF_TMP1]]
diff --git a/clang/test/CIR/CodeGen/new.cpp b/clang/test/CIR/CodeGen/new.cpp
index 81e513a5d7ac2..671de964e7d15 100644
--- a/clang/test/CIR/CodeGen/new.cpp
+++ b/clang/test/CIR/CodeGen/new.cpp
@@ -551,19 +551,14 @@ void t_new_var_size5(int n) {
auto p = new double[n][2][3];
}
-// NUM_ELEMENTS isn't used in this case because there is no cookie. It isn't
-// used in the allocation size because the allocation size is calculated with
-// the element size and the fixed size dimensions already combined (6 * 8 = 48).
-// We don't DCE NUM_ELEMENTS because it's not a constant, but later
-// optimizations will eliminate it.
+// The allocation size is calculated with the element size and the fixed size
+// dimensions already combined (6 * 8 = 48).
// CHECK: cir.func {{.*}} @_Z15t_new_var_size5i
// CHECK: %[[N:.*]] = cir.load{{.*}} %[[ARG_ALLOCA:.*]]
// CHECK: %[[N_SIZE_T:.*]] = cir.cast integral %[[N]] : !s32i -> !u64i
// CHECK: %[[ELEMENT_SIZE:.*]] = cir.const #cir.int<48> : !u64i
// CHECK: %[[RESULT:.*]], %[[OVERFLOW:.*]] = cir.binop.overflow(mul, %[[N_SIZE_T]], %[[ELEMENT_SIZE]]) : !u64i, (!u64i, !cir.bool)
-// CHECK: %[[NUM_ELEMENTS_MULTIPLIER:.*]] = cir.const #cir.int<6>
-// CHECK: %[[NUM_ELEMENTS:.*]] = cir.mul %[[N_SIZE_T]], %[[NUM_ELEMENTS_MULTIPLIER]] : !u64i
// CHECK: %[[ALL_ONES:.*]] = cir.const #cir.int<18446744073709551615> : !u64i
// CHECK: %[[ALLOC_SIZE:.*]] = cir.select if %[[OVERFLOW]] then %[[ALL_ONES]] else %[[RESULT]] : (!cir.bool, !u64i, !u64i)
// CHECK: %[[PTR:.*]] = cir.call @_Znam(%[[ALLOC_SIZE]]) {allocsize = array<i32: 0>, builtin} : (!u64i {llvm.noundef})
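
The folded element size the test expects comes directly from the fixed array dimensions: 2 * 3 * sizeof(double) = 48 bytes, so only the runtime extent n needs an overflow-checked multiply, with the all-ones value standing in for SIZE_MAX on overflow. A minimal sketch of that computation (illustrative helper name, not taken from the patch):

#include <cstddef>
#include <limits>

std::size_t vla_new_alloc_size(std::size_t n) {
  constexpr std::size_t element_size = 2 * 3 * sizeof(double); // 48
  std::size_t result;
  // Mirrors cir.binop.overflow(mul, ...) plus the cir.select on overflow.
  if (__builtin_mul_overflow(n, element_size, &result))
    return std::numeric_limits<std::size_t>::max();
  return result;
}
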
diff --git a/clang/test/CIR/CodeGen/nrvo.cpp b/clang/test/CIR/CodeGen/nrvo.cpp
index f68573e222606..a3af2d1ff032b 100644
--- a/clang/test/CIR/CodeGen/nrvo.cpp
+++ b/clang/test/CIR/CodeGen/nrvo.cpp
@@ -77,7 +77,7 @@ NonTrivial test_nrvo() {
// CIR: cir.return %[[RET]]
// CIR: } cleanup normal {
// CIR: %[[NRVO_FLAG_VAL:.*]] = cir.load{{.*}} %[[NRVO_FLAG]]
-// CIR: %[[NOT_NRVO_VAL:.*]] = cir.unary(not, %[[NRVO_FLAG_VAL]])
+// CIR: %[[NOT_NRVO_VAL:.*]] = cir.not %[[NRVO_FLAG_VAL]]
// CIR: cir.if %[[NOT_NRVO_VAL]] {
// CIR: cir.call @_ZN10NonTrivialD1Ev(%[[RESULT]])
// CIR: }
diff --git a/clang/test/CIR/CodeGen/pointers.cpp b/clang/test/CIR/CodeGen/pointers.cpp
index b07c1b57c127d..d43c5691ebd49 100644
--- a/clang/test/CIR/CodeGen/pointers.cpp
+++ b/clang/test/CIR/CodeGen/pointers.cpp
@@ -28,7 +28,7 @@ void foo(int *iptr, char *cptr, unsigned ustride) {
iptr - ustride;
// CHECK: %[[#STRIDE:]] = cir.load{{.*}} %{{.+}} : !cir.ptr<!u32i>, !u32i
// CHECK: %[[#SIGNSTRIDE:]] = cir.cast integral %[[#STRIDE]] : !u32i -> !s32i
- // CHECK: %[[#NEGSTRIDE:]] = cir.unary(minus, %[[#SIGNSTRIDE]]) : !s32i, !s32i
+ // CHECK: %[[#NEGSTRIDE:]] = cir.minus %[[#SIGNSTRIDE]] : !s32i
// CHECK: cir.ptr_stride %{{.+}}, %[[#NEGSTRIDE]] : (!cir.ptr<!s32i>, !s32i) -> !cir.ptr<!s32i>
4 + iptr;
diff --git a/clang/test/CIR/CodeGen/size-of-vla.cpp b/clang/test/CIR/CodeGen/size-of-vla.cpp
index 789e97f40440b..c6838155189ef 100644
--- a/clang/test/CIR/CodeGen/size-of-vla.cpp
+++ b/clang/test/CIR/CodeGen/size-of-vla.cpp
@@ -38,9 +38,9 @@ void vla_type_with_element_type_int() {
// CIR: %[[SIZE_ADDR:.*]] = cir.alloca !u64i, !cir.ptr<!u64i>, ["size", init]
// CIR: %[[CONST_10:.*]] = cir.const #cir.int<10> : !u64i
// CIR: cir.store {{.*}} %[[CONST_10]], %[[N_ADDR]] : !u64i, !cir.ptr<!u64i>
-// CIR: %3 = cir.load {{.*}} %[[N_ADDR]] : !cir.ptr<!u64i>, !u64i
+// CIR: %[[TMP_N:.*]] = cir.load {{.*}} %[[N_ADDR]] : !cir.ptr<!u64i>, !u64i
// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !u64i
-// CIR: %[[SIZE:.*]] = cir.mul nuw %[[CONST_4]], %3 : !u64i
+// CIR: %[[SIZE:.*]] = cir.mul nuw %[[CONST_4]], %[[TMP_N]] : !u64i
// CIR: cir.store {{.*}} %[[SIZE]], %[[SIZE_ADDR]] : !u64i, !cir.ptr<!u64i>
// LLVM: %[[N_ADDR:.*]] = alloca i64, i64 1, align 8
diff --git a/clang/test/CIR/CodeGen/static-vars.c b/clang/test/CIR/CodeGen/static-vars.c
index ee4db82c40ade..96ef705d15326 100644
--- a/clang/test/CIR/CodeGen/static-vars.c
+++ b/clang/test/CIR/CodeGen/static-vars.c
@@ -24,7 +24,7 @@ void func1(void) {
j++;
// CHECK-DAG: %[[#V2:]] = cir.get_global @func1.j : !cir.ptr<!s32i>
// CHECK-DAG: %[[#V3:]] = cir.load{{.*}} %[[#V2]] : !cir.ptr<!s32i>, !s32i
- // CHECK-DAG: %[[#V4:]] = cir.unary(inc, %[[#V3]]) nsw : !s32i, !s32i
+ // CHECK-DAG: %[[#V4:]] = cir.inc nsw %[[#V3]] : !s32i
// CHECK-DAG: cir.store{{.*}} %[[#V4]], %[[#V2]] : !s32i, !cir.ptr<!s32i>
}
diff --git a/clang/test/CIR/CodeGen/static-vars.cpp b/clang/test/CIR/CodeGen/static-vars.cpp
index 4f22fc7abc541..4cc7b68baa01f 100644
--- a/clang/test/CIR/CodeGen/static-vars.cpp
+++ b/clang/test/CIR/CodeGen/static-vars.cpp
@@ -57,7 +57,7 @@ void func1(void) {
j++;
// CHECK-DAG: %[[#V2:]] = cir.get_global @_ZZ5func1vE1j : !cir.ptr<!s32i>
// CHECK-DAG: %[[#V3:]] = cir.load{{.*}} %[[#V2]] : !cir.ptr<!s32i>, !s32i
- // CHECK-DAG: %[[#V4:]] = cir.unary(inc, %[[#V3]]) nsw : !s32i, !s32i
+ // CHECK-DAG: %[[#V4:]] = cir.inc nsw %[[#V3]] : !s32i
// CHECK-DAG: cir.store{{.*}} %[[#V4]], %[[#V2]] : !s32i, !cir.ptr<!s32i>
}
diff --git a/clang/test/CIR/CodeGen/throws.cpp b/clang/test/CIR/CodeGen/throws.cpp
index f482fb48b0205..c4afb84f09f95 100644
--- a/clang/test/CIR/CodeGen/throws.cpp
+++ b/clang/test/CIR/CodeGen/throws.cpp
@@ -112,9 +112,7 @@ void paren_expr() { (throw 0, 1 + 2); }
// CIR: cir.throw %[[EXCEPTION_ADDR]] : !cir.ptr<!s32i>, @_ZTIi
// CIR: cir.unreachable
// CIR: ^bb1:
-// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i
-// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i
-// CIR: %[[ADD:.*]] = cir.add nsw %[[CONST_1]], %[[CONST_2]] : !s32i
+// CIR: cir.return
// LLVM: %[[EXCEPTION_ADDR:.*]] = call ptr @__cxa_allocate_exception(i64 4)
// LLVM: store i32 0, ptr %[[EXCEPTION_ADDR]], align 16
diff --git a/clang/test/CIR/CodeGen/try-catch.cpp b/clang/test/CIR/CodeGen/try-catch.cpp
index f930d41c8e1b2..c62320fa783c4 100644
--- a/clang/test/CIR/CodeGen/try-catch.cpp
+++ b/clang/test/CIR/CodeGen/try-catch.cpp
@@ -48,7 +48,7 @@ void try_catch_with_empty_catch_all() {
// CIR: cir.return
// CIR: ^bb1: // no predecessors
// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!s32i>, !s32i
-// CIR: %[[RESULT:.*]] = cir.unary(inc, %[[TMP_A]]) nsw : !s32i, !s32i
+// CIR: %[[RESULT:.*]] = cir.inc nsw %[[TMP_A]] : !s32i
// CIR: cir.store{{.*}} %[[RESULT]], %[[A_ADDR]] : !s32i, !cir.ptr<!s32i>
// CIR: cir.yield
// CIR: }
@@ -90,7 +90,7 @@ void try_catch_with_empty_catch_all_2() {
// CIR: cir.scope {
// CIR: cir.try {
// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!s32i>, !s32i
-// CIR: %[[RESULT:.*]] = cir.unary(inc, %[[TMP_A]]) nsw : !s32i, !s32i
+// CIR: %[[RESULT:.*]] = cir.inc nsw %[[TMP_A]] : !s32i
// CIR: cir.store{{.*}} %[[RESULT]], %[[A_ADDR]] : !s32i, !cir.ptr<!s32i>
// CIR: cir.return
// CIR: }
diff --git a/clang/test/CIR/CodeGen/unary.cpp b/clang/test/CIR/CodeGen/unary.cpp
index cc6f2ade38f3f..4446fe5ceb1a4 100644
--- a/clang/test/CIR/CodeGen/unary.cpp
+++ b/clang/test/CIR/CodeGen/unary.cpp
@@ -13,7 +13,6 @@ unsigned up0() {
// CHECK: cir.func{{.*}} @_Z3up0v() -> (!u32i{{.*}})
// CHECK: %[[A:.*]] = cir.alloca !u32i, !cir.ptr<!u32i>, ["a", init]
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]]
-// CHECK: %[[OUTPUT:.*]] = cir.unary(plus, %[[INPUT]])
// LLVM: define{{.*}} i32 @_Z3up0v()
// LLVM: %[[RV:.*]] = alloca i32, i64 1, align 4
@@ -34,7 +33,7 @@ unsigned um0() {
// CHECK: cir.func{{.*}} @_Z3um0v() -> (!u32i{{.*}})
// CHECK: %[[A:.*]] = cir.alloca !u32i, !cir.ptr<!u32i>, ["a", init]
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]]
-// CHECK: %[[OUTPUT:.*]] = cir.unary(minus, %[[INPUT]])
+// CHECK: %[[OUTPUT:.*]] = cir.minus %[[INPUT]]
// LLVM: define{{.*}} i32 @_Z3um0v()
// LLVM: %[[RV:.*]] = alloca i32, i64 1, align 4
@@ -57,7 +56,7 @@ unsigned un0() {
// CHECK: cir.func{{.*}} @_Z3un0v() -> (!u32i{{.*}})
// CHECK: %[[A:.*]] = cir.alloca !u32i, !cir.ptr<!u32i>, ["a", init]
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]]
-// CHECK: %[[OUTPUT:.*]] = cir.unary(not, %[[INPUT]])
+// CHECK: %[[OUTPUT:.*]] = cir.not %[[INPUT]]
// LLVM: define{{.*}} i32 @_Z3un0v()
// LLVM: %[[RV:.*]] = alloca i32, i64 1, align 4
@@ -83,7 +82,7 @@ int inc0() {
// CHECK: %[[ATMP:.*]] = cir.const #cir.int<1> : !s32i
// CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !s32i
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]]
-// CHECK: %[[INCREMENTED:.*]] = cir.unary(inc, %[[INPUT]]) nsw
+// CHECK: %[[INCREMENTED:.*]] = cir.inc nsw %[[INPUT]]
// CHECK: cir.store{{.*}} %[[INCREMENTED]], %[[A]]
// CHECK: %[[A_TO_OUTPUT:.*]] = cir.load{{.*}} %[[A]]
@@ -111,7 +110,7 @@ int dec0() {
// CHECK: %[[ATMP:.*]] = cir.const #cir.int<1> : !s32i
// CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !s32i
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]]
-// CHECK: %[[DECREMENTED:.*]] = cir.unary(dec, %[[INPUT]]) nsw
+// CHECK: %[[DECREMENTED:.*]] = cir.dec nsw %[[INPUT]]
// CHECK: cir.store{{.*}} %[[DECREMENTED]], %[[A]]
// CHECK: %[[A_TO_OUTPUT:.*]] = cir.load{{.*}} %[[A]]
@@ -139,7 +138,7 @@ int inc1() {
// CHECK: %[[ATMP:.*]] = cir.const #cir.int<1> : !s32i
// CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !s32i
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]]
-// CHECK: %[[INCREMENTED:.*]] = cir.unary(inc, %[[INPUT]]) nsw
+// CHECK: %[[INCREMENTED:.*]] = cir.inc nsw %[[INPUT]]
// CHECK: cir.store{{.*}} %[[INCREMENTED]], %[[A]]
// CHECK: %[[A_TO_OUTPUT:.*]] = cir.load{{.*}} %[[A]]
@@ -167,7 +166,7 @@ int dec1() {
// CHECK: %[[ATMP:.*]] = cir.const #cir.int<1> : !s32i
// CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !s32i
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]]
-// CHECK: %[[DECREMENTED:.*]] = cir.unary(dec, %[[INPUT]]) nsw
+// CHECK: %[[DECREMENTED:.*]] = cir.dec nsw %[[INPUT]]
// CHECK: cir.store{{.*}} %[[DECREMENTED]], %[[A]]
// CHECK: %[[A_TO_OUTPUT:.*]] = cir.load{{.*}} %[[A]]
@@ -197,7 +196,7 @@ int inc2() {
// CHECK: %[[ATMP:.*]] = cir.const #cir.int<1> : !s32i
// CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !s32i
// CHECK: %[[ATOB:.*]] = cir.load{{.*}} %[[A]]
-// CHECK: %[[INCREMENTED:.*]] = cir.unary(inc, %[[ATOB]]) nsw
+// CHECK: %[[INCREMENTED:.*]] = cir.inc nsw %[[ATOB]]
// CHECK: cir.store{{.*}} %[[INCREMENTED]], %[[A]]
// CHECK: cir.store{{.*}} %[[ATOB]], %[[B]]
// CHECK: %[[B_TO_OUTPUT:.*]] = cir.load{{.*}} %[[B]]
@@ -231,7 +230,6 @@ float fpPlus() {
// CHECK: cir.func{{.*}} @_Z6fpPlusv() -> (!cir.float{{.*}})
// CHECK: %[[A:.*]] = cir.alloca !cir.float, !cir.ptr<!cir.float>, ["a", init]
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]]
-// CHECK: %[[OUTPUT:.*]] = cir.unary(plus, %[[INPUT]])
// LLVM: define{{.*}} float @_Z6fpPlusv()
// LLVM: %[[RV:.*]] = alloca float, i64 1, align 4
@@ -252,7 +250,7 @@ float fpMinus() {
// CHECK: cir.func{{.*}} @_Z7fpMinusv() -> (!cir.float{{.*}})
// CHECK: %[[A:.*]] = cir.alloca !cir.float, !cir.ptr<!cir.float>, ["a", init]
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]]
-// CHECK: %[[OUTPUT:.*]] = cir.unary(minus, %[[INPUT]])
+// CHECK: %[[OUTPUT:.*]] = cir.minus %[[INPUT]]
// LLVM: define{{.*}} float @_Z7fpMinusv()
// LLVM: %[[RV:.*]] = alloca float, i64 1, align 4
@@ -277,7 +275,7 @@ float fpPreInc() {
// CHECK: %[[ATMP:.*]] = cir.const #cir.fp<1.000000e+00> : !cir.float
// CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !cir.float
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]]
-// CHECK: %[[INCREMENTED:.*]] = cir.unary(inc, %[[INPUT]])
+// CHECK: %[[INCREMENTED:.*]] = cir.inc %[[INPUT]]
// LLVM: define{{.*}} float @_Z8fpPreIncv()
// LLVM: %[[RV:.*]] = alloca float, i64 1, align 4
@@ -302,7 +300,7 @@ float fpPreDec() {
// CHECK: %[[ATMP:.*]] = cir.const #cir.fp<1.000000e+00> : !cir.float
// CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !cir.float
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]]
-// CHECK: %[[DECREMENTED:.*]] = cir.unary(dec, %[[INPUT]])
+// CHECK: %[[DECREMENTED:.*]] = cir.dec %[[INPUT]]
// LLVM: define{{.*}} float @_Z8fpPreDecv()
// LLVM: %[[RV:.*]] = alloca float, i64 1, align 4
@@ -327,7 +325,7 @@ float fpPostInc() {
// CHECK: %[[ATMP:.*]] = cir.const #cir.fp<1.000000e+00> : !cir.float
// CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !cir.float
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]]
-// CHECK: %[[INCREMENTED:.*]] = cir.unary(inc, %[[INPUT]])
+// CHECK: %[[INCREMENTED:.*]] = cir.inc %[[INPUT]]
// LLVM: define{{.*}} float @_Z9fpPostIncv()
// LLVM: %[[RV:.*]] = alloca float, i64 1, align 4
@@ -352,7 +350,7 @@ float fpPostDec() {
// CHECK: %[[ATMP:.*]] = cir.const #cir.fp<1.000000e+00> : !cir.float
// CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !cir.float
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[A]]
-// CHECK: %[[DECREMENTED:.*]] = cir.unary(dec, %[[INPUT]])
+// CHECK: %[[DECREMENTED:.*]] = cir.dec %[[INPUT]]
// LLVM: define{{.*}} float @_Z9fpPostDecv()
// LLVM: %[[RV:.*]] = alloca float, i64 1, align 4
@@ -380,7 +378,7 @@ float fpPostInc2() {
// CHECK: %[[ATMP:.*]] = cir.const #cir.fp<1.000000e+00> : !cir.float
// CHECK: cir.store{{.*}} %[[ATMP]], %[[A]] : !cir.float
// CHECK: %[[ATOB:.*]] = cir.load{{.*}} %[[A]]
-// CHECK: %[[INCREMENTED:.*]] = cir.unary(inc, %[[ATOB]])
+// CHECK: %[[INCREMENTED:.*]] = cir.inc %[[ATOB]]
// CHECK: cir.store{{.*}} %[[INCREMENTED]], %[[A]]
// CHECK: cir.store{{.*}} %[[ATOB]], %[[B]]
// CHECK: %[[B_TO_OUTPUT:.*]] = cir.load{{.*}} %[[B]]
@@ -411,18 +409,17 @@ void chars(char c) {
int c1 = +c;
// CHECK: %[[PROMO:.*]] = cir.cast integral %{{.+}} : !s8i -> !s32i
- // CHECK: cir.unary(plus, %[[PROMO]]) : !s32i, !s32i
int c2 = -c;
// CHECK: %[[PROMO:.*]] = cir.cast integral %{{.+}} : !s8i -> !s32i
- // CHECK: cir.unary(minus, %[[PROMO]]) nsw : !s32i, !s32i
+ // CHECK: cir.minus nsw %[[PROMO]] : !s32i
// Chars can go through some integer promotion codegen paths even when not promoted.
// These should not have nsw attributes because the intermediate promotion makes the
// overflow defined behavior.
- ++c; // CHECK: cir.unary(inc, %{{.+}}) : !s8i, !s8i
- --c; // CHECK: cir.unary(dec, %{{.+}}) : !s8i, !s8i
- c++; // CHECK: cir.unary(inc, %{{.+}}) : !s8i, !s8i
- c--; // CHECK: cir.unary(dec, %{{.+}}) : !s8i, !s8i
+ ++c; // CHECK: cir.inc %{{.+}} : !s8i
+ --c; // CHECK: cir.dec %{{.+}} : !s8i
+ c++; // CHECK: cir.inc %{{.+}} : !s8i
+ c--; // CHECK: cir.dec %{{.+}} : !s8i
}
_Float16 fp16UPlus(_Float16 f) {
@@ -432,8 +429,7 @@ _Float16 fp16UPlus(_Float16 f) {
// CHECK: cir.func{{.*}} @_Z9fp16UPlusDF16_({{.*}}) -> (!cir.f16{{.*}})
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[F:.*]]
// CHECK: %[[PROMOTED:.*]] = cir.cast floating %[[INPUT]] : !cir.f16 -> !cir.float
-// CHECK: %[[RESULT:.*]] = cir.unary(plus, %[[PROMOTED]])
-// CHECK: %[[UNPROMOTED:.*]] = cir.cast floating %[[RESULT]] : !cir.float -> !cir.f16
+// CHECK: %[[UNPROMOTED:.*]] = cir.cast floating %[[PROMOTED]] : !cir.float -> !cir.f16
// LLVM: define{{.*}} half @_Z9fp16UPlusDF16_({{.*}})
// LLVM: %[[F_LOAD:.*]] = load half, ptr %{{.*}}, align 2
@@ -452,7 +448,7 @@ _Float16 fp16UMinus(_Float16 f) {
// CHECK: cir.func{{.*}} @_Z10fp16UMinusDF16_({{.*}}) -> (!cir.f16{{.*}})
// CHECK: %[[INPUT:.*]] = cir.load{{.*}} %[[F:.*]]
// CHECK: %[[PROMOTED:.*]] = cir.cast floating %[[INPUT]] : !cir.f16 -> !cir.float
-// CHECK: %[[RESULT:.*]] = cir.unary(minus, %[[PROMOTED]])
+// CHECK: %[[RESULT:.*]] = cir.minus %[[PROMOTED]]
// CHECK: %[[UNPROMOTED:.*]] = cir.cast floating %[[RESULT]] : !cir.float -> !cir.f16
// LLVM: define{{.*}} half @_Z10fp16UMinusDF16_({{.*}})
@@ -483,24 +479,24 @@ void test_logical_not() {
// CHECK: cir.func{{.*}} @_Z16test_logical_notv()
// CHECK: %[[A:.*]] = cir.load{{.*}} %[[A_ADDR:.*]] : !cir.ptr<!s32i>, !s32i
// CHECK: %[[A_BOOL:.*]] = cir.cast int_to_bool %[[A]] : !s32i -> !cir.bool
-// CHECK: %[[A_NOT:.*]] = cir.unary(not, %[[A_BOOL]]) : !cir.bool, !cir.bool
+// CHECK: %[[A_NOT:.*]] = cir.not %[[A_BOOL]] : !cir.bool
// CHECK: %[[A_CAST:.*]] = cir.cast bool_to_int %[[A_NOT]] : !cir.bool -> !s32i
// CHECK: cir.store{{.*}} %[[A_CAST]], %[[A_ADDR]] : !s32i, !cir.ptr<!s32i>
// CHECK: %[[B:.*]] = cir.load{{.*}} %[[B_ADDR:.*]] : !cir.ptr<!cir.bool>, !cir.bool
-// CHECK: %[[B_NOT:.*]] = cir.unary(not, %[[B]]) : !cir.bool, !cir.bool
+// CHECK: %[[B_NOT:.*]] = cir.not %[[B]] : !cir.bool
// CHECK: cir.store{{.*}} %[[B_NOT]], %[[B_ADDR]] : !cir.bool, !cir.ptr<!cir.bool>
// CHECK: %[[C:.*]] = cir.load{{.*}} %[[C_ADDR:.*]] : !cir.ptr<!cir.float>, !cir.float
// CHECK: %[[C_BOOL:.*]] = cir.cast float_to_bool %[[C]] : !cir.float -> !cir.bool
-// CHECK: %[[C_NOT:.*]] = cir.unary(not, %[[C_BOOL]]) : !cir.bool, !cir.bool
+// CHECK: %[[C_NOT:.*]] = cir.not %[[C_BOOL]] : !cir.bool
// CHECK: %[[C_CAST:.*]] = cir.cast bool_to_float %[[C_NOT]] : !cir.bool -> !cir.float
// CHECK: cir.store{{.*}} %[[C_CAST]], %[[C_ADDR]] : !cir.float, !cir.ptr<!cir.float>
// CHECK: %[[P:.*]] = cir.load{{.*}} %[[P_ADDR:.*]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
// CHECK: %[[P_BOOL:.*]] = cir.cast ptr_to_bool %[[P]] : !cir.ptr<!s32i> -> !cir.bool
-// CHECK: %[[P_NOT:.*]] = cir.unary(not, %[[P_BOOL]]) : !cir.bool, !cir.bool
+// CHECK: %[[P_NOT:.*]] = cir.not %[[P_BOOL]] : !cir.bool
// CHECK: cir.store{{.*}} %[[P_NOT]], %[[B_ADDR]] : !cir.bool, !cir.ptr<!cir.bool>
// CHECK: %[[D:.*]] = cir.load{{.*}} %[[D_ADDR:.*]] : !cir.ptr<!cir.double>, !cir.double
// CHECK: %[[D_BOOL:.*]] = cir.cast float_to_bool %[[D]] : !cir.double -> !cir.bool
-// CHECK: %[[D_NOT:.*]] = cir.unary(not, %[[D_BOOL]]) : !cir.bool, !cir.bool
+// CHECK: %[[D_NOT:.*]] = cir.not %[[D_BOOL]] : !cir.bool
// CHECK: cir.store{{.*}} %[[D_NOT]], %[[B_ADDR]] : !cir.bool, !cir.ptr<!cir.bool>
// LLVM: define{{.*}} void @_Z16test_logical_notv()
@@ -567,9 +563,7 @@ void f16NestedUPlus() {
// CHECK: %[[B_ADDR:.*]] = cir.alloca !cir.f16, !cir.ptr<!cir.f16>, ["b", init]
// CHECK: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.f16>, !cir.f16
// CHECK: %[[A_F32:.*]] = cir.cast floating %[[TMP_A]] : !cir.f16 -> !cir.float
-// CHECK: %[[A_PLUS:.*]] = cir.unary(plus, %[[A_F32]]) : !cir.float, !cir.float
-// CHECK: %[[RESULT_F32:.*]] = cir.unary(plus, %[[A_PLUS]]) : !cir.float, !cir.float
-// CHECK: %[[RESULT:.*]] = cir.cast floating %[[RESULT_F32]] : !cir.float -> !cir.f16
+// CHECK: %[[RESULT:.*]] = cir.cast floating %[[A_F32]] : !cir.float -> !cir.f16
// CHECK: cir.store{{.*}} %[[RESULT]], %[[B_ADDR]] : !cir.f16, !cir.ptr<!cir.f16>
// LLVM: define{{.*}} void @_Z14f16NestedUPlusv()
@@ -598,8 +592,8 @@ void f16NestedUMinus() {
// CHECK: %[[B_ADDR:.*]] = cir.alloca !cir.f16, !cir.ptr<!cir.f16>, ["b", init]
// CHECK: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr<!cir.f16>, !cir.f16
// CHECK: %[[A_F32:.*]] = cir.cast floating %[[TMP_A]] : !cir.f16 -> !cir.float
-// CHECK: %[[A_MINUS:.*]] = cir.unary(minus, %[[A_F32]]) : !cir.float, !cir.float
-// CHECK: %[[RESULT_F32:.*]] = cir.unary(minus, %[[A_MINUS]]) : !cir.float, !cir.float
+// CHECK: %[[A_MINUS:.*]] = cir.minus %[[A_F32]] : !cir.float
+// CHECK: %[[RESULT_F32:.*]] = cir.minus %[[A_MINUS]] : !cir.float
// CHECK: %[[RESULT:.*]] = cir.cast floating %[[RESULT_F32]] : !cir.float -> !cir.f16
// CHECK: cir.store{{.*}} %[[RESULT]], %[[B_ADDR]] : !cir.f16, !cir.ptr<!cir.f16>
diff --git a/clang/test/CIR/CodeGen/vector-ext.cpp b/clang/test/CIR/CodeGen/vector-ext.cpp
index e7d6b4974c6ae..74de78ca8a987 100644
--- a/clang/test/CIR/CodeGen/vector-ext.cpp
+++ b/clang/test/CIR/CodeGen/vector-ext.cpp
@@ -332,13 +332,12 @@ void foo8() {
// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
// CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
// CIR: %[[TMP1:.*]] = cir.load{{.*}} %[[VEC]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
-// CIR: %[[PLUS:.*]] = cir.unary(plus, %[[TMP1]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
-// CIR: cir.store{{.*}} %[[PLUS]], %[[PLUS_RES]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+// CIR: cir.store{{.*}} %[[TMP1]], %[[PLUS_RES]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
// CIR: %[[TMP2:.*]] = cir.load{{.*}} %[[VEC]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
-// CIR: %[[MINUS:.*]] = cir.unary(minus, %[[TMP2]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+// CIR: %[[MINUS:.*]] = cir.minus %[[TMP2]] : !cir.vector<4 x !s32i>
// CIR: cir.store{{.*}} %[[MINUS]], %[[MINUS_RES]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
// CIR: %[[TMP3:.*]] = cir.load{{.*}} %[[VEC]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
-// CIR: %[[NOT:.*]] = cir.unary(not, %[[TMP3]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+// CIR: %[[NOT:.*]] = cir.not %[[TMP3]] : !cir.vector<4 x !s32i>
// CIR: cir.store{{.*}} %[[NOT]], %[[NOT_RES]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
// LLVM: %[[VEC:.*]] = alloca <4 x i32>, i64 1, align 16
diff --git a/clang/test/CIR/CodeGen/vector.cpp b/clang/test/CIR/CodeGen/vector.cpp
index 8a804ad6fa2f1..0edef8dee8702 100644
--- a/clang/test/CIR/CodeGen/vector.cpp
+++ b/clang/test/CIR/CodeGen/vector.cpp
@@ -319,13 +319,12 @@ void foo8() {
// CIR-SAME: #cir.int<3> : !s32i, #cir.int<4> : !s32i]> : !cir.vector<4 x !s32i>
// CIR: cir.store{{.*}} %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
// CIR: %[[TMP1:.*]] = cir.load{{.*}} %[[VEC]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
-// CIR: %[[PLUS:.*]] = cir.unary(plus, %[[TMP1]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
-// CIR: cir.store{{.*}} %[[PLUS]], %[[PLUS_RES]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
+// CIR: cir.store{{.*}} %[[TMP1]], %[[PLUS_RES]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
// CIR: %[[TMP2:.*]] = cir.load{{.*}} %[[VEC]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
-// CIR: %[[MINUS:.*]] = cir.unary(minus, %[[TMP2]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+// CIR: %[[MINUS:.*]] = cir.minus %[[TMP2]] : !cir.vector<4 x !s32i>
// CIR: cir.store{{.*}} %[[MINUS]], %[[MINUS_RES]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
// CIR: %[[TMP3:.*]] = cir.load{{.*}} %[[VEC]] : !cir.ptr<!cir.vector<4 x !s32i>>, !cir.vector<4 x !s32i>
-// CIR: %[[NOT:.*]] = cir.unary(not, %[[TMP3]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+// CIR: %[[NOT:.*]] = cir.not %[[TMP3]] : !cir.vector<4 x !s32i>
// CIR: cir.store{{.*}} %[[NOT]], %[[NOT_RES]] : !cir.vector<4 x !s32i>, !cir.ptr<!cir.vector<4 x !s32i>>
// LLVM: %[[VEC:.*]] = alloca <4 x i32>, i64 1, align 16
diff --git a/clang/test/CIR/CodeGen/vla.c b/clang/test/CIR/CodeGen/vla.c
index 1489db643125a..4a48ec7252c0c 100644
--- a/clang/test/CIR/CodeGen/vla.c
+++ b/clang/test/CIR/CodeGen/vla.c
@@ -168,7 +168,7 @@ void f3(unsigned len) {
// CIR: cir.scope {
// CIR: cir.while {
// CIR: %[[CUR_I:.*]] = cir.load{{.*}} %[[I]]
-// CIR: %[[NEXT:.*]] = cir.unary(inc, %[[CUR_I]])
+// CIR: %[[NEXT:.*]] = cir.inc %[[CUR_I]]
// CIR: cir.store{{.*}} %[[NEXT]], %[[I]]
// CIR: %[[LEN2:.*]] = cir.load{{.*}} %[[LEN_ADDR]]
// CIR: %[[CMP:.*]] = cir.cmp lt %[[NEXT]], %[[LEN2]]
@@ -359,9 +359,9 @@ void vla_subscript_expr() {
// CIR: cir.store {{.*}} %[[A_VAL]], %[[COMPOUND_ADDR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>
// CIR: %[[TMP_COMPOUND:.*]] = cir.load {{.*}} %[[COMPOUND_ADDR]] : !cir.ptr<!cir.ptr<!cir.ptr<!s32i>>>, !cir.ptr<!cir.ptr<!s32i>>
// CIR: %[[COMPOUND_PTR:.*]] = cir.ptr_stride %[[TMP_COMPOUND]], %[[CONST_0]] : (!cir.ptr<!cir.ptr<!s32i>>, !s32i) -> !cir.ptr<!cir.ptr<!s32i>>
-// CIR: %[[TMP_COMPOUND:.*]] = cir.load {{.*}} %10 : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
+// CIR: %[[TMP_COMPOUND:.*]] = cir.load {{.*}} %[[COMPOUND_PTR]] : !cir.ptr<!cir.ptr<!s32i>>, !cir.ptr<!s32i>
// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !u64i
-// CIR: %[[VLA_IDX:.*]] = cir.mul nsw %[[CONST_1]], %7 : !u64i
+// CIR: %[[VLA_IDX:.*]] = cir.mul nsw %[[CONST_1]], %[[TMP_N]] : !u64i
// CIR: %[[VLA_A_PTR:.*]] = cir.ptr_stride %[[TMP_COMPOUND]], %[[VLA_IDX]] : (!cir.ptr<!s32i>, !u64i) -> !cir.ptr<!s32i>
// CIR: %[[ELEM_5_PTR:.*]] = cir.ptr_stride %[[VLA_A_PTR]], %[[CONST_5]] : (!cir.ptr<!s32i>, !s32i) -> !cir.ptr<!s32i>
// CIR: cir.store {{.*}} %[[CONST_0_VAL]], %[[ELEM_5_PTR]] : !s32i, !cir.ptr<!s32i>
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512bw-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512bw-builtins.c
index f8d7d3cbf279c..969e0cec9270b 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx512bw-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512bw-builtins.c
@@ -209,7 +209,7 @@ __mmask32 test_kandn_mask32(__mmask32 A, __mmask32 B) {
// CIR-LABEL: _kandn_mask32
// CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
- // CIR: cir.unary(not, {{.*}}) : !cir.vector<32 x !cir.int<s, 1>>
+ // CIR: cir.not {{.*}} : !cir.vector<32 x !cir.int<s, 1>>
// CIR: cir.and {{.*}}, {{.*}} : !cir.vector<32 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<s, 1>> -> !u32i
@@ -233,7 +233,7 @@ __mmask64 test_kandn_mask64(__mmask64 A, __mmask64 B) {
// CIR-LABEL: _kandn_mask64
// CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
- // CIR: cir.unary(not, {{.*}}) : !cir.vector<64 x !cir.int<s, 1>>
+ // CIR: cir.not {{.*}} : !cir.vector<64 x !cir.int<s, 1>>
// CIR: cir.and {{.*}}, {{.*}} : !cir.vector<64 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<s, 1>> -> !u64i
@@ -341,7 +341,7 @@ __mmask32 test_kxnor_mask32(__mmask32 A, __mmask32 B) {
// CIR-LABEL: _kxnor_mask32
// CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
- // CIR: cir.unary(not, {{.*}}) : !cir.vector<32 x !cir.int<s, 1>>
+ // CIR: cir.not {{.*}} : !cir.vector<32 x !cir.int<s, 1>>
// CIR: cir.xor {{.*}}, {{.*}} : !cir.vector<32 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<s, 1>> -> !u32i
@@ -366,7 +366,7 @@ __mmask64 test_kxnor_mask64(__mmask64 A, __mmask64 B) {
// CIR-LABEL: _kxnor_mask64
// CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
- // CIR: cir.unary(not, {{.*}}) : !cir.vector<64 x !cir.int<s, 1>>
+ // CIR: cir.not {{.*}} : !cir.vector<64 x !cir.int<s, 1>>
// CIR: cir.xor {{.*}}, {{.*}} : !cir.vector<64 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<s, 1>> -> !u64i
@@ -391,7 +391,7 @@ __mmask64 test_kxnor_mask64(__mmask64 A, __mmask64 B) {
__mmask32 test_knot_mask32(__mmask32 A) {
// CIR-LABEL: _knot_mask32
// CIR: cir.cast bitcast {{.*}} : !u32i -> !cir.vector<32 x !cir.int<s, 1>>
- // CIR: cir.unary(not, {{.*}}) : !cir.vector<32 x !cir.int<s, 1>>
+ // CIR: cir.not {{.*}} : !cir.vector<32 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !cir.vector<32 x !cir.int<s, 1>> -> !u32i
// LLVM-LABEL: _knot_mask32
@@ -409,7 +409,7 @@ __mmask32 test_knot_mask32(__mmask32 A) {
__mmask64 test_knot_mask64(__mmask64 A) {
// CIR-LABEL: _knot_mask64
// CIR: cir.cast bitcast {{.*}} : !u64i -> !cir.vector<64 x !cir.int<s, 1>>
- // CIR: cir.unary(not, {{.*}}) : !cir.vector<64 x !cir.int<s, 1>>
+ // CIR: cir.not {{.*}} : !cir.vector<64 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !cir.vector<64 x !cir.int<s, 1>> -> !u64i
// LLVM-LABEL: _knot_mask64
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512dq-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512dq-builtins.c
index e96e8b8cf59a4..ce264e871c820 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx512dq-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512dq-builtins.c
@@ -96,7 +96,7 @@ __mmask8 test_kandn_mask8(__mmask8 A, __mmask8 B) {
// CIR-LABEL: _kandn_mask8
// CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
- // CIR: cir.unary(not, {{.*}}) : !cir.vector<8 x !cir.int<s, 1>>
+ // CIR: cir.not {{.*}} : !cir.vector<8 x !cir.int<s, 1>>
// CIR: cir.and {{.*}}, {{.*}} : !cir.vector<8 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<s, 1>> -> !u8i
@@ -163,7 +163,7 @@ __mmask8 test_kxnor_mask8(__mmask8 A, __mmask8 B) {
// CIR-LABEL: _kxnor_mask8
// CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
- // CIR: cir.unary(not, {{.*}}) : !cir.vector<8 x !cir.int<s, 1>>
+ // CIR: cir.not {{.*}} : !cir.vector<8 x !cir.int<s, 1>>
// CIR: cir.xor {{.*}}, {{.*}} : !cir.vector<8 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<s, 1>> -> !u8i
@@ -187,7 +187,7 @@ __mmask8 test_kxnor_mask8(__mmask8 A, __mmask8 B) {
__mmask8 test_knot_mask8(__mmask8 A) {
// CIR-LABEL: _knot_mask8
// CIR: cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int<s, 1>>
- // CIR: cir.unary(not, {{.*}}) : !cir.vector<8 x !cir.int<s, 1>>
+ // CIR: cir.not {{.*}} : !cir.vector<8 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !cir.vector<8 x !cir.int<s, 1>> -> !u8i
// LLVM-LABEL: _knot_mask8
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
index ee47464194948..819b0d3b2e17e 100644
--- a/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512f-builtins.c
@@ -199,7 +199,7 @@ __mmask16 test_mm512_kandn(__mmask16 A, __mmask16 B) {
// CIR-LABEL: _mm512_kandn
// CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
- // CIR: cir.unary(not, {{.*}}) : !cir.vector<16 x !cir.int<s, 1>>
+ // CIR: cir.not {{.*}} : !cir.vector<16 x !cir.int<s, 1>>
// CIR: cir.and {{.*}}, {{.*}} : !cir.vector<16 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<s, 1>> -> !u16i
@@ -244,7 +244,7 @@ __mmask16 test_mm512_kxnor(__mmask16 A, __mmask16 B) {
// CIR-LABEL: _mm512_kxnor
// CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
- // CIR: cir.unary(not, {{.*}}) : !cir.vector<16 x !cir.int<s, 1>>
+ // CIR: cir.not {{.*}} : !cir.vector<16 x !cir.int<s, 1>>
// CIR: cir.xor {{.*}}, {{.*}} : !cir.vector<16 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<s, 1>> -> !u16i
@@ -288,7 +288,7 @@ __mmask16 test_mm512_kxor(__mmask16 A, __mmask16 B) {
__mmask16 test_mm512_knot(__mmask16 A) {
// CIR-LABEL: _mm512_knot
// CIR: cir.cast bitcast {{.*}} : !u16i -> !cir.vector<16 x !cir.int<s, 1>>
- // CIR: cir.unary(not, {{.*}}) : !cir.vector<16 x !cir.int<s, 1>>
+ // CIR: cir.not {{.*}} : !cir.vector<16 x !cir.int<s, 1>>
// CIR: cir.cast bitcast {{.*}} : !cir.vector<16 x !cir.int<s, 1>> -> !u16i
// LLVM-LABEL: _mm512_knot
diff --git a/clang/test/CIR/CodeGenBuiltins/builtin-fcmp-sse.c b/clang/test/CIR/CodeGenBuiltins/builtin-fcmp-sse.c
index aa7f8d7c6a950..cc14f2e2aa405 100644
--- a/clang/test/CIR/CodeGenBuiltins/builtin-fcmp-sse.c
+++ b/clang/test/CIR/CodeGenBuiltins/builtin-fcmp-sse.c
@@ -19,7 +19,7 @@ __m128 test_cmpnleps(__m128 A, __m128 B) {
// CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
// CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
// CIR: %[[VEC_0:.*]] = cir.vec.cmp(le, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
- // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+ // CIR: %[[UNARY_0:.*]] = cir.not %[[VEC_0]] : !cir.vector<4 x !s32i>
// CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<4 x !s32i> -> !cir.vector<4 x !cir.float>
// CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
// CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
@@ -70,7 +70,7 @@ __m128d test_cmpnlepd(__m128d A, __m128d B) {
// CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
// CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
// CIR: %[[VEC_0:.*]] = cir.vec.cmp(le, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<2 x !cir.double>, !cir.vector<2 x !s64i>
- // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>
+ // CIR: %[[UNARY_0:.*]] = cir.not %[[VEC_0]] : !cir.vector<2 x !s64i>
// CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<2 x !s64i> -> !cir.vector<2 x !cir.double>
// CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>
// CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
@@ -121,7 +121,7 @@ __m128 test_cmpnltps(__m128 A, __m128 B) {
// CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
// CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
// CIR: %[[VEC_0:.*]] = cir.vec.cmp(lt, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<4 x !cir.float>, !cir.vector<4 x !s32i>
- // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<4 x !s32i>, !cir.vector<4 x !s32i>
+ // CIR: %[[UNARY_0:.*]] = cir.not %[[VEC_0]] : !cir.vector<4 x !s32i>
// CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<4 x !s32i> -> !cir.vector<4 x !cir.float>
// CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<4 x !cir.float>, !cir.ptr<!cir.vector<4 x !cir.float>>
// CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr<!cir.vector<4 x !cir.float>>, !cir.vector<4 x !cir.float>
@@ -172,7 +172,7 @@ __m128d test_cmpnltpd(__m128d A, __m128d B) {
// CIR: %[[LOAD_0:.*]] = cir.load align(16) %[[ALLOCA_0]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
// CIR: %[[LOAD_1:.*]] = cir.load align(16) %[[ALLOCA_1]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
// CIR: %[[VEC_0:.*]] = cir.vec.cmp(lt, %[[LOAD_0]], %[[LOAD_1]]) : !cir.vector<2 x !cir.double>, !cir.vector<2 x !s64i>
- // CIR: %[[UNARY_0:.*]] = cir.unary(not, %[[VEC_0]]) : !cir.vector<2 x !s64i>, !cir.vector<2 x !s64i>
+ // CIR: %[[UNARY_0:.*]] = cir.not %[[VEC_0]] : !cir.vector<2 x !s64i>
// CIR: %[[CAST_0:.*]] = cir.cast bitcast %[[UNARY_0]] : !cir.vector<2 x !s64i> -> !cir.vector<2 x !cir.double>
// CIR: cir.store %[[CAST_0]], %[[ALLOCA_2]] : !cir.vector<2 x !cir.double>, !cir.ptr<!cir.vector<2 x !cir.double>>
// CIR: %[[LOAD_2:.*]] = cir.load %[[ALLOCA_2]] : !cir.ptr<!cir.vector<2 x !cir.double>>, !cir.vector<2 x !cir.double>
diff --git a/clang/test/CIR/CodeGenBuiltins/builtin-types-compatible.c b/clang/test/CIR/CodeGenBuiltins/builtin-types-compatible.c
index dcf5fd4246481..85135ac66a5b3 100644
--- a/clang/test/CIR/CodeGenBuiltins/builtin-types-compatible.c
+++ b/clang/test/CIR/CodeGenBuiltins/builtin-types-compatible.c
@@ -30,7 +30,7 @@ int test_convert_bool_to_int() {
// CIR: cir.func {{.*}} @test_convert_bool_to_int()
// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s32i
// CIR: %[[BOOL:.*]] = cir.cast int_to_bool %[[ONE]] : !s32i -> !cir.bool
-// CIR: %[[NOT:.*]] = cir.unary(not, %[[BOOL]]) : !cir.bool, !cir.bool
+// CIR: %[[NOT:.*]] = cir.not %[[BOOL]] : !cir.bool
// CIR: cir.if %[[NOT]] {
// CIR: %[[NEG_ONE:.*]] = cir.const #cir.int<-1> : !s32i
// CIR: cir.store %[[NEG_ONE]], %[[RETVAL:.*]]
diff --git a/clang/test/CIR/CodeGenOpenACC/atomic-capture.cpp b/clang/test/CIR/CodeGenOpenACC/atomic-capture.cpp
index 362b54bfcc80c..4b3759e749f1d 100644
--- a/clang/test/CIR/CodeGenOpenACC/atomic-capture.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/atomic-capture.cpp
@@ -30,7 +30,7 @@ void use(int x, int v, float f, HasOps ops) {
// CHECK-NEXT: cir.store %[[X_VAR]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
- // CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[X_VAR_LOAD]]) nsw : !s32i, !s32i
+ // CHECK-NEXT: %[[INC:.*]] = cir.inc nsw %[[X_VAR_LOAD]] : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[INC]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
@@ -47,7 +47,7 @@ void use(int x, int v, float f, HasOps ops) {
// CHECK-NEXT: cir.store %[[X_VAR]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
- // CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[X_VAR_LOAD]]) nsw : !s32i, !s32i
+ // CHECK-NEXT: %[[INC:.*]] = cir.inc nsw %[[X_VAR_LOAD]] : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[INC]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
@@ -66,7 +66,7 @@ void use(int x, int v, float f, HasOps ops) {
// CHECK-NEXT: cir.store %[[X_VAR]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
- // CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[X_VAR_LOAD]]) nsw : !s32i, !s32i
+ // CHECK-NEXT: %[[DEC:.*]] = cir.dec nsw %[[X_VAR_LOAD]] : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[DEC]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
@@ -83,7 +83,7 @@ void use(int x, int v, float f, HasOps ops) {
// CHECK-NEXT: cir.store %[[X_VAR]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
- // CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[X_VAR_LOAD]]) nsw : !s32i, !s32i
+ // CHECK-NEXT: %[[DEC:.*]] = cir.dec nsw %[[X_VAR_LOAD]] : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[DEC]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
@@ -346,7 +346,7 @@ void use(int x, int v, float f, HasOps ops) {
// CHECK-NEXT: cir.store %[[X_VAR]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
- // CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[X_VAR_LOAD]]) nsw : !s32i, !s32i
+ // CHECK-NEXT: %[[INC:.*]] = cir.inc nsw %[[X_VAR_LOAD]] : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[INC]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
@@ -367,7 +367,7 @@ void use(int x, int v, float f, HasOps ops) {
// CHECK-NEXT: cir.store %[[X_VAR]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
- // CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[X_VAR_LOAD]]) nsw : !s32i, !s32i
+ // CHECK-NEXT: %[[INC:.*]] = cir.inc nsw %[[X_VAR_LOAD]] : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[INC]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
@@ -387,7 +387,7 @@ void use(int x, int v, float f, HasOps ops) {
// CHECK-NEXT: cir.store %[[X_VAR]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
- // CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[X_VAR_LOAD]]) nsw : !s32i, !s32i
+ // CHECK-NEXT: %[[INC:.*]] = cir.inc nsw %[[X_VAR_LOAD]] : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[INC]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
@@ -408,7 +408,7 @@ void use(int x, int v, float f, HasOps ops) {
// CHECK-NEXT: cir.store %[[X_VAR]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
- // CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[X_VAR_LOAD]]) nsw : !s32i, !s32i
+ // CHECK-NEXT: %[[INC:.*]] = cir.inc nsw %[[X_VAR_LOAD]] : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[INC]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
@@ -430,7 +430,7 @@ void use(int x, int v, float f, HasOps ops) {
// CHECK-NEXT: cir.store %[[X_VAR]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
- // CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[X_VAR_LOAD]]) nsw : !s32i, !s32i
+ // CHECK-NEXT: %[[DEC:.*]] = cir.dec nsw %[[X_VAR_LOAD]] : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[DEC]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
@@ -451,7 +451,7 @@ void use(int x, int v, float f, HasOps ops) {
// CHECK-NEXT: cir.store %[[X_VAR]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
- // CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[X_VAR_LOAD]]) nsw : !s32i, !s32i
+ // CHECK-NEXT: %[[DEC:.*]] = cir.dec nsw %[[X_VAR_LOAD]] : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[DEC]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
@@ -471,7 +471,7 @@ void use(int x, int v, float f, HasOps ops) {
// CHECK-NEXT: cir.store %[[X_VAR]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
- // CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[X_VAR_LOAD]]) nsw : !s32i, !s32i
+ // CHECK-NEXT: %[[DEC:.*]] = cir.dec nsw %[[X_VAR_LOAD]] : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[DEC]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
@@ -492,7 +492,7 @@ void use(int x, int v, float f, HasOps ops) {
// CHECK-NEXT: cir.store %[[X_VAR]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
- // CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[X_VAR_LOAD]]) nsw : !s32i, !s32i
+ // CHECK-NEXT: %[[DEC:.*]] = cir.dec nsw %[[X_VAR_LOAD]] : !s32i
// CHECK-NEXT: cir.store{{.*}} %[[DEC]], %[[X_VAR_ALLOC]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[X_VAR_LOAD:.*]] = cir.load{{.*}} %[[X_VAR_ALLOC]] : !cir.ptr<!s32i>, !s32i
diff --git a/clang/test/CIR/CodeGenOpenACC/atomic-update.cpp b/clang/test/CIR/CodeGenOpenACC/atomic-update.cpp
index 9ee49f36e86bf..3ef98cd757a67 100644
--- a/clang/test/CIR/CodeGenOpenACC/atomic-update.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/atomic-update.cpp
@@ -22,7 +22,7 @@ void use(int x, unsigned int y, float f, HasOps ops) {
// CHECK-NEXT: cir.store %[[RECIPE_ARG]], %[[TEMP_ALLOCA]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr<!s32i>, !s32i
- // CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[TEMP_LOAD]]) nsw : !s32i, !s32i
+ // CHECK-NEXT: %[[INC:.*]] = cir.inc nsw %[[TEMP_LOAD]] : !s32i
// CHECK-NEXT: cir.store {{.*}}%[[INC]], %[[TEMP_ALLOCA]] : !s32i, !cir.ptr<!s32i>
//
// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr<!s32i>, !s32i
@@ -37,7 +37,7 @@ void use(int x, unsigned int y, float f, HasOps ops) {
// CHECK-NEXT: cir.store %[[RECIPE_ARG]], %[[TEMP_ALLOCA]] : !u32i, !cir.ptr<!u32i>
//
// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr<!u32i>, !u32i
- // CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[TEMP_LOAD]]) : !u32i, !u32i
+ // CHECK-NEXT: %[[INC:.*]] = cir.inc %[[TEMP_LOAD]] : !u32i
// CHECK-NEXT: cir.store {{.*}}%[[INC]], %[[TEMP_ALLOCA]] : !u32i, !cir.ptr<!u32i>
//
// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr<!u32i>, !u32i
@@ -52,7 +52,7 @@ void use(int x, unsigned int y, float f, HasOps ops) {
// CHECK-NEXT: cir.store %[[RECIPE_ARG]], %[[TEMP_ALLOCA]] : !cir.float, !cir.ptr<!cir.float>
//
// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr<!cir.float>, !cir.float
- // CHECK-NEXT: %[[INC:.*]] = cir.unary(dec, %[[TEMP_LOAD]]) : !cir.float, !cir.float
+ // CHECK-NEXT: %[[INC:.*]] = cir.dec %[[TEMP_LOAD]] : !cir.float
// CHECK-NEXT: cir.store {{.*}}%[[INC]], %[[TEMP_ALLOCA]] : !cir.float, !cir.ptr<!cir.float>
//
// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr<!cir.float>, !cir.float
diff --git a/clang/test/CIR/CodeGenOpenACC/combined-firstprivate-clause.cpp b/clang/test/CIR/CodeGenOpenACC/combined-firstprivate-clause.cpp
index d36768d4a6a48..0a67314a91b07 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined-firstprivate-clause.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined-firstprivate-clause.cpp
@@ -109,7 +109,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -145,7 +145,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -180,7 +180,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -215,7 +215,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -250,7 +250,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -285,7 +285,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -314,7 +314,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp b/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp
index 19231ff44ae29..5ad6a62c72663 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined-private-clause.cpp
@@ -113,7 +113,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -149,7 +149,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp
index 696bbb931eee0..c3a787f32e085 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp
@@ -616,7 +616,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -685,7 +685,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -793,7 +793,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -900,7 +900,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -957,7 +957,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1014,7 +1014,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1071,7 +1071,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1178,7 +1178,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1286,7 +1286,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1332,7 +1332,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1393,7 +1393,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1438,7 +1438,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1499,7 +1499,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1544,7 +1544,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1647,7 +1647,7 @@ void acc_combined() {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1692,7 +1692,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1795,7 +1795,7 @@ void acc_combined() {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1834,7 +1834,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1883,7 +1883,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1922,7 +1922,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1971,7 +1971,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2010,7 +2010,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2059,7 +2059,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2104,7 +2104,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2205,7 +2205,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2250,7 +2250,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2351,7 +2351,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp
index 6f04d95c39e1b..c96e9d9e0b16f 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp
@@ -166,7 +166,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -204,7 +204,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -249,7 +249,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -294,7 +294,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -342,7 +342,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -390,7 +390,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -422,7 +422,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -454,7 +454,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -486,7 +486,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -518,7 +518,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -550,7 +550,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -589,7 +589,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -621,7 +621,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -660,7 +660,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -692,7 +692,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -734,7 +734,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -766,7 +766,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -808,7 +808,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp
index 0700bcde4e550..8ecaeddb0839a 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp
@@ -376,7 +376,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -520,7 +520,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -669,7 +669,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -818,7 +818,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -961,7 +961,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1046,7 +1046,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1130,7 +1130,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1277,7 +1277,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1365,7 +1365,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1430,7 +1430,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1459,7 +1459,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1488,7 +1488,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1533,7 +1533,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1562,7 +1562,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1591,7 +1591,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1636,7 +1636,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1671,7 +1671,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1700,7 +1700,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1745,7 +1745,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1780,7 +1780,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1809,7 +1809,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1854,7 +1854,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1883,7 +1883,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1912,7 +1912,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1957,7 +1957,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1986,7 +1986,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2015,7 +2015,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2060,7 +2060,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2089,7 +2089,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2118,7 +2118,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2163,7 +2163,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2195,7 +2195,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2224,7 +2224,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2269,7 +2269,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2301,7 +2301,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2330,7 +2330,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
index 9c36ede6a129c..6ae544b9430e3 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp
@@ -218,7 +218,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -256,7 +256,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -301,7 +301,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -346,7 +346,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -384,7 +384,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -422,7 +422,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -460,7 +460,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -508,7 +508,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -556,7 +556,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -588,7 +588,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -620,7 +620,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -652,7 +652,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -684,7 +684,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -716,7 +716,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -755,7 +755,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -787,7 +787,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -826,7 +826,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -858,7 +858,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -890,7 +890,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -922,7 +922,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -954,7 +954,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -986,7 +986,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1018,7 +1018,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1050,7 +1050,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1092,7 +1092,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1124,7 +1124,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1166,7 +1166,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp
index 6d21c27c86280..49811106a519f 100644
--- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp
@@ -375,7 +375,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -519,7 +519,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -668,7 +668,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -817,7 +817,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -960,7 +960,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1045,7 +1045,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1129,7 +1129,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1276,7 +1276,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1364,7 +1364,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1429,7 +1429,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1458,7 +1458,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1487,7 +1487,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1532,7 +1532,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1561,7 +1561,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1590,7 +1590,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1635,7 +1635,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1670,7 +1670,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1699,7 +1699,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1744,7 +1744,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1779,7 +1779,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1808,7 +1808,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1853,7 +1853,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1882,7 +1882,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1911,7 +1911,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1956,7 +1956,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1985,7 +1985,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2014,7 +2014,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2059,7 +2059,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2088,7 +2088,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2117,7 +2117,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2162,7 +2162,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2194,7 +2194,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2223,7 +2223,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2268,7 +2268,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2300,7 +2300,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2329,7 +2329,7 @@ void acc_combined() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.c b/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.c
index 444399f146884..953c894456d9a 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.c
+++ b/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.c
@@ -62,7 +62,7 @@ struct NoCopyConstruct {};
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -98,7 +98,7 @@ struct NoCopyConstruct {};
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -133,7 +133,7 @@ struct NoCopyConstruct {};
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.cpp b/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.cpp
index 18a934d8f5bdc..c4cec3023e046 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-firstprivate-clause.cpp
@@ -110,7 +110,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -146,7 +146,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -181,7 +181,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -216,7 +216,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -251,7 +251,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -286,7 +286,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -315,7 +315,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp b/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp
index 5f3ddd87f3dc8..a2d2c39d3087a 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-private-clause.cpp
@@ -102,7 +102,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -137,7 +137,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c
index 19479f723dc60..bc7ee24a35499 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c
@@ -636,7 +636,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -705,7 +705,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -816,7 +816,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -927,7 +927,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -984,7 +984,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1041,7 +1041,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1098,7 +1098,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1208,7 +1208,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1319,7 +1319,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1365,7 +1365,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1426,7 +1426,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1471,7 +1471,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1532,7 +1532,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1577,7 +1577,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1684,7 +1684,7 @@ void acc_compute() {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1729,7 +1729,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1836,7 +1836,7 @@ void acc_compute() {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1875,7 +1875,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1924,7 +1924,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1963,7 +1963,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2012,7 +2012,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2051,7 +2051,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2100,7 +2100,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2145,7 +2145,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2249,7 +2249,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2294,7 +2294,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2398,7 +2398,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp
index efc1638987dd7..dae634cd8bb58 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp
@@ -616,7 +616,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -685,7 +685,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -793,7 +793,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -900,7 +900,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -957,7 +957,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1014,7 +1014,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1071,7 +1071,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1178,7 +1178,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1286,7 +1286,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1332,7 +1332,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1393,7 +1393,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1438,7 +1438,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1499,7 +1499,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1544,7 +1544,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1647,7 +1647,7 @@ void acc_compute() {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1692,7 +1692,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1795,7 +1795,7 @@ void acc_compute() {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1834,7 +1834,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1883,7 +1883,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1922,7 +1922,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1971,7 +1971,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2010,7 +2010,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2059,7 +2059,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2104,7 +2104,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2205,7 +2205,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2250,7 +2250,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2351,7 +2351,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c
index aa46e1146f239..035c72424b68e 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c
@@ -167,7 +167,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -205,7 +205,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -250,7 +250,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -295,7 +295,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -344,7 +344,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -393,7 +393,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -425,7 +425,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -457,7 +457,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -489,7 +489,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -521,7 +521,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -553,7 +553,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -592,7 +592,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -624,7 +624,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -663,7 +663,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -695,7 +695,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -738,7 +738,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -770,7 +770,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -813,7 +813,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp
index 71dea0c2a42b4..3b3beb395c74b 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp
@@ -166,7 +166,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -204,7 +204,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -249,7 +249,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -294,7 +294,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -342,7 +342,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -390,7 +390,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -422,7 +422,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -454,7 +454,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -486,7 +486,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -518,7 +518,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -550,7 +550,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -589,7 +589,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -621,7 +621,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -660,7 +660,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -692,7 +692,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -734,7 +734,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -766,7 +766,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -808,7 +808,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp
index 946d5f00b9d5c..368448eb9c47b 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp
@@ -376,7 +376,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -520,7 +520,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -669,7 +669,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -818,7 +818,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -961,7 +961,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1046,7 +1046,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1130,7 +1130,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1277,7 +1277,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1365,7 +1365,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1430,7 +1430,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1459,7 +1459,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1488,7 +1488,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1533,7 +1533,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1562,7 +1562,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1591,7 +1591,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1636,7 +1636,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1671,7 +1671,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1700,7 +1700,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1745,7 +1745,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1780,7 +1780,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1809,7 +1809,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1854,7 +1854,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1883,7 +1883,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1912,7 +1912,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1957,7 +1957,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1986,7 +1986,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2015,7 +2015,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2060,7 +2060,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2089,7 +2089,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2118,7 +2118,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2163,7 +2163,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2195,7 +2195,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2224,7 +2224,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2269,7 +2269,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2301,7 +2301,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2330,7 +2330,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c
index eec4a1014d06a..5a892c87cbd5b 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c
@@ -217,7 +217,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -255,7 +255,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -300,7 +300,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -345,7 +345,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -383,7 +383,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -421,7 +421,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -459,7 +459,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -507,7 +507,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -555,7 +555,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -587,7 +587,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -619,7 +619,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -651,7 +651,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -683,7 +683,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -715,7 +715,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -754,7 +754,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -786,7 +786,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -825,7 +825,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -857,7 +857,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -889,7 +889,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -921,7 +921,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -953,7 +953,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -985,7 +985,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1017,7 +1017,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1049,7 +1049,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1091,7 +1091,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1123,7 +1123,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1165,7 +1165,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
index 10ce53ba8b11f..770fd189fb063 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp
@@ -218,7 +218,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -256,7 +256,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -301,7 +301,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -346,7 +346,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -384,7 +384,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -422,7 +422,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -460,7 +460,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -508,7 +508,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -556,7 +556,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -588,7 +588,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -620,7 +620,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -652,7 +652,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -684,7 +684,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -716,7 +716,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -755,7 +755,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -787,7 +787,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -826,7 +826,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -858,7 +858,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -890,7 +890,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -922,7 +922,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -954,7 +954,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -986,7 +986,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1018,7 +1018,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1050,7 +1050,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1092,7 +1092,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1124,7 +1124,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1166,7 +1166,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp
index c53553f15e673..e32257c926d04 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp
@@ -375,7 +375,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -519,7 +519,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -668,7 +668,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -817,7 +817,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -960,7 +960,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1045,7 +1045,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1129,7 +1129,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1276,7 +1276,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1364,7 +1364,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1429,7 +1429,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1458,7 +1458,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1487,7 +1487,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1532,7 +1532,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1561,7 +1561,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1590,7 +1590,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1635,7 +1635,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1670,7 +1670,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1699,7 +1699,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1744,7 +1744,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1779,7 +1779,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1808,7 +1808,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1853,7 +1853,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1882,7 +1882,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1911,7 +1911,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1956,7 +1956,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1985,7 +1985,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2014,7 +2014,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2059,7 +2059,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2088,7 +2088,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2117,7 +2117,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2162,7 +2162,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2194,7 +2194,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2223,7 +2223,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2268,7 +2268,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2300,7 +2300,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2329,7 +2329,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c
index cd5ca68448dd3..60a077da4d862 100644
--- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c
+++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c
@@ -218,7 +218,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -256,7 +256,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -301,7 +301,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -346,7 +346,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -384,7 +384,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -422,7 +422,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -460,7 +460,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -509,7 +509,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -558,7 +558,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -590,7 +590,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -622,7 +622,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -654,7 +654,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -686,7 +686,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -718,7 +718,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -757,7 +757,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -789,7 +789,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -828,7 +828,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -860,7 +860,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -892,7 +892,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -924,7 +924,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -956,7 +956,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -988,7 +988,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1020,7 +1020,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1052,7 +1052,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1095,7 +1095,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1127,7 +1127,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1170,7 +1170,7 @@ void acc_compute() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/data.c b/clang/test/CIR/CodeGenOpenACC/data.c
index 6e96af5532513..0a1a3a8f811a5 100644
--- a/clang/test/CIR/CodeGenOpenACC/data.c
+++ b/clang/test/CIR/CodeGenOpenACC/data.c
@@ -18,7 +18,7 @@ void acc_data(int cond) {
// CHECK-NEXT: cir.const
// CHECK-NEXT: cir.store
// CHECK-NEXT: cir.load
- // CHECK-NEXT: cir.unary
+ // CHECK-NEXT: cir.inc
// CHECK-NEXT: cir.store
// CHECK-NEXT: acc.terminator
// CHECK-NEXT: } attributes {defaultAttr = #acc<defaultvalue none>}
@@ -33,7 +33,7 @@ void acc_data(int cond) {
// CHECK-NEXT: cir.const
// CHECK-NEXT: cir.store
// CHECK-NEXT: cir.load
- // CHECK-NEXT: cir.unary
+ // CHECK-NEXT: cir.inc
// CHECK-NEXT: cir.store
// CHECK-NEXT: acc.terminator
// CHECK-NEXT: } attributes {defaultAttr = #acc<defaultvalue present>}
diff --git a/clang/test/CIR/CodeGenOpenACC/firstprivate-clause-recipes.cpp b/clang/test/CIR/CodeGenOpenACC/firstprivate-clause-recipes.cpp
index efbe26dd48c50..c8eda96ed7649 100644
--- a/clang/test/CIR/CodeGenOpenACC/firstprivate-clause-recipes.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/firstprivate-clause-recipes.cpp
@@ -79,7 +79,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -87,7 +87,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -95,7 +95,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -158,7 +158,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -166,7 +166,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -174,7 +174,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -214,7 +214,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -248,7 +248,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -281,7 +281,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -346,7 +346,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -354,7 +354,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -362,7 +362,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -427,7 +427,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -435,7 +435,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -443,7 +443,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -483,7 +483,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -521,7 +521,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -584,7 +584,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -592,7 +592,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -600,7 +600,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -664,7 +664,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -672,7 +672,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -680,7 +680,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp b/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp
index 585334f6e0f46..e402214c1f92b 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-private-clause.cpp
@@ -113,7 +113,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -149,7 +149,7 @@ struct HasDtor {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp
index ce914466c2ab0..b112571bf1f94 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp
@@ -616,7 +616,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -685,7 +685,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -793,7 +793,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -900,7 +900,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -957,7 +957,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1014,7 +1014,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1071,7 +1071,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1178,7 +1178,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1286,7 +1286,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1332,7 +1332,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1393,7 +1393,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1438,7 +1438,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1499,7 +1499,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1544,7 +1544,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1647,7 +1647,7 @@ void acc_loop() {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1692,7 +1692,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1795,7 +1795,7 @@ void acc_loop() {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1834,7 +1834,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1883,7 +1883,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1922,7 +1922,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1971,7 +1971,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2010,7 +2010,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2059,7 +2059,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2104,7 +2104,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2205,7 +2205,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2250,7 +2250,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2351,7 +2351,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
index 963ec89f159e0..9eda370bafbdb 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp
@@ -166,7 +166,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -204,7 +204,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -249,7 +249,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -294,7 +294,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -342,7 +342,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -390,7 +390,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -422,7 +422,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -454,7 +454,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -486,7 +486,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -518,7 +518,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -550,7 +550,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -589,7 +589,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -621,7 +621,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -660,7 +660,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -692,7 +692,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -734,7 +734,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -766,7 +766,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -808,7 +808,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp
index b1d980746b98c..231b9164c9fa9 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp
@@ -376,7 +376,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -520,7 +520,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -669,7 +669,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -818,7 +818,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -961,7 +961,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1046,7 +1046,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1130,7 +1130,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1277,7 +1277,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1365,7 +1365,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1430,7 +1430,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1459,7 +1459,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1488,7 +1488,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1533,7 +1533,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1562,7 +1562,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1591,7 +1591,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1636,7 +1636,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1671,7 +1671,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1700,7 +1700,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1745,7 +1745,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1780,7 +1780,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1809,7 +1809,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1854,7 +1854,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1883,7 +1883,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1912,7 +1912,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1957,7 +1957,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1986,7 +1986,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2015,7 +2015,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2060,7 +2060,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2089,7 +2089,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2118,7 +2118,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2163,7 +2163,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2195,7 +2195,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2224,7 +2224,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2269,7 +2269,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2301,7 +2301,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2330,7 +2330,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp
index 2723f4e442da1..5ff18e0a6de84 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp
@@ -218,7 +218,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -256,7 +256,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -301,7 +301,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -346,7 +346,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -384,7 +384,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -422,7 +422,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -460,7 +460,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -508,7 +508,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -556,7 +556,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -588,7 +588,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -620,7 +620,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -652,7 +652,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -684,7 +684,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -716,7 +716,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -755,7 +755,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -787,7 +787,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -826,7 +826,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -858,7 +858,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -890,7 +890,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -922,7 +922,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -954,7 +954,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -986,7 +986,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1018,7 +1018,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1050,7 +1050,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1092,7 +1092,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1124,7 +1124,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1166,7 +1166,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp
index f7a75cfc1a8fa..e995d99a68ccf 100644
--- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp
@@ -375,7 +375,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -519,7 +519,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -668,7 +668,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -817,7 +817,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -960,7 +960,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1045,7 +1045,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1129,7 +1129,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1276,7 +1276,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1364,7 +1364,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!s64i>, !s64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !s64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr<!s64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1429,7 +1429,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1458,7 +1458,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1487,7 +1487,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1532,7 +1532,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1561,7 +1561,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1590,7 +1590,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1635,7 +1635,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1670,7 +1670,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1699,7 +1699,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1744,7 +1744,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1779,7 +1779,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1808,7 +1808,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1853,7 +1853,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1882,7 +1882,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1911,7 +1911,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1956,7 +1956,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1985,7 +1985,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2014,7 +2014,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2059,7 +2059,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2088,7 +2088,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2117,7 +2117,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2162,7 +2162,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2194,7 +2194,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2223,7 +2223,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2268,7 +2268,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2300,7 +2300,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -2329,7 +2329,7 @@ void acc_loop() {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-CtorDtor.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-CtorDtor.cpp
index 6b60d110f8939..fac30b077b320 100644
--- a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-CtorDtor.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-CtorDtor.cpp
@@ -36,7 +36,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -66,7 +66,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -164,7 +164,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -172,7 +172,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -221,7 +221,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -229,7 +229,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -344,7 +344,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -352,7 +352,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -360,7 +360,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -423,7 +423,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -431,7 +431,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -439,7 +439,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -511,7 +511,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -519,7 +519,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -584,7 +584,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -592,7 +592,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-NoOps.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-NoOps.cpp
index 991259deb5051..4a167fa7ef207 100644
--- a/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-NoOps.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/private-clause-array-recipes-NoOps.cpp
@@ -31,7 +31,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -110,7 +110,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -118,7 +118,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -212,7 +212,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -220,7 +220,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -228,7 +228,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -300,7 +300,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -308,7 +308,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp
index e4d9ab7182745..b65824ef1be7e 100644
--- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-CtorDtor.cpp
@@ -39,7 +39,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -65,7 +65,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -96,7 +96,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -145,7 +145,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -175,7 +175,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -216,7 +216,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -224,7 +224,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -270,7 +270,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -278,7 +278,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD:.*]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -328,7 +328,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -358,7 +358,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -388,7 +388,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -445,7 +445,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -453,7 +453,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -461,7 +461,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -525,7 +525,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -533,7 +533,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -541,7 +541,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -584,7 +584,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -614,7 +614,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -672,7 +672,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -714,7 +714,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -722,7 +722,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -769,7 +769,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -777,7 +777,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -828,7 +828,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -869,7 +869,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -877,7 +877,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -923,7 +923,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -931,7 +931,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -988,7 +988,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1018,7 +1018,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1074,7 +1074,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1082,7 +1082,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1090,7 +1090,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1154,7 +1154,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1162,7 +1162,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1170,7 +1170,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1220,7 +1220,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1270,7 +1270,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1300,7 +1300,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1356,7 +1356,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1364,7 +1364,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1372,7 +1372,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1436,7 +1436,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1444,7 +1444,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1452,7 +1452,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1495,7 +1495,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1524,7 +1524,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1581,7 +1581,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1589,7 +1589,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1654,7 +1654,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1662,7 +1662,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1714,7 +1714,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1752,7 +1752,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1807,7 +1807,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1815,7 +1815,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1823,7 +1823,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1887,7 +1887,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1895,7 +1895,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1903,7 +1903,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1946,7 +1946,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp
index 79f142d28c4d7..d3b4205460225 100644
--- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-NoOps.cpp
@@ -34,7 +34,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -60,7 +60,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -109,7 +109,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -139,7 +139,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -179,7 +179,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -187,7 +187,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -237,7 +237,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -267,7 +267,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -297,7 +297,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -353,7 +353,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -361,7 +361,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -369,7 +369,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -412,7 +412,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -442,7 +442,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -500,7 +500,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -541,7 +541,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -549,7 +549,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -602,7 +602,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -642,7 +642,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -650,7 +650,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -706,7 +706,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -736,7 +736,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -791,7 +791,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -799,7 +799,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -807,7 +807,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -857,7 +857,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -907,7 +907,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -937,7 +937,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -992,7 +992,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1000,7 +1000,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1008,7 +1008,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1051,7 +1051,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1081,7 +1081,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1137,7 +1137,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1145,7 +1145,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1197,7 +1197,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1235,7 +1235,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1290,7 +1290,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1298,7 +1298,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1306,7 +1306,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -1349,7 +1349,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp
index 64603bc7a1342..111428aaa9bb6 100644
--- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-array-recipes-int.cpp
@@ -32,7 +32,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -81,7 +81,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -111,7 +111,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -161,7 +161,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -191,7 +191,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -220,7 +220,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -263,7 +263,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -293,7 +293,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -350,7 +350,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -401,7 +401,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -458,7 +458,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -488,7 +488,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -538,7 +538,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -588,7 +588,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -619,7 +619,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -662,7 +662,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -692,7 +692,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -744,7 +744,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -782,7 +782,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -826,7 +826,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-CtorDtor.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-CtorDtor.cpp
index e96020d67da90..a5b5ae605e36b 100644
--- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-CtorDtor.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-CtorDtor.cpp
@@ -51,7 +51,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -91,7 +91,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -124,7 +124,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -169,7 +169,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -203,7 +203,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -236,7 +236,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -292,7 +292,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -300,7 +300,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -308,7 +308,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -373,7 +373,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -381,7 +381,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -389,7 +389,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -444,7 +444,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -483,7 +483,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -516,7 +516,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -560,7 +560,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -568,7 +568,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -617,7 +617,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -625,7 +625,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -679,7 +679,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -705,7 +705,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -735,7 +735,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-NoOps.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-NoOps.cpp
index 3f3dad7323a13..a669adc312caa 100644
--- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-NoOps.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-NoOps.cpp
@@ -45,7 +45,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -84,7 +84,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -118,7 +118,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -161,7 +161,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -195,7 +195,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -229,7 +229,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -286,7 +286,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -294,7 +294,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -302,7 +302,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -359,7 +359,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -398,7 +398,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -431,7 +431,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -475,7 +475,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -483,7 +483,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -537,7 +537,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -563,7 +563,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-int.cpp b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-int.cpp
index 89d58fa49e46c..e75f2beae437d 100644
--- a/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-int.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/private-clause-pointer-recipes-int.cpp
@@ -43,7 +43,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -83,7 +83,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -115,7 +115,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -159,7 +159,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -193,7 +193,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -225,7 +225,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -282,7 +282,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -322,7 +322,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -355,7 +355,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -410,7 +410,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp b/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp
index 95fc0bdb3e0f0..fa92b6c0c230a 100644
--- a/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp
+++ b/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp
@@ -67,7 +67,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -75,7 +75,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -83,7 +83,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -151,7 +151,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -159,7 +159,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -167,7 +167,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -230,7 +230,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -238,7 +238,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -246,7 +246,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -287,7 +287,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -321,7 +321,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -354,7 +354,7 @@ void do_things(unsigned A, unsigned B) {
//
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -412,7 +412,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -420,7 +420,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -428,7 +428,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -498,7 +498,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -506,7 +506,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -514,7 +514,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -578,7 +578,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -586,7 +586,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -594,7 +594,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -635,7 +635,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -673,7 +673,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -730,7 +730,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -738,7 +738,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -746,7 +746,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -822,7 +822,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -830,7 +830,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -838,7 +838,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[INC:.*]] = cir.inc %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -902,7 +902,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR1_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR1_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR1]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -910,7 +910,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR2_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR2_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR2]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
@@ -918,7 +918,7 @@ void do_things(unsigned A, unsigned B) {
// CHECK-NEXT: cir.yield
// CHECK-NEXT: } step {
// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr<!u64i>, !u64i
-// CHECK-NEXT: %[[DEC:.*]] = cir.unary(dec, %[[ITR3_LOAD]]) : !u64i, !u64i
+// CHECK-NEXT: %[[DEC:.*]] = cir.dec %[[ITR3_LOAD]] : !u64i
// CHECK-NEXT: cir.store %[[DEC]], %[[ITR3]] : !u64i, !cir.ptr<!u64i>
// CHECK-NEXT: cir.yield
// CHECK-NEXT: }
diff --git a/clang/test/CIR/IR/unary.cir b/clang/test/CIR/IR/unary.cir
index d01d4eb3c920a..1ef1606807935 100644
--- a/clang/test/CIR/IR/unary.cir
+++ b/clang/test/CIR/IR/unary.cir
@@ -9,42 +9,38 @@ module {
cir.func @test_unary_unsigned() {
%0 = cir.alloca !u32i, !cir.ptr<!u32i>, ["a"] {alignment = 4 : i64}
%1 = cir.load %0 : !cir.ptr<!u32i>, !u32i
- %2 = cir.unary(plus, %1) : !u32i, !u32i
- %3 = cir.unary(minus, %1) : !u32i, !u32i
- %4 = cir.unary(not, %1) : !u32i, !u32i
- %5 = cir.unary(inc, %1) : !u32i, !u32i
- %6 = cir.unary(dec, %1) : !u32i, !u32i
+ %2 = cir.minus %1 : !u32i
+ %3 = cir.not %1 : !u32i
+ %4 = cir.inc %1 : !u32i
+ %5 = cir.dec %1 : !u32i
cir.return
}
// CHECK: cir.func{{.*}} @test_unary_unsigned() {
// CHECK: %0 = cir.alloca !u32i, !cir.ptr<!u32i>, ["a"] {alignment = 4 : i64}
// CHECK: %1 = cir.load %0 : !cir.ptr<!u32i>, !u32i
-// CHECK: %2 = cir.unary(plus, %1) : !u32i, !u32i
-// CHECK: %3 = cir.unary(minus, %1) : !u32i, !u32i
-// CHECK: %4 = cir.unary(not, %1) : !u32i, !u32i
-// CHECK: %5 = cir.unary(inc, %1) : !u32i, !u32i
-// CHECK: %6 = cir.unary(dec, %1) : !u32i, !u32i
+// CHECK: %2 = cir.minus %1 : !u32i
+// CHECK: %3 = cir.not %1 : !u32i
+// CHECK: %4 = cir.inc %1 : !u32i
+// CHECK: %5 = cir.dec %1 : !u32i
// CHECK: cir.return
// CHECK: }
cir.func @test_unary_signed() {
%0 = cir.alloca !s32i, !cir.ptr<!s32i>, ["a"] {alignment = 4 : i64}
%1 = cir.load %0 : !cir.ptr<!s32i>, !s32i
- %2 = cir.unary(plus, %1) : !s32i, !s32i
- %3 = cir.unary(minus, %1) nsw : !s32i, !s32i
- %4 = cir.unary(not, %1) : !s32i, !s32i
- %5 = cir.unary(inc, %1) nsw : !s32i, !s32i
- %6 = cir.unary(dec, %1) nsw : !s32i, !s32i
+ %2 = cir.minus nsw %1 : !s32i
+ %3 = cir.not %1 : !s32i
+ %4 = cir.inc nsw %1 : !s32i
+ %5 = cir.dec nsw %1 : !s32i
cir.return
}
// CHECK: cir.func{{.*}} @test_unary_signed() {
// CHECK: %0 = cir.alloca !s32i, !cir.ptr<!s32i>, ["a"] {alignment = 4 : i64}
// CHECK: %1 = cir.load %0 : !cir.ptr<!s32i>, !s32i
-// CHECK: %2 = cir.unary(plus, %1) : !s32i, !s32i
-// CHECK: %3 = cir.unary(minus, %1) nsw : !s32i, !s32i
-// CHECK: %4 = cir.unary(not, %1) : !s32i, !s32i
-// CHECK: %5 = cir.unary(inc, %1) nsw : !s32i, !s32i
-// CHECK: %6 = cir.unary(dec, %1) nsw : !s32i, !s32i
+// CHECK: %2 = cir.minus nsw %1 : !s32i
+// CHECK: %3 = cir.not %1 : !s32i
+// CHECK: %4 = cir.inc nsw %1 : !s32i
+// CHECK: %5 = cir.dec nsw %1 : !s32i
// CHECK: cir.return
// CHECK: }
}
diff --git a/clang/test/CIR/Lowering/binop-int-vector.cir b/clang/test/CIR/Lowering/binop-int-vector.cir
new file mode 100644
index 0000000000000..d8814669b1fb2
--- /dev/null
+++ b/clang/test/CIR/Lowering/binop-int-vector.cir
@@ -0,0 +1,24 @@
+// RUN: cir-opt %s -cir-to-llvm -o - | FileCheck %s
+
+!s32i = !cir.int<s, 32>
+!u32i = !cir.int<u, 32>
+
+module {
+ cir.func @signed_vec(%arg0 : !cir.vector<4 x !s32i>,
+ %arg1 : !cir.vector<4 x !s32i>) {
+ %0 = cir.max %arg0, %arg1 : !cir.vector<4 x !s32i>
+ // CHECK: = llvm.intr.smax
+ %1 = cir.min %arg0, %arg1 : !cir.vector<4 x !s32i>
+ // CHECK: = llvm.intr.smin
+ cir.return
+ }
+
+ cir.func @unsigned_vec(%arg0 : !cir.vector<4 x !u32i>,
+ %arg1 : !cir.vector<4 x !u32i>) {
+ %0 = cir.max %arg0, %arg1 : !cir.vector<4 x !u32i>
+ // CHECK: = llvm.intr.umax
+ %1 = cir.min %arg0, %arg1 : !cir.vector<4 x !u32i>
+ // CHECK: = llvm.intr.umin
+ cir.return
+ }
+}
diff --git a/clang/test/CIR/Lowering/binop-signed-int.cir b/clang/test/CIR/Lowering/binop-signed-int.cir
index 091becbd78c2c..15bee750e2d7c 100644
--- a/clang/test/CIR/Lowering/binop-signed-int.cir
+++ b/clang/test/CIR/Lowering/binop-signed-int.cir
@@ -55,6 +55,8 @@ module {
cir.store %34, %2 : !s32i, !cir.ptr<!s32i>
%37 = cir.max %32, %33 : !s32i
// CHECK: = llvm.intr.smax
+ %38 = cir.min %32, %33 : !s32i
+ // CHECK: = llvm.intr.smin
cir.return
}
}
diff --git a/clang/test/CIR/Lowering/binop-unsigned-int.cir b/clang/test/CIR/Lowering/binop-unsigned-int.cir
index 79d8f6bdcecd3..d5545cb7769d4 100644
--- a/clang/test/CIR/Lowering/binop-unsigned-int.cir
+++ b/clang/test/CIR/Lowering/binop-unsigned-int.cir
@@ -44,6 +44,7 @@ module {
%35 = cir.add sat %32, %33: !u32i
%36 = cir.sub sat %32, %33: !u32i
%37 = cir.max %32, %33 : !u32i
+ %38 = cir.min %32, %33 : !u32i
cir.return
}
}
@@ -59,6 +60,7 @@ module {
// MLIR: = llvm.intr.uadd.sat{{.*}}(i32, i32) -> i32
// MLIR: = llvm.intr.usub.sat{{.*}}(i32, i32) -> i32
// MLIR: = llvm.intr.umax
+// MLIR: = llvm.intr.umin
// LLVM: = mul i32
// LLVM: = udiv i32
@@ -71,3 +73,4 @@ module {
// LLVM: = call i32 @llvm.uadd.sat.i32
// LLVM: = call i32 @llvm.usub.sat.i32
// LLVM: = call i32 @llvm.umax.i32
+// LLVM: = call i32 @llvm.umin.i32
diff --git a/clang/test/CIR/Transforms/binop-traits.cir b/clang/test/CIR/Transforms/binop-traits.cir
new file mode 100644
index 0000000000000..b82f9b47d433c
--- /dev/null
+++ b/clang/test/CIR/Transforms/binop-traits.cir
@@ -0,0 +1,49 @@
+// RUN: cir-opt %s -cir-canonicalize -o - | FileCheck %s
+
+!s32i = !cir.int<s, 32>
+
+// CHECK-LABEL: @and_idempotent
+// CHECK-NEXT: cir.return %arg0
+cir.func @and_idempotent(%arg0 : !s32i) -> !s32i {
+ %0 = cir.and %arg0, %arg0 : !s32i
+ cir.return %0 : !s32i
+}
+
+// CHECK-LABEL: @or_idempotent
+// CHECK-NEXT: cir.return %arg0
+cir.func @or_idempotent(%arg0 : !s32i) -> !s32i {
+ %0 = cir.or %arg0, %arg0 : !s32i
+ cir.return %0 : !s32i
+}
+
+// CHECK-LABEL: @and_commutative
+// CHECK: cir.and %arg0, %{{.*}} : !s32i
+cir.func @and_commutative(%arg0 : !s32i) -> !s32i {
+ %0 = cir.const #cir.int<42> : !s32i
+ %1 = cir.and %0, %arg0 : !s32i
+ cir.return %1 : !s32i
+}
+
+// CHECK-LABEL: @or_commutative
+// CHECK: cir.or %arg0, %{{.*}} : !s32i
+cir.func @or_commutative(%arg0 : !s32i) -> !s32i {
+ %0 = cir.const #cir.int<42> : !s32i
+ %1 = cir.or %0, %arg0 : !s32i
+ cir.return %1 : !s32i
+}
+
+// CHECK-LABEL: @xor_commutative
+// CHECK: cir.xor %arg0, %{{.*}} : !s32i
+cir.func @xor_commutative(%arg0 : !s32i) -> !s32i {
+ %0 = cir.const #cir.int<42> : !s32i
+ %1 = cir.xor %0, %arg0 : !s32i
+ cir.return %1 : !s32i
+}
+
+// CHECK-LABEL: @max_commutative
+// CHECK: cir.max %arg0, %{{.*}} : !s32i
+cir.func @max_commutative(%arg0 : !s32i) -> !s32i {
+ %0 = cir.const #cir.int<42> : !s32i
+ %1 = cir.max %0, %arg0 : !s32i
+ cir.return %1 : !s32i
+}
diff --git a/clang/test/CIR/Transforms/canonicalize.cir b/clang/test/CIR/Transforms/canonicalize.cir
index cfac73ecdb738..26493f7a37e39 100644
--- a/clang/test/CIR/Transforms/canonicalize.cir
+++ b/clang/test/CIR/Transforms/canonicalize.cir
@@ -64,8 +64,8 @@ module {
// CHECK-NEXT: }
cir.func @unary_not_not(%arg0: !cir.bool) -> !cir.bool {
- %0 = cir.unary(not, %arg0) : !cir.bool, !cir.bool
- %1 = cir.unary(not, %0) : !cir.bool, !cir.bool
+ %0 = cir.not %arg0 : !cir.bool
+ %1 = cir.not %0 : !cir.bool
cir.return %1 : !cir.bool
}
// CHECK: cir.func{{.*}} @unary_not_not(%arg0: !cir.bool) -> !cir.bool
@@ -73,7 +73,7 @@ module {
cir.func @unary_poison() -> !s32i {
%0 = cir.const #cir.poison : !s32i
- %1 = cir.unary(inc, %0) : !s32i, !s32i
+ %1 = cir.inc %0 : !s32i
cir.return %1 : !s32i
}
// CHECK: @unary_poison
@@ -83,7 +83,7 @@ module {
cir.func @unary_not_true() -> !cir.bool {
%0 = cir.const #true
- %1 = cir.unary(not, %0) : !cir.bool, !cir.bool
+ %1 = cir.not %0 : !cir.bool
cir.return %1 : !cir.bool
}
// CHECK: cir.func{{.*}} @unary_not_true() -> !cir.bool
@@ -92,7 +92,7 @@ module {
cir.func @unary_not_false() -> !cir.bool {
%0 = cir.const #false
- %1 = cir.unary(not, %0) : !cir.bool, !cir.bool
+ %1 = cir.not %0 : !cir.bool
cir.return %1 : !cir.bool
}
// CHECK: cir.func{{.*}} @unary_not_false() -> !cir.bool
@@ -101,7 +101,7 @@ module {
cir.func @unary_not_int() -> !s32i {
%0 = cir.const #cir.int<1> : !s32i
- %1 = cir.unary(not, %0) : !s32i, !s32i
+ %1 = cir.not %0 : !s32i
cir.return %1 : !s32i
}
// CHECK: cir.func{{.*}} @unary_not_int() -> !s32i
@@ -110,88 +110,16 @@ module {
cir.func @unary_not_uint() -> !u32i {
%0 = cir.const #cir.int<1> : !u32i
- %1 = cir.unary(not, %0) : !u32i, !u32i
+ %1 = cir.not %0 : !u32i
cir.return %1 : !u32i
}
// CHECK: cir.func{{.*}} @unary_not_uint() -> !u32i
// CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.int<4294967294> : !u32i
// CHECK-NEXT: cir.return %[[CONST]] : !u32i
- cir.func @unary_plus_true() -> !cir.bool {
- %0 = cir.const #true
- %1 = cir.unary(plus, %0) : !cir.bool, !cir.bool
- cir.return %1 : !cir.bool
- }
- // CHECK: cir.func{{.*}} @unary_plus_true() -> !cir.bool
- // CHECK-NEXT: %[[CONST:.*]] = cir.const #true
- // CHECK-NEXT: cir.return %[[CONST]] : !cir.bool
-
- cir.func @unary_plus_false() -> !cir.bool {
- %0 = cir.const #false
- %1 = cir.unary(plus, %0) : !cir.bool, !cir.bool
- cir.return %1 : !cir.bool
- }
- // CHECK: cir.func{{.*}} @unary_plus_false() -> !cir.bool
- // CHECK-NEXT: %[[CONST:.*]] = cir.const #false
- // CHECK-NEXT: cir.return %[[CONST]] : !cir.bool
-
- cir.func @unary_plus_int() -> !s32i {
- %0 = cir.const #cir.int<1> : !s32i
- %1 = cir.unary(plus, %0) : !s32i, !s32i
- cir.return %1 : !s32i
- }
- // CHECK: cir.func{{.*}} @unary_plus_int() -> !s32i
- // CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.int<1> : !s32i
- // CHECK-NEXT: cir.return %[[CONST]] : !s32i
-
- cir.func @unary_plus_uint() -> !u32i {
- %0 = cir.const #cir.int<1> : !u32i
- %1 = cir.unary(plus, %0) : !u32i, !u32i
- cir.return %1 : !u32i
- }
- // CHECK: cir.func{{.*}} @unary_plus_uint() -> !u32i
- // CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.int<1> : !u32i
- // CHECK-NEXT: cir.return %[[CONST]] : !u32i
-
- cir.func @unary_plus_float() -> !cir.float {
- %0 = cir.const #cir.fp<1.100000e+00> : !cir.float
- %1 = cir.unary(plus, %0) : !cir.float, !cir.float
- cir.return %1 : !cir.float
- }
- // CHECK: cir.func{{.*}} @unary_plus_float() -> !cir.float
- // CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.fp<1.100000e+00> : !cir.float
- // CHECK-NEXT: cir.return %[[CONST]] : !cir.float
-
- cir.func @unary_plus_double() -> !cir.double {
- %0 = cir.const #cir.fp<1.100000e+00> : !cir.double
- %1 = cir.unary(plus, %0) : !cir.double, !cir.double
- cir.return %1 : !cir.double
- }
- // CHECK: cir.func{{.*}} @unary_plus_double() -> !cir.double
- // CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.fp<1.100000e+00> : !cir.double
- // CHECK-NEXT: cir.return %[[CONST]] : !cir.double
-
- cir.func @unary_plus_nan() -> !cir.float {
- %0 = cir.const #cir.fp<0x7F800000> : !cir.float
- %1 = cir.unary(plus, %0) : !cir.float, !cir.float
- cir.return %1 : !cir.float
- }
- // CHECK: cir.func{{.*}} @unary_plus_nan() -> !cir.float
- // CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.fp<0x7F800000> : !cir.float
- // CHECK-NEXT: cir.return %[[CONST]] : !cir.float
-
- cir.func @unary_plus_neg_nan() -> !cir.float {
- %0 = cir.const #cir.fp<0xFF800000> : !cir.float
- %1 = cir.unary(plus, %0) : !cir.float, !cir.float
- cir.return %1 : !cir.float
- }
- // CHECK: cir.func{{.*}} @unary_plus_neg_nan() -> !cir.float
- // CHECK-NEXT: %[[CONST:.*]] = cir.const #cir.fp<0xFF800000> : !cir.float
- // CHECK-NEXT: cir.return %[[CONST]] : !cir.float
-
cir.func @unary_minus_true() -> !cir.bool {
%0 = cir.const #true
- %1 = cir.unary(minus, %0) : !cir.bool, !cir.bool
+ %1 = cir.minus %0 : !cir.bool
cir.return %1 : !cir.bool
}
// CHECK: cir.func{{.*}} @unary_minus_true() -> !cir.bool
@@ -200,7 +128,7 @@ module {
cir.func @unary_minus_false() -> !cir.bool {
%0 = cir.const #false
- %1 = cir.unary(minus, %0) : !cir.bool, !cir.bool
+ %1 = cir.minus %0 : !cir.bool
cir.return %1 : !cir.bool
}
// CHECK: cir.func{{.*}} @unary_minus_false() -> !cir.bool
@@ -209,7 +137,7 @@ module {
cir.func @unary_minus_int() -> !s32i {
%0 = cir.const #cir.int<1> : !s32i
- %1 = cir.unary(minus, %0) : !s32i, !s32i
+ %1 = cir.minus %0 : !s32i
cir.return %1 : !s32i
}
// CHECK: cir.func{{.*}} @unary_minus_int() -> !s32i
@@ -218,7 +146,7 @@ module {
cir.func @unary_minus_uint() -> !u32i {
%0 = cir.const #cir.int<1> : !u32i
- %1 = cir.unary(minus, %0) : !u32i, !u32i
+ %1 = cir.minus %0 : !u32i
cir.return %1 : !u32i
}
// CHECK: cir.func{{.*}} @unary_minus_uint() -> !u32i
@@ -227,7 +155,7 @@ module {
cir.func @unary_minus_float() -> !cir.float {
%0 = cir.const #cir.fp<1.100000e+00> : !cir.float
- %1 = cir.unary(minus, %0) : !cir.float, !cir.float
+ %1 = cir.minus %0 : !cir.float
cir.return %1 : !cir.float
}
// CHECK: cir.func{{.*}} @unary_minus_float() -> !cir.float
@@ -236,7 +164,7 @@ module {
cir.func @unary_minus_double() -> !cir.double {
%0 = cir.const #cir.fp<1.100000e+00> : !cir.double
- %1 = cir.unary(minus, %0) : !cir.double, !cir.double
+ %1 = cir.minus %0 : !cir.double
cir.return %1 : !cir.double
}
// CHECK: cir.func{{.*}} @unary_minus_double() -> !cir.double
@@ -245,7 +173,7 @@ module {
cir.func @unary_minus_nan() -> !cir.float {
%0 = cir.const #cir.fp<0x7F800000> : !cir.float
- %1 = cir.unary(minus, %0) : !cir.float, !cir.float
+ %1 = cir.minus %0 : !cir.float
cir.return %1 : !cir.float
}
// CHECK: cir.func{{.*}} @unary_minus_nan() -> !cir.float
@@ -254,7 +182,7 @@ module {
cir.func @unary_minus_neg_nan() -> !cir.float {
%0 = cir.const #cir.fp<0xFF800000> : !cir.float
- %1 = cir.unary(minus, %0) : !cir.float, !cir.float
+ %1 = cir.minus %0 : !cir.float
cir.return %1 : !cir.float
}
// CHECK: cir.func{{.*}} @unary_minus_neg_nan() -> !cir.float
diff --git a/clang/test/CIR/Transforms/flatten-cleanup-scope-eh.cir b/clang/test/CIR/Transforms/flatten-cleanup-scope-eh.cir
index 35852eacebf34..9cd1170b4295d 100644
--- a/clang/test/CIR/Transforms/flatten-cleanup-scope-eh.cir
+++ b/clang/test/CIR/Transforms/flatten-cleanup-scope-eh.cir
@@ -247,7 +247,7 @@ cir.func @test_nrvo() -> !rec_NonTrivial {
cir.yield
} cleanup all {
%4 = cir.load align(1) %1 : !cir.ptr<!cir.bool>, !cir.bool
- %5 = cir.unary(not, %4) : !cir.bool, !cir.bool
+ %5 = cir.not %4 : !cir.bool
cir.if %5 {
cir.call @_ZN10NonTrivialD1Ev(%0) nothrow : (!cir.ptr<!rec_NonTrivial>) -> ()
}
@@ -277,7 +277,7 @@ cir.func @test_nrvo() -> !rec_NonTrivial {
// Normal cleanup: inlined cleanup region with flattened cir.if.
// CHECK: ^[[NORMAL_CLEANUP]]:
// CHECK: %[[N_FLAG:.*]] = cir.load align(1) %[[NRVO]] : !cir.ptr<!cir.bool>, !cir.bool
-// CHECK: %[[N_NOT:.*]] = cir.unary(not, %[[N_FLAG]]) : !cir.bool, !cir.bool
+// CHECK: %[[N_NOT:.*]] = cir.not %[[N_FLAG]] : !cir.bool
// CHECK: cir.brcond %[[N_NOT]] ^[[N_IF_TRUE:bb[0-9]+]], ^[[N_MERGE:bb[0-9]+]]
//
// CHECK: ^[[N_IF_TRUE]]:
@@ -299,7 +299,7 @@ cir.func @test_nrvo() -> !rec_NonTrivial {
// CHECK: ^[[EH_CLEANUP]](%[[ET:.*]]: !cir.eh_token):
// CHECK: %[[CT:.*]] = cir.begin_cleanup %[[ET]] : !cir.eh_token -> !cir.cleanup_token
// CHECK: %[[EH_FLAG:.*]] = cir.load align(1) %[[NRVO]] : !cir.ptr<!cir.bool>, !cir.bool
-// CHECK: %[[EH_NOT:.*]] = cir.unary(not, %[[EH_FLAG]]) : !cir.bool, !cir.bool
+// CHECK: %[[EH_NOT:.*]] = cir.not %[[EH_FLAG]] : !cir.bool
// CHECK: cir.brcond %[[EH_NOT]] ^[[EH_IF_TRUE:bb[0-9]+]], ^[[EH_MERGE:bb[0-9]+]]
//
// EH cleanup if-true: call destructor.
diff --git a/clang/test/CIR/Transforms/flatten-cleanup-scope-multi-exit.cir b/clang/test/CIR/Transforms/flatten-cleanup-scope-multi-exit.cir
index 7a7b51005c2d3..490c287ab9696 100644
--- a/clang/test/CIR/Transforms/flatten-cleanup-scope-multi-exit.cir
+++ b/clang/test/CIR/Transforms/flatten-cleanup-scope-multi-exit.cir
@@ -501,7 +501,7 @@ cir.func @test_return_computed_value(%arg0: !s32i) -> !s32i {
%cond = cir.call @shouldReturn() : () -> !cir.bool
cir.brcond %cond ^bb_return, ^bb_normal
^bb_return:
- %retval = cir.unary(not, %arg0) : !s32i, !s32i
+ %retval = cir.not %arg0 : !s32i
cir.return %retval : !s32i
^bb_normal:
cir.yield
@@ -533,7 +533,7 @@ cir.func @test_return_computed_value(%arg0: !s32i) -> !s32i {
// CHECK: %[[COND:.*]] = cir.call @shouldReturn()
// CHECK: cir.brcond %[[COND]] ^[[RET_PATH:bb[0-9]+]], ^[[YIELD_PATH:bb[0-9]+]]
// CHECK: ^[[RET_PATH]]:
-// CHECK: %[[COMPUTED:.*]] = cir.unary(not, %[[ARG0]]) : !s32i, !s32i
+// CHECK: %[[COMPUTED:.*]] = cir.not %[[ARG0]] : !s32i
// CHECK: cir.store %[[COMPUTED]], %[[RET_TMP]]
// CHECK: %[[RET_ID:.*]] = cir.const #cir.int<1> : !s32i
// CHECK: cir.store %[[RET_ID]], %[[DEST_SLOT]]
diff --git a/clang/test/CIR/Transforms/max-min-idempotent.cir b/clang/test/CIR/Transforms/max-min-idempotent.cir
new file mode 100644
index 0000000000000..1ce4ced3f745d
--- /dev/null
+++ b/clang/test/CIR/Transforms/max-min-idempotent.cir
@@ -0,0 +1,77 @@
+// RUN: cir-opt %s -cir-canonicalize -o - | FileCheck %s
+
+!s32i = !cir.int<s, 32>
+!u32i = !cir.int<u, 32>
+
+// Idempotent: max(x, x) -> x
+// CHECK-LABEL: cir.func @max_idempotent
+// CHECK-NEXT: cir.return %arg0
+cir.func @max_idempotent(%arg0 : !s32i) -> !s32i {
+ %0 = cir.max %arg0, %arg0 : !s32i
+ cir.return %0 : !s32i
+}
+
+// Idempotent: min(x, x) -> x
+// CHECK-LABEL: cir.func @min_idempotent
+// CHECK-NEXT: cir.return %arg0
+cir.func @min_idempotent(%arg0 : !s32i) -> !s32i {
+ %0 = cir.min %arg0, %arg0 : !s32i
+ cir.return %0 : !s32i
+}
+
+// Idempotent: max(x, x) -> x (unsigned)
+// CHECK-LABEL: cir.func @max_idempotent_unsigned
+// CHECK-NEXT: cir.return %arg0
+cir.func @max_idempotent_unsigned(%arg0 : !u32i) -> !u32i {
+ %0 = cir.max %arg0, %arg0 : !u32i
+ cir.return %0 : !u32i
+}
+
+// Idempotent: min(x, x) -> x (unsigned)
+// CHECK-LABEL: cir.func @min_idempotent_unsigned
+// CHECK-NEXT: cir.return %arg0
+cir.func @min_idempotent_unsigned(%arg0 : !u32i) -> !u32i {
+ %0 = cir.min %arg0, %arg0 : !u32i
+ cir.return %0 : !u32i
+}
+
+// Commutative: max(const, x) -> max(x, const)
+// CHECK-LABEL: cir.func @max_commutative
+// CHECK: %[[C:.*]] = cir.const #cir.int<42> : !s32i
+// CHECK-NEXT: %[[R:.*]] = cir.max %arg0, %[[C]] : !s32i
+// CHECK-NEXT: cir.return %[[R]]
+cir.func @max_commutative(%arg0 : !s32i) -> !s32i {
+ %0 = cir.const #cir.int<42> : !s32i
+ %1 = cir.max %0, %arg0 : !s32i
+ cir.return %1 : !s32i
+}
+
+// Commutative: min(const, x) -> min(x, const)
+// CHECK-LABEL: cir.func @min_commutative
+// CHECK: %[[C:.*]] = cir.const #cir.int<42> : !s32i
+// CHECK-NEXT: %[[R:.*]] = cir.min %arg0, %[[C]] : !s32i
+// CHECK-NEXT: cir.return %[[R]]
+cir.func @min_commutative(%arg0 : !s32i) -> !s32i {
+ %0 = cir.const #cir.int<42> : !s32i
+ %1 = cir.min %0, %arg0 : !s32i
+ cir.return %1 : !s32i
+}
+
+// Idempotent chained: max(max(x, y), max(x, y)) -> max(x, y)
+// CHECK-LABEL: cir.func @max_idempotent_chained
+// CHECK-NEXT: %[[M:.*]] = cir.max %arg0, %arg1 : !s32i
+// CHECK-NEXT: cir.return %[[M]]
+cir.func @max_idempotent_chained(%arg0 : !s32i, %arg1 : !s32i) -> !s32i {
+ %0 = cir.max %arg0, %arg1 : !s32i
+ %1 = cir.max %0, %0 : !s32i
+ cir.return %1 : !s32i
+}
+
+// No fold: distinct operands should remain unchanged
+// CHECK-LABEL: cir.func @max_no_fold
+// CHECK-NEXT: %[[R:.*]] = cir.max %arg0, %arg1 : !s32i
+// CHECK-NEXT: cir.return %[[R]]
+cir.func @max_no_fold(%arg0 : !s32i, %arg1 : !s32i) -> !s32i {
+ %0 = cir.max %arg0, %arg1 : !s32i
+ cir.return %0 : !s32i
+}
diff --git a/clang/test/CIR/Transforms/select.cir b/clang/test/CIR/Transforms/select.cir
index 0ad5c43178831..413b6cc668a7c 100644
--- a/clang/test/CIR/Transforms/select.cir
+++ b/clang/test/CIR/Transforms/select.cir
@@ -54,7 +54,7 @@ module {
}
// CHECK: cir.func{{.*}} @simplify_2(%[[ARG0:.+]]: !cir.bool) -> !cir.bool {
- // CHECK-NEXT: %[[#A:]] = cir.unary(not, %[[ARG0]]) : !cir.bool, !cir.bool
+ // CHECK-NEXT: %[[#A:]] = cir.not %[[ARG0]] : !cir.bool
// CHECK-NEXT: cir.return %[[#A]] : !cir.bool
// CHECK-NEXT: }
}
diff --git a/clang/test/CMakeLists.txt b/clang/test/CMakeLists.txt
index f6244c938fc90..db12d4ee38fe4 100644
--- a/clang/test/CMakeLists.txt
+++ b/clang/test/CMakeLists.txt
@@ -12,6 +12,7 @@ llvm_canonicalize_cmake_booleans(
CLANG_ENABLE_CIR
CLANG_ENABLE_OBJC_REWRITER
CLANG_LINK_CLANG_DYLIB
+ CLANG_USE_XCSELECT
ENABLE_BACKTRACES
LLVM_BYE_LINK_INTO_TOOLS
LLVM_ENABLE_PLUGINS
diff --git a/clang/test/CXX/drs/cwg8xx.cpp b/clang/test/CXX/drs/cwg8xx.cpp
index 7395f04c8e399..c76417fddbf25 100644
--- a/clang/test/CXX/drs/cwg8xx.cpp
+++ b/clang/test/CXX/drs/cwg8xx.cpp
@@ -23,10 +23,10 @@ template <> void f(int &&) = delete; // #cwg873-rvalue-ref
void g(int i) {
f(i); // calls f<int&>(int&)
// since-cxx11-error at -1 {{call to deleted function 'f'}}
- // since-cxx11-note@#cwg873-lvalue-ref {{candidate function [with T = int &] has been implicitly deleted}}
+ // since-cxx11-note@#cwg873-lvalue-ref {{candidate function [with T = int &] has been explicitly deleted}}
f(0); // calls f<int>(int&&)
// since-cxx11-error at -1 {{call to deleted function 'f'}}
- // since-cxx11-note@#cwg873-rvalue-ref {{candidate function [with T = int] has been implicitly deleted}}
+ // since-cxx11-note@#cwg873-rvalue-ref {{candidate function [with T = int] has been explicitly deleted}}
}
#endif
} // namespace cwg873
diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp
index 70a96bed05867..7b58150eaaf84 100644
--- a/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp
+++ b/clang/test/CXX/expr/expr.prim/expr.prim.req/nested-requirement.cpp
@@ -152,7 +152,7 @@ void func() {
// expected-note@#bar {{while substituting template arguments into constraint expression here}}
// expected-note@#bar {{while checking the satisfaction of nested requirement requested here}}
// expected-note@#bar {{candidate template ignored: constraints not satisfied [with T = False]}}
- // expected-note@#bar {{because 'X<False>::value' evaluated to false}}
+ // expected-note@#bar {{because 'X<SubstitutionFailureNestedRequires::ErrorExpressions_NotSF::False>::value' evaluated to false}}
bar<int>();
// expected-error at -1 {{no matching function for call to 'bar'}} \
diff --git a/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c b/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c
index 6ce6e37137cd0..55eb5210829d2 100644
--- a/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/bf16-getset-intrinsics.c
@@ -152,21 +152,3 @@ bfloat16x4_t test_vset_lane_bf16(bfloat16_t a, bfloat16x4_t v) {
bfloat16x8_t test_vsetq_lane_bf16(bfloat16_t a, bfloat16x8_t v) {
return vsetq_lane_bf16(a, v, 7);
}
-
-// CHECK-LABEL: @test_vduph_lane_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> [[V:%.*]], i32 1
-// CHECK-NEXT: ret bfloat [[VGET_LANE]]
-//
-bfloat16_t test_vduph_lane_bf16(bfloat16x4_t v) {
- return vduph_lane_bf16(v, 1);
-}
-
-// CHECK-LABEL: @test_vduph_laneq_bf16(
-// CHECK-NEXT: entry:
-// CHECK-NEXT: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> [[V:%.*]], i32 7
-// CHECK-NEXT: ret bfloat [[VGETQ_LANE]]
-//
-bfloat16_t test_vduph_laneq_bf16(bfloat16x8_t v) {
- return vduph_laneq_bf16(v, 7);
-}
diff --git a/clang/test/CodeGen/AArch64/neon-intrinsics.c b/clang/test/CodeGen/AArch64/neon-intrinsics.c
index bfaea2b8ae909..8eb6cd86339d6 100644
--- a/clang/test/CodeGen/AArch64/neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon-intrinsics.c
@@ -12052,26 +12052,6 @@ uint64_t test_vqsubd_u64(uint64_t a, uint64_t b) {
return vqsubd_u64(a, b);
}
-// CHECK-LABEL: define dso_local i64 @test_vshld_s64(
-// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 [[A]], i64 [[B]])
-// CHECK-NEXT: ret i64 [[VSHLD_S64_I]]
-//
-int64_t test_vshld_s64(int64_t a, int64_t b) {
- return vshld_s64(a, b);
-}
-
-// CHECK-LABEL: define dso_local i64 @test_vshld_u64(
-// CHECK-SAME: i64 noundef [[A:%.*]], i64 noundef [[B:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[VSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 [[A]], i64 [[B]])
-// CHECK-NEXT: ret i64 [[VSHLD_U64_I]]
-//
-uint64_t test_vshld_u64(uint64_t a, int64_t b) {
- return vshld_u64(a, b);
-}
-
// CHECK-LABEL: define dso_local i8 @test_vqshlb_s8(
// CHECK-SAME: i8 noundef [[A:%.*]], i8 noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
@@ -17887,16 +17867,6 @@ uint64x1_t test_vrsra_n_u64(uint64x1_t a, uint64x1_t b) {
return vrsra_n_u64(a, b, 1);
}
-// CHECK-LABEL: define dso_local i64 @test_vshld_n_s64(
-// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHLD_N:%.*]] = shl i64 [[A]], 1
-// CHECK-NEXT: ret i64 [[SHLD_N]]
-//
-int64_t test_vshld_n_s64(int64_t a) {
- return (int64_t)vshld_n_s64(a, 1);
-}
-
// CHECK-LABEL: define dso_local <1 x i64> @test_vshl_n_s64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
@@ -17909,16 +17879,6 @@ int64x1_t test_vshl_n_s64(int64x1_t a) {
return vshl_n_s64(a, 1);
}
-// CHECK-LABEL: define dso_local i64 @test_vshld_n_u64(
-// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[SHLD_N:%.*]] = shl i64 [[A]], 63
-// CHECK-NEXT: ret i64 [[SHLD_N]]
-//
-uint64_t test_vshld_n_u64(uint64_t a) {
- return (uint64_t)vshld_n_u64(a, 63);
-}
-
// CHECK-LABEL: define dso_local <1 x i64> @test_vshl_n_u64(
// CHECK-SAME: <1 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/neon/bf16-getset.c b/clang/test/CodeGen/AArch64/neon/bf16-getset.c
new file mode 100644
index 0000000000000..faae31cb013dd
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/neon/bf16-getset.c
@@ -0,0 +1,36 @@
+// REQUIRES: aarch64-registered-target || arm-registered-target
+
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +bf16 -disable-O0-optnone -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=ALL,LLVM
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +bf16 -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=ALL,LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +bf16 -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-cir -o - %s | FileCheck %s --check-prefixes=ALL,CIR %}
+
+#include <arm_neon.h>
+
+//=============================================================================
+// NOTES
+//
+// ACLE section headings based on v2025Q2 of the ACLE specification:
+// * https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#bitwise-equal-to-zero
+//=============================================================================
+
+//===------------------------------------------------------===//
+// 2.4.1.2. Set all lanes to the same value
+//
+// TODO: Add the remaining intrinsics from this group.
+//===------------------------------------------------------===//
+
+// ALL-LABEL: @test_vduph_lane_bf16(
+bfloat16_t test_vduph_lane_bf16(bfloat16x4_t v) {
+ // CIR: cir.vec.extract %{{.*}}[%{{.*}} : !s32i] : !cir.vector<4 x !cir.bf16>
+ // LLVM: [[VGET_LANE:%.*]] = extractelement <4 x bfloat> %{{.*}}, i32 1
+ // LLVM: ret bfloat [[VGET_LANE]]
+ return vduph_lane_bf16(v, 1);
+}
+
+// ALL-LABEL: @test_vduph_laneq_bf16(
+bfloat16_t test_vduph_laneq_bf16(bfloat16x8_t v) {
+ // CIR: cir.vec.extract %{{.*}}[%{{.*}} : !s32i] : !cir.vector<8 x !cir.bf16>
+ // LLVM: [[VGETQ_LANE:%.*]] = extractelement <8 x bfloat> %{{.*}}, i32 7
+ // LLVM: ret bfloat [[VGETQ_LANE]]
+ return vduph_laneq_bf16(v, 7);
+}
diff --git a/clang/test/CodeGen/AArch64/neon/fullfp16.c b/clang/test/CodeGen/AArch64/neon/fullfp16.c
index e2cf5f744af62..619d07538eaaf 100644
--- a/clang/test/CodeGen/AArch64/neon/fullfp16.c
+++ b/clang/test/CodeGen/AArch64/neon/fullfp16.c
@@ -36,7 +36,7 @@
//===------------------------------------------------------===//
// 2.5.2.1. Bitwise equal to zero
//===------------------------------------------------------===//
-// LLVM-LABEL: test_vceqzh_f16
+// ALL-LABEL: test_vceqzh_f16
uint16_t test_vceqzh_f16(float16_t a) {
// CIR: [[C_0:%.*]] = cir.const #cir.fp<0.000000e+00>
// CIR: [[CMP:%.*]] = cir.cmp eq %{{.*}}, [[C_0]] : !cir.f16
@@ -62,7 +62,7 @@ float16_t test_vabsh_f16(float16_t a) {
// ALL-LABEL: @test_vnegh_f16
float16_t test_vnegh_f16(float16_t a) {
-// CIR: cir.unary(minus, {{.*}}) : !cir.f16
+// CIR: cir.minus {{.*}} : !cir.f16
// LLVM-SAME: half{{.*}} [[A:%.*]])
// LLVM: [[NEG:%.*]] = fneg half [[A:%.*]]
@@ -82,7 +82,7 @@ float16_t test_vfmah_f16(float16_t a, float16_t b, float16_t c) {
// ALL-LABEL: test_vfmsh_f16
float16_t test_vfmsh_f16(float16_t a, float16_t b, float16_t c) {
-// CIR: [[SUB:%.*]] = cir.unary(minus, %{{.*}}) : !cir.f16, !cir.f16
+// CIR: [[SUB:%.*]] = cir.minus %{{.*}} : !cir.f16
// CIR: cir.call_llvm_intrinsic "fma" [[SUB]], {{.*}} : (!cir.f16, !cir.f16, !cir.f16) -> !cir.f16
// LLVM-SAME: half{{.*}} [[A:%.*]], half{{.*}} [[B:%.*]], half{{.*}} [[C:%.*]])
diff --git a/clang/test/CodeGen/AArch64/neon/intrinsics.c b/clang/test/CodeGen/AArch64/neon/intrinsics.c
index b740c3b5b2310..bf8e62feda8da 100644
--- a/clang/test/CodeGen/AArch64/neon/intrinsics.c
+++ b/clang/test/CodeGen/AArch64/neon/intrinsics.c
@@ -1,8 +1,8 @@
// REQUIRES: aarch64-registered-target || arm-registered-target
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM
-// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=LLVM %}
-// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-cir -o - %s | FileCheck %s --check-prefixes=CIR %}
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=ALL,LLVM
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-llvm -o - %s | opt -S -passes=mem2reg,sroa | FileCheck %s --check-prefixes=ALL,LLVM %}
+// RUN: %if cir-enabled %{%clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -disable-O0-optnone -flax-vector-conversions=none -fclangir -emit-cir -o - %s | FileCheck %s --check-prefixes=ALL,CIR %}
//=============================================================================
// NOTES
@@ -27,7 +27,7 @@
// LLVM-LABEL: @test_vnegd_s64
// CIR-LABEL: @vnegd_s64
int64_t test_vnegd_s64(int64_t a) {
-// CIR: cir.unary(minus, {{.*}}) : !s64
+// CIR: cir.minus {{.*}} : !s64i
// LLVM-SAME: i64 {{.*}} [[A:%.*]])
// LLVM: [[VNEGD_I:%.*]] = sub i64 0, [[A]]
@@ -936,3 +936,49 @@ uint32x4_t test_vabaq_u32(uint32x4_t v1, uint32x4_t v2, uint32x4_t v3) {
// LLVM-NEXT: ret <4 x i32> [[ADD_I]]
return vabaq_u32(v1, v2, v3);
}
+//===------------------------------------------------------===//
+// 2.1.3.1.1. Vector Shift Left
+//===------------------------------------------------------===//
+
+// ALL-LABEL: test_vshld_n_s64
+int64_t test_vshld_n_s64(int64_t a) {
+ // CIR: cir.shift(left, {{.*}})
+
+ // LLVM-SAME: i64 {{.*}} [[A:%.*]])
+ // LLVM: [[SHL_N:%.*]] = shl i64 [[A]], 1
+ // LLVM: ret i64 [[SHL_N]]
+ return (int64_t)vshld_n_s64(a, 1);
+}
+
+// ALL-LABEL: test_vshld_n_u64
+int64_t test_vshld_n_u64(int64_t a) {
+ // CIR: cir.shift(left, {{.*}})
+
+ // LLVM-SAME: i64 {{.*}} [[A:%.*]])
+ // LLVM: [[SHL_N:%.*]] = shl i64 [[A]], 1
+ // LLVM: ret i64 [[SHL_N]]
+ return (int64_t)vshld_n_u64(a, 1);
+}
+
+// LLVM-LABEL: test_vshld_s64
+// CIR-LABEL: vshld_s64
+int64_t test_vshld_s64(int64_t a, int64_t b) {
+ // CIR: cir.call_llvm_intrinsic "aarch64.neon.sshl" %{{.*}}, %{{.*}} : (!s64i, !s64i) -> !s64i
+
+ // LLVM-SAME: i64 {{.*}} [[A:%.*]], i64 {{.*}} [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+ // LLVM: [[VSHLD_S64_I:%.*]] = call i64 @llvm.aarch64.neon.sshl.i64(i64 [[A]], i64 [[B]])
+ // LLVM: ret i64 [[VSHLD_S64_I]]
+ return (int64_t)vshld_s64(a, b);
+}
+
+// LLVM-LABEL: test_vshld_u64
+// CIR-LABEL: vshld_u64
+int64_t test_vshld_u64(int64_t a, int64_t b) {
+ // CIR: cir.call_llvm_intrinsic "aarch64.neon.ushl" %{{.*}}, %{{.*}} : (!u64i, !s64i) -> !u64i
+
+ // LLVM-SAME: i64 {{.*}} [[A:%.*]], i64 {{.*}} [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+ // LLVM: [[VSHLD_U64_I:%.*]] = call i64 @llvm.aarch64.neon.ushl.i64(i64 [[A]], i64 [[B]])
+ // LLVM: ret i64 [[VSHLD_U64_I]]
+ return (int64_t)vshld_u64(a, b);
+}
+
diff --git a/clang/test/CodeGenCXX/attr-exclude_from_explicit_instantiation.exclude_from_dllexport.cpp b/clang/test/CodeGenCXX/attr-exclude_from_explicit_instantiation.exclude_from_dllexport.cpp
index f041e587f5ace..7048d32feb9e3 100644
--- a/clang/test/CodeGenCXX/attr-exclude_from_explicit_instantiation.exclude_from_dllexport.cpp
+++ b/clang/test/CodeGenCXX/attr-exclude_from_explicit_instantiation.exclude_from_dllexport.cpp
@@ -149,15 +149,15 @@ void usePolymorphic() {
// GNU-DAG: define linkonce_odr dso_local void @_ZN11PolymorphicI11ImplicitTagE29excludedExportedVirtualMethodEv
}
-/// Test that the DLL attrribute wins over the exclude attribute on a
-/// non-template context.
+/// Test that the exclude attribute takes precedence over the DLL attribute in
+/// a non-template context as well.
struct NonTemplateClass {
EXCLUDE_ATTR __declspec(dllexport) void excludedExportedMethod();
};
void NonTemplateClass::excludedExportedMethod() {}
-// MSC-DAG: define dso_local dllexport void @"?excludedExportedMethod at NonTemplateClass@@QEAAXXZ"
-// GNU-DAG: define dso_local dllexport void @_ZN16NonTemplateClass22excludedExportedMethodEv
+// MSC-DAG: define dso_local void @"?excludedExportedMethod at NonTemplateClass@@QEAAXXZ"
+// GNU-DAG: define dso_local void @_ZN16NonTemplateClass22excludedExportedMethodEv
/// The same, but exporting whole class.
struct __declspec(dllexport) NonTemplateExportedClass {
@@ -165,5 +165,5 @@ struct __declspec(dllexport) NonTemplateExportedClass {
};
void NonTemplateExportedClass::excludedMethod() {}
-// MSC-DAG: define dso_local dllexport void @"?excludedMethod at NonTemplateExportedClass@@QEAAXXZ"
-// GNU-DAG: define dso_local dllexport void @_ZN24NonTemplateExportedClass14excludedMethodEv
+// MSC-DAG: define dso_local void @"?excludedMethod at NonTemplateExportedClass@@QEAAXXZ"
+// GNU-DAG: define dso_local void @_ZN24NonTemplateExportedClass14excludedMethodEv
diff --git a/clang/test/CodeGenCXX/attr-exclude_from_explicit_instantiation.exclude_from_dllimport.cpp b/clang/test/CodeGenCXX/attr-exclude_from_explicit_instantiation.exclude_from_dllimport.cpp
index d445d0252d905..05e0a6efadf7b 100644
--- a/clang/test/CodeGenCXX/attr-exclude_from_explicit_instantiation.exclude_from_dllimport.cpp
+++ b/clang/test/CodeGenCXX/attr-exclude_from_explicit_instantiation.exclude_from_dllimport.cpp
@@ -171,23 +171,25 @@ void usePolymorphic() {
// GNU-DAG: define linkonce_odr dso_local void @_ZN11PolymorphicI11ImplicitTagE29excludedImportedVirtualMethodEv
}
-/// Test that the DLL attrribute wins over the exclude attribute on a
-/// non-template context.
+/// Test that the exclude attribute takes precedence over the DLL attribute in
+/// a non-template context as well.
struct NonTemplateClass {
EXCLUDE_ATTR __declspec(dllimport) void excludedImportedMethod();
};
+void NonTemplateClass::excludedImportedMethod() {}
struct __declspec(dllimport) NonTemplateImportedClass {
EXCLUDE_ATTR void excludedMethod();
};
+void NonTemplateImportedClass::excludedMethod() {}
void useNonTemplateClass() {
NonTemplateClass().excludedImportedMethod();
- // MSC-DAG: declare dllimport void @"?excludedImportedMethod at NonTemplateClass@@QEAAXXZ"
- // GNU-DAG: declare dllimport void @_ZN16NonTemplateClass22excludedImportedMethodEv
+ // MSC-DAG: define dso_local void @"?excludedImportedMethod at NonTemplateClass@@QEAAXXZ"
+ // GNU-DAG: define dso_local void @_ZN16NonTemplateClass22excludedImportedMethodEv
NonTemplateImportedClass().excludedMethod();
- // MSC-DAG: declare dllimport void @"?excludedMethod at NonTemplateImportedClass@@QEAAXXZ"
- // GNU-DAG: declare dllimport void @_ZN24NonTemplateImportedClass14excludedMethodEv
+ // MSC-DAG: define dso_local void @"?excludedMethod at NonTemplateImportedClass@@QEAAXXZ"
+ // GNU-DAG: define dso_local void @_ZN24NonTemplateImportedClass14excludedMethodEv
}
diff --git a/clang/test/CodeGenCXX/dllexport-inherited-ctor.cpp b/clang/test/CodeGenCXX/dllexport-inherited-ctor.cpp
index cad081fd7f999..03026f843eda6 100644
--- a/clang/test/CodeGenCXX/dllexport-inherited-ctor.cpp
+++ b/clang/test/CodeGenCXX/dllexport-inherited-ctor.cpp
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-msvc -emit-llvm -std=c++17 -fms-extensions -O0 -o - %s | FileCheck --check-prefix=MSVC %s
-// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-msvc -emit-llvm -std=c++17 -fms-extensions -O0 -o - %s | FileCheck --check-prefix=M32 %s
-// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-gnu -emit-llvm -std=c++17 -fms-extensions -O0 -o - %s | FileCheck --check-prefix=GNU %s
-// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-msvc -emit-llvm -std=c++17 -fms-extensions -fno-dllexport-inlines -O0 -o - %s | FileCheck --check-prefix=NOINLINE %s
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-msvc -emit-llvm -std=c++20 -fms-extensions -O0 -o - %s | FileCheck --check-prefix=MSVC %s
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple i686-windows-msvc -emit-llvm -std=c++20 -fms-extensions -O0 -o - %s | FileCheck --check-prefix=M32 %s
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-gnu -emit-llvm -std=c++20 -fms-extensions -O0 -o - %s | FileCheck --check-prefix=GNU %s
+// RUN: %clang_cc1 -no-enable-noundef-analysis -triple x86_64-windows-msvc -emit-llvm -std=c++20 -fms-extensions -fno-dllexport-inlines -O0 -o - %s | FileCheck --check-prefix=NOINLINE %s
// Test that inherited constructors via 'using Base::Base' in a dllexport
// class are properly exported (https://github.com/llvm/llvm-project/issues/162640).
@@ -241,3 +241,77 @@ struct __declspec(dllexport) CalleeCleanupChild : CalleeCleanupBase {
// The implicit default ctor is a regular inline method, NOT an inherited
// constructor, so -fno-dllexport-inlines correctly suppresses it.
// NOINLINE-NOT: define {{.*}}dllexport{{.*}} @"??0AllDefChild@@QEAA at XZ"
+
+//===----------------------------------------------------------------------===//
+// Constrained constructors: inherited constructors whose requires clause is
+// not satisfied should not be exported.
+// Regression test for https://github.com/llvm/llvm-project/issues/185924
+//===----------------------------------------------------------------------===//
+
+template <bool B>
+struct ConstrainedBase {
+ struct Enabler {};
+ ConstrainedBase(Enabler) requires(B) {}
+ ConstrainedBase() requires(B) : ConstrainedBase(Enabler{}) {}
+ ConstrainedBase(int);
+};
+
+// B=false: both the default ctor and the Enabler ctor have requires(B) which
+// is not satisfied. Only the inherited ConstrainedChild(int) should be
+// exported.
+struct __declspec(dllexport) ConstrainedChild : ConstrainedBase<false> {
+ using ConstrainedBase::ConstrainedBase;
+};
+
+// MSVC-DAG: define weak_odr dso_local dllexport {{.*}} @"??0ConstrainedChild@@QEAA at H@Z"
+// M32-DAG: define weak_odr dso_local dllexport {{.*}} @"??0ConstrainedChild@@QAE at H@Z"
+// GNU-DAG: define {{.*}}dso_local dllexport {{.*}} @_ZN16ConstrainedChildCI115ConstrainedBaseILb0EEEi(
+
+// The constrained constructors should NOT be exported.
+// MSVC-NOT: dllexport{{.*}}ConstrainedChild@@QEAA at XZ
+// M32-NOT: dllexport{{.*}}ConstrainedChild@@QAE at XZ
+// GNU-NOT: dllexport{{.*}}ConstrainedBaseILb0EEEv
+
+// Constrained non-default constructor: only export when the constraint is met.
+template <typename T>
+struct SelectiveBase {
+ SelectiveBase(int) requires(sizeof(T) > 1) {}
+ SelectiveBase(double);
+};
+
+// sizeof(char)==1, so SelectiveBase(int) requires(sizeof(char)>1) is not
+// satisfied. Only the SelectiveChild(double) constructor should be exported.
+struct __declspec(dllexport) SelectiveChild : SelectiveBase<char> {
+ using SelectiveBase::SelectiveBase;
+};
+
+// MSVC-DAG: define weak_odr dso_local dllexport {{.*}} @"??0SelectiveChild@@QEAA at N@Z"
+// M32-DAG: define weak_odr dso_local dllexport {{.*}} @"??0SelectiveChild@@QAE at N@Z"
+// GNU-DAG: define {{.*}}dso_local dllexport {{.*}} @_ZN14SelectiveChildCI113SelectiveBaseIcEEd(
+
+// The constrained int constructor should NOT be exported.
+// MSVC-NOT: dllexport{{.*}}SelectiveChild@@QEAA at H@Z
+// M32-NOT: dllexport{{.*}}SelectiveChild@@QAE at H@Z
+// GNU-NOT: dllexport{{.*}}SelectiveBaseIcEEi
+
+//===----------------------------------------------------------------------===//
+// Non-constructor constrained method: when dllexport propagates to a base
+// template specialization, methods with unsatisfied constraints should not
+// be exported.
+//===----------------------------------------------------------------------===//
+
+template <typename T>
+struct BaseWithConstrainedMethod {
+ void foo() requires(sizeof(T) > 100) { T::nonexistent(); }
+ void bar() {}
+};
+
+struct __declspec(dllexport) MethodChild : BaseWithConstrainedMethod<int> {};
+
+// bar() should be exported (no constraint).
+// MSVC-DAG: define {{.*}}dllexport {{.*}} @"?bar@?$BaseWithConstrainedMethod at H@@QEAAXXZ"
+// M32-DAG: define {{.*}}dllexport {{.*}} @"?bar@?$BaseWithConstrainedMethod at H@@QAEXXZ"
+
+// foo() should NOT be exported (constraint not satisfied).
+// MSVC-NOT: dllexport{{.*}}foo@?$BaseWithConstrainedMethod at H
+// M32-NOT: dllexport{{.*}}foo@?$BaseWithConstrainedMethod at H
diff --git a/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors-new-array.cpp b/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors-new-array.cpp
new file mode 100644
index 0000000000000..b8b6e44b6b2f8
--- /dev/null
+++ b/clang/test/CodeGenCXX/microsoft-vector-deleting-dtors-new-array.cpp
@@ -0,0 +1,122 @@
+// RUN: %clang_cc1 -emit-llvm -fms-extensions %s -triple=x86_64-pc-windows-msvc -o - | FileCheck %s
+
+// Test that vector deleting destructors are emitted when new[] is used,
+// even when the destructor definition is in another translation unit.
+
+struct ForwardDeclared {
+ ForwardDeclared();
+ virtual ~ForwardDeclared();
+};
+
+struct DefinedInTU {
+ virtual ~DefinedInTU();
+};
+
+struct NonVirtualDtor {
+ ~NonVirtualDtor();
+};
+
+struct NoDtor {
+ virtual void foo();
+ int x;
+};
+
+struct DeclDerived : ForwardDeclared {
+ ~DeclDerived() override;
+};
+
+struct InlineDefaulted {
+ virtual ~InlineDefaulted() = default;
+};
+
+struct OutOfLineDefaulted {
+ virtual ~OutOfLineDefaulted();
+};
+
+OutOfLineDefaulted::~OutOfLineDefaulted() = default;
+
+template<typename T>
+struct Container {
+ T data;
+ virtual ~Container();
+};
+
+extern template class Container<int>;
+Container<int> *arr = new Container<int>[5];
+
+struct ImplicitVDtorDerived : ForwardDeclared{
+ int data;
+};
+
+struct __declspec(dllimport) DllImported {
+ virtual ~DllImported();
+};
+
+struct VirtualDerived : virtual ForwardDeclared {
+ ~VirtualDerived() override;
+};
+
+struct DeclaredCtorDefinedDtor {
+ DeclaredCtorDefinedDtor();
+ virtual ~DeclaredCtorDefinedDtor() {}
+};
+
+struct TemplateNotAllocated {
+ TemplateNotAllocated();
+ virtual ~TemplateNotAllocated();
+};
+
+struct TemplateAllocated {
+ TemplateAllocated();
+ virtual ~TemplateAllocated();
+};
+
+template <int T>
+void allocate() {
+ TemplateNotAllocated *arr = new TemplateNotAllocated[T];
+}
+
+template <typename T>
+void actuallyAllocate() {
+ T *arr = new T[10];
+ delete[] arr;
+}
+
+void cases() {
+ ForwardDeclared *arr = new ForwardDeclared[5];
+ DefinedInTU *arr1 = new DefinedInTU[5];
+ NonVirtualDtor *arr2 = new NonVirtualDtor[5];
+ NoDtor *arr3 = new NoDtor[5];
+ ForwardDeclared *arr4 = new DeclDerived[5];
+ InlineDefaulted *arr5 = new InlineDefaulted[5];
+ OutOfLineDefaulted *arr6 = new OutOfLineDefaulted[5];
+ ImplicitVDtorDerived *arr7 = new ImplicitVDtorDerived[5];
+ DllImported *arr8 = new DllImported[5];
+ VirtualDerived *arr9 = new VirtualDerived[3];
+ DeclaredCtorDefinedDtor *arr10 = new DeclaredCtorDefinedDtor[5];
+ actuallyAllocate<TemplateAllocated>();
+}
+
+
+// CHECK-DAG: declare dso_local void @"??1ForwardDeclared@@UEAA at XZ"(
+// CHECK-DAG: define weak dso_local noundef ptr @"??_EForwardDeclared@@UEAAPEAXI at Z"(
+// CHECK-DAG: define dso_local void @"??1DefinedInTU@@UEAA at XZ"(
+// CHECK-DAG: define weak dso_local noundef ptr @"??_EDefinedInTU@@UEAAPEAXI at Z"(
+// CHECK-DAG: define weak dso_local noundef ptr @"??_EDeclDerived@@UEAAPEAXI at Z"(
+// CHECK-DAG: declare dso_local void @"??1DeclDerived@@UEAA at XZ"(
+// CHECK-DAG: define weak dso_local noundef ptr @"??_EInlineDefaulted@@UEAAPEAXI at Z"(
+// CHECK-DAG: define weak dso_local noundef ptr @"??_EOutOfLineDefaulted@@UEAAPEAXI at Z"(
+// CHECK-DAG: declare dso_local void @"??1?$Container at H@@UEAA at XZ"(
+// CHECK-DAG: define weak dso_local noundef ptr @"??_E?$Container at H@@UEAAPEAXI at Z"(
+// CHECK-DAG: define weak dso_local noundef ptr @"??_EImplicitVDtorDerived@@UEAAPEAXI at Z"(
+// CHECK-DAG: declare dllimport void @"??1DllImported@@UEAA at XZ"(
+// CHECK-DAG: define weak dso_local noundef ptr @"??_EDllImported@@UEAAPEAXI at Z"(
+// CHECK-DAG: define weak dso_local noundef ptr @"??_EVirtualDerived@@UEAAPEAXI at Z"(
+// CHECK-DAG: define weak dso_local noundef ptr @"??_EDeclaredCtorDefinedDtor@@UEAAPEAXI at Z"(
+// CHECK-DAG: declare dso_local void @"??1TemplateAllocated@@UEAA at XZ"(
+// CHECK-DAG: define weak dso_local noundef ptr @"??_ETemplateAllocated@@UEAAPEAXI at Z"(
+// CHECK-NOT: @"??_ETemplateNotAllocated@@
+// CHECK-NOT: @"??_ENonVirtualDtor@@
+// CHECK-NOT: @"??_ENoDtor@@
+
+DefinedInTU::~DefinedInTU() {}
diff --git a/clang/test/CodeGenHLSL/resources/res-array-global-unbounded.hlsl b/clang/test/CodeGenHLSL/resources/res-array-global-unbounded.hlsl
index 6756a26bfc124..66bf71b3b0a35 100644
--- a/clang/test/CodeGenHLSL/resources/res-array-global-unbounded.hlsl
+++ b/clang/test/CodeGenHLSL/resources/res-array-global-unbounded.hlsl
@@ -30,14 +30,14 @@ void main(uint GI : SV_GroupIndex) {
// and explicit binding (u10, space1)
// CHECK: @hlsl::RWBuffer<float>::__createFromBinding(unsigned int, unsigned int, int, unsigned int, char const*)
// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer.0") align {{(4|8)}} %[[Tmp0]],
- // CHECK-SAME: i32 noundef 10, i32 noundef 1, i32 noundef -1, i32 noundef 100, ptr noundef @A.str)
+ // CHECK-SAME: i32 noundef 10, i32 noundef 1, i32 noundef 0, i32 noundef 100, ptr noundef @A.str)
// CHECK-NEXT: %[[BufPtr:.*]] = call {{.*}} ptr{{.*}} @hlsl::RWBuffer<float>::operator[](unsigned int)(ptr {{.*}} %[[Tmp0]], i32 noundef 0)
// CHECK-NEXT: %[[Value1:.*]] = load float, ptr{{.*}} %[[BufPtr]], align 4
// CHECK-NEXT: store float %[[Value1]], ptr %a, align 4
float a = A[100][0];
// Make sure B[2][3] is translated to a local RWBuffer<int>[4] array where each array element
- // is initialized by a constructor call with range -1 and index 52-55 and implicit binding
+ // is initialized by a constructor call with range 0 and index 52-55 and implicit binding
// (space 0, order_id 0)
// The first index is calculated from the array dimensions (unbounded x 5 x 4) and indices (2, 3)
// as 2 * 5 * 4 + 3 * 4 = 52 and the following indices are sequential.
@@ -45,22 +45,22 @@ void main(uint GI : SV_GroupIndex) {
// CHECK-NEXT: %[[Ptr_Tmp2_0:.*]] = getelementptr [4 x %"class.hlsl::RWBuffer"], ptr %[[Tmp2]], i32 0, i32 0
// CHECK-NEXT: call void @hlsl::RWBuffer<int>::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*)
// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align {{(4|8)}} %[[Ptr_Tmp2_0]],
- // CHECK-SAME: i32 noundef 0, i32 noundef 0, i32 noundef -1, i32 noundef 52, ptr noundef @[[BufB]])
+ // CHECK-SAME: i32 noundef 0, i32 noundef 0, i32 noundef 0, i32 noundef 52, ptr noundef @[[BufB]])
// CHECK-NEXT: %[[Ptr_Tmp2_1:.*]] = getelementptr [4 x %"class.hlsl::RWBuffer"], ptr %[[Tmp2]], i32 0, i32 1
// CHECK-NEXT: call void @hlsl::RWBuffer<int>::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*)
// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align {{(4|8)}} %[[Ptr_Tmp2_1]],
- // CHECK-SAME: i32 noundef 0, i32 noundef 0, i32 noundef -1, i32 noundef 53, ptr noundef @[[BufB]])
+ // CHECK-SAME: i32 noundef 0, i32 noundef 0, i32 noundef 0, i32 noundef 53, ptr noundef @[[BufB]])
// CHECK-NEXT: %[[Ptr_Tmp2_2:.*]] = getelementptr [4 x %"class.hlsl::RWBuffer"], ptr %[[Tmp2]], i32 0, i32 2
// CHECK-NEXT: call void @hlsl::RWBuffer<int>::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*)
// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align {{(4|8)}} %[[Ptr_Tmp2_2]],
- // CHECK-SAME: i32 noundef 0, i32 noundef 0, i32 noundef -1, i32 noundef 54, ptr noundef @[[BufB]])
+ // CHECK-SAME: i32 noundef 0, i32 noundef 0, i32 noundef 0, i32 noundef 54, ptr noundef @[[BufB]])
// CHECK-NEXT: %[[Ptr_Tmp2_3:.*]] = getelementptr [4 x %"class.hlsl::RWBuffer"], ptr %[[Tmp2]], i32 0, i32 3
// CHECK-NEXT: call void @hlsl::RWBuffer<int>::__createFromImplicitBinding(unsigned int, unsigned int, int, unsigned int, char const*)
// CHECK-SAME: (ptr {{.*}} sret(%"class.hlsl::RWBuffer") align {{(4|8)}} %[[Ptr_Tmp2_3]],
- // CHECK-SAME: i32 noundef 0, i32 noundef 0, i32 noundef -1, i32 noundef 55, ptr noundef @[[BufB]])
+ // CHECK-SAME: i32 noundef 0, i32 noundef 0, i32 noundef 0, i32 noundef 55, ptr noundef @[[BufB]])
// DXIL-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 %[[Tmp1]], ptr align 4 %[[Tmp2]], i32 16, i1 false)
// SPV-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %[[Tmp1]], ptr align 8 %[[Tmp2]], i64 32, i1 false)
diff --git a/clang/test/CodeGenSPIRV/global-dtor.cpp b/clang/test/CodeGenSPIRV/global-dtor.cpp
new file mode 100644
index 0000000000000..da3b1e333a80e
--- /dev/null
+++ b/clang/test/CodeGenSPIRV/global-dtor.cpp
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -triple spirv64-intel %s -emit-llvm -o - | FileCheck %s
+
+// CHECK: call spir_func addrspace(9) i32 @__cxa_atexit(ptr addrspace(4) addrspacecast (ptr addrspace(9) @{{.*}} to ptr addrspace(4)),
+struct S {
+ ~S() {}
+};
+S s;
diff --git a/clang/test/Driver/autocomplete.c b/clang/test/Driver/autocomplete.c
index 1fd60929751ee..297d2e40d6d83 100644
--- a/clang/test/Driver/autocomplete.c
+++ b/clang/test/Driver/autocomplete.c
@@ -69,6 +69,7 @@
// FNOSANICOVERALL-NEXT: trace-div
// FNOSANICOVERALL-NEXT: trace-gep
// FNOSANICOVERALL-NEXT: trace-pc
+// FNOSANICOVERALL-NEXT: trace-pc-entry-exit
// FNOSANICOVERALL-NEXT: trace-pc-guard
// RUN: %clang --autocomplete=-ffp-contract= | FileCheck %s -check-prefix=FFPALL
// FFPALL: fast
diff --git a/clang/test/Driver/cl-x86-flags.c b/clang/test/Driver/cl-x86-flags.c
index 5b32f17774e27..05960dd07612b 100644
--- a/clang/test/Driver/cl-x86-flags.c
+++ b/clang/test/Driver/cl-x86-flags.c
@@ -222,7 +222,7 @@ void f(void) {
// RUN: %clang_cl --target=x86_64-pc-windows -mapxf -### -- 2>&1 %s | FileCheck -check-prefix=APXF %s
// RUN: %clang_cl --target=x86_64-pc-windows -mapxf -mno-apxf -### -- 2>&1 %s | FileCheck -check-prefix=NO-APXF %s
-// RUN: %clang_cl --target=x86_64-pc-windows -mapx-features=egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu -### -- 2>&1 %s | FileCheck -check-prefix=APXALL %s
-// APXF: "-target-feature" "+egpr" "-target-feature" "+ndd" "-target-feature" "+ccmp" "-target-feature" "+nf" "-target-feature" "+zu"
-// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-ndd" "-target-feature" "-ccmp" "-target-feature" "-nf" "-target-feature" "-zu"
-// APXALL: "-target-feature" "+egpr" "-target-feature" "+push2pop2" "-target-feature" "+ppx" "-target-feature" "+ndd" "-target-feature" "+ccmp" "-target-feature" "+nf" "-target-feature" "+cf" "-target-feature" "+zu"
+// RUN: %clang_cl --target=x86_64-pc-windows -mapx-features=egpr,push2pop2,ppx,ndd,ccmp,nf,cf,zu,jmpabs -### -- 2>&1 %s | FileCheck -check-prefix=APXALL %s
+// APXF: "-target-feature" "+egpr" "-target-feature" "+ndd" "-target-feature" "+ccmp" "-target-feature" "+nf" "-target-feature" "+zu" "-target-feature" "+jmpabs"
+// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-ndd" "-target-feature" "-ccmp" "-target-feature" "-nf" "-target-feature" "-zu" "-target-feature" "-jmpabs"
+// APXALL: "-target-feature" "+egpr" "-target-feature" "+push2pop2" "-target-feature" "+ppx" "-target-feature" "+ndd" "-target-feature" "+ccmp" "-target-feature" "+nf" "-target-feature" "+cf" "-target-feature" "+zu" "-target-feature" "+jmpabs"
diff --git a/clang/test/Driver/darwin-ld-platform-version-macos-nosdk.c b/clang/test/Driver/darwin-ld-platform-version-macos-nosdk.c
new file mode 100644
index 0000000000000..76f601a2f2679
--- /dev/null
+++ b/clang/test/Driver/darwin-ld-platform-version-macos-nosdk.c
@@ -0,0 +1,17 @@
+// UNSUPPORTED: xcselect
+// With CLANG_USE_XCSELECT, an SDK will always be inferred.
+
+// RUN: touch %t.o
+
+// RUN: %clang -target x86_64-apple-macos10.13 -mlinker-version=520 \
+// RUN: -### %t.o 2>&1 \
+// RUN: | FileCheck --check-prefix=NOSDK %s
+// RUN: %clang -target x86_64-apple-darwin17 -mlinker-version=520 \
+// RUN: -### %t.o 2>&1 \
+// RUN: | FileCheck --check-prefix=NOSDK %s
+// NOSDK: "-platform_version" "macos" "10.13.0" "10.13.0"
+
+// RUN: %clang -target arm64-apple-macos26 -mlinker-version=520 \
+// RUN: -### %t.o 2>&1 \
+// RUN: | FileCheck --check-prefix=VERSION_BUMP %s
+// VERSION_BUMP: "-platform_version" "macos" "26.0.0" "26.0.0"
diff --git a/clang/test/Driver/darwin-ld-platform-version-macos.c b/clang/test/Driver/darwin-ld-platform-version-macos.c
index bdd80c8360402..72c50d0b32b44 100644
--- a/clang/test/Driver/darwin-ld-platform-version-macos.c
+++ b/clang/test/Driver/darwin-ld-platform-version-macos.c
@@ -40,16 +40,3 @@
// ARM64_NEW: "-platform_version" "macos" "11.0.0" "10.15"
// ARM64_NEW_1: "-platform_version" "macos" "11.1.0" "10.15"
// ARM64_OLD: "-macosx_version_min" "11.0.0"
-
-// RUN: %clang -target x86_64-apple-macos10.13 -mlinker-version=520 \
-// RUN: -### %t.o 2>&1 \
-// RUN: | FileCheck --check-prefix=NOSDK %s
-// RUN: %clang -target x86_64-apple-darwin17 -mlinker-version=520 \
-// RUN: -### %t.o 2>&1 \
-// RUN: | FileCheck --check-prefix=NOSDK %s
-// NOSDK: "-platform_version" "macos" "10.13.0" "10.13.0"
-
-// RUN: %clang -target arm64-apple-macos26 -mlinker-version=520 \
-// RUN: -### %t.o 2>&1 \
-// RUN: | FileCheck --check-prefix=VERSION_BUMP %s
-// VERSION_BUMP: "-platform_version" "macos" "26.0.0" "26.0.0"
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-sifive-x160.c b/clang/test/Driver/print-enabled-extensions/riscv-sifive-x160.c
new file mode 100644
index 0000000000000..814c4f4528564
--- /dev/null
+++ b/clang/test/Driver/print-enabled-extensions/riscv-sifive-x160.c
@@ -0,0 +1,59 @@
+// REQUIRES: riscv-registered-target
+// RUN: %clang --target=riscv32 -mcpu=sifive-x160 -menable-experimental-extensions --print-enabled-extensions | FileCheck %s
+
+// CHECK: Name Version Description
+// CHECK-NEXT: i 2.1 'I' (Base Integer Instruction Set)
+// CHECK-NEXT: m 2.0 'M' (Integer Multiplication and Division)
+// CHECK-NEXT: a 2.1 'A' (Atomic Instructions)
+// CHECK-NEXT: f 2.2 'F' (Single-Precision Floating-Point)
+// CHECK-NEXT: c 2.0 'C' (Compressed Instructions)
+// CHECK-NEXT: b 1.0 'B' (the collection of the Zba, Zbb, Zbs extensions)
+// CHECK-NEXT: zicbom 1.0 'Zicbom' (Cache-Block Management Instructions)
+// CHECK-NEXT: zicbop 1.0 'Zicbop' (Cache-Block Prefetch Instructions)
+// CHECK-NEXT: zicboz 1.0 'Zicboz' (Cache-Block Zero Instructions)
+// CHECK-NEXT: zicond 1.0 'Zicond' (Integer Conditional Operations)
+// CHECK-NEXT: zicsr 2.0 'Zicsr' (CSRs)
+// CHECK-NEXT: zifencei 2.0 'Zifencei' (fence.i)
+// CHECK-NEXT: zihintntl 1.0 'Zihintntl' (Non-Temporal Locality Hints)
+// CHECK-NEXT: zihintpause 2.0 'Zihintpause' (Pause Hint)
+// CHECK-NEXT: zihpm 2.0 'Zihpm' (Hardware Performance Counters)
+// CHECK-NEXT: zimop 1.0 'Zimop' (May-Be-Operations)
+// CHECK-NEXT: zmmul 1.0 'Zmmul' (Integer Multiplication)
+// CHECK-NEXT: zaamo 1.0 'Zaamo' (Atomic Memory Operations)
+// CHECK-NEXT: zalrsc 1.0 'Zalrsc' (Load-Reserved/Store-Conditional)
+// CHECK-NEXT: zawrs 1.0 'Zawrs' (Wait on Reservation Set)
+// CHECK-NEXT: zfa 1.0 'Zfa' (Additional Floating-Point)
+// CHECK-NEXT: zfbfmin 1.0 'Zfbfmin' (Scalar BF16 Converts)
+// CHECK-NEXT: zfh 1.0 'Zfh' (Half-Precision Floating-Point)
+// CHECK-NEXT: zfhmin 1.0 'Zfhmin' (Half-Precision Floating-Point Minimal)
+// CHECK-NEXT: zca 1.0 'Zca' (part of the C extension, excluding compressed floating point loads/stores)
+// CHECK-NEXT: zcb 1.0 'Zcb' (Compressed basic bit manipulation instructions)
+// CHECK-NEXT: zce 1.0 'Zce' (Compressed extensions for microcontrollers)
+// CHECK-NEXT: zcf 1.0 'Zcf' (Compressed Single-Precision Floating-Point Instructions)
+// CHECK-NEXT: zcmop 1.0 'Zcmop' (Compressed May-Be-Operations)
+// CHECK-NEXT: zcmp 1.0 'Zcmp' (sequenced instructions for code-size reduction)
+// CHECK-NEXT: zcmt 1.0 'Zcmt' (table jump instructions for code-size reduction)
+// CHECK-NEXT: zba 1.0 'Zba' (Address Generation Instructions)
+// CHECK-NEXT: zbb 1.0 'Zbb' (Basic Bit-Manipulation)
+// CHECK-NEXT: zbs 1.0 'Zbs' (Single-Bit Instructions)
+// CHECK-NEXT: zkt 1.0 'Zkt' (Data Independent Execution Latency)
+// CHECK-NEXT: zvbb 1.0 'Zvbb' (Vector basic bit-manipulation instructions)
+// CHECK-NEXT: zve32f 1.0 'Zve32f' (Vector Extensions for Embedded Processors with maximal 32 EEW and F extension)
+// CHECK-NEXT: zve32x 1.0 'Zve32x' (Vector Extensions for Embedded Processors with maximal 32 EEW)
+// CHECK-NEXT: zvfbfmin 1.0 'Zvfbfmin' (Vector BF16 Converts)
+// CHECK-NEXT: zvfbfwma 1.0 'Zvfbfwma' (Vector BF16 widening mul-add)
+// CHECK-NEXT: zvfh 1.0 'Zvfh' (Vector Half-Precision Floating-Point)
+// CHECK-NEXT: zvfhmin 1.0 'Zvfhmin' (Vector Half-Precision Floating-Point Minimal)
+// CHECK-NEXT: zvkb 1.0 'Zvkb' (Vector Bit-manipulation used in Cryptography)
+// CHECK-NEXT: zvkt 1.0 'Zvkt' (Vector Data-Independent Execution Latency)
+// CHECK-NEXT: zvl128b 1.0 'Zvl128b' (Minimum Vector Length 128)
+// CHECK-NEXT: zvl32b 1.0 'Zvl32b' (Minimum Vector Length 32)
+// CHECK-NEXT: zvl64b 1.0 'Zvl64b' (Minimum Vector Length 64)
+// CHECK-NEXT: xsfcease 1.0 'XSfcease' (SiFive sf.cease Instruction)
+// CHECK-EMPTY:
+// CHECK-NEXT: Experimental extensions
+// CHECK-NEXT: zicfilp 1.0 'Zicfilp' (Landing pad)
+// CHECK-NEXT: zvdot4a8i 0.1 'Zvdot4a8i' (Vector 4-element Dot Product of packed 8-bit Integers)
+// CHECK-NEXT: zvfbfa 0.1 'Zvfbfa' (Additional BF16 vector compute support)
+// CHECK-EMPTY:
+// CHECK-NEXT: ISA String: rv32i2p1_m2p0_a2p1_f2p2_c2p0_b1p0_zicbom1p0_zicbop1p0_zicboz1p0_zicfilp1p0_zicond1p0_zicsr2p0_zifencei2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_zaamo1p0_zalrsc1p0_zawrs1p0_zfa1p0_zfbfmin1p0_zfh1p0_zfhmin1p0_zca1p0_zcb1p0_zce1p0_zcf1p0_zcmop1p0_zcmp1p0_zcmt1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zvdot4a8i0p1_zve32f1p0_zve32x1p0_zvfbfa0p1_zvfbfmin1p0_zvfbfwma1p0_zvfh1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_xsfcease1p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-sifive-x180.c b/clang/test/Driver/print-enabled-extensions/riscv-sifive-x180.c
new file mode 100644
index 0000000000000..1e992dd610682
--- /dev/null
+++ b/clang/test/Driver/print-enabled-extensions/riscv-sifive-x180.c
@@ -0,0 +1,71 @@
+// REQUIRES: riscv-registered-target
+// RUN: %clang --target=riscv64 -mcpu=sifive-x180 -menable-experimental-extensions --print-enabled-extensions | FileCheck %s
+
+// CHECK: Name Version Description
+// CHECK-NEXT: i 2.1 'I' (Base Integer Instruction Set)
+// CHECK-NEXT: m 2.0 'M' (Integer Multiplication and Division)
+// CHECK-NEXT: a 2.1 'A' (Atomic Instructions)
+// CHECK-NEXT: f 2.2 'F' (Single-Precision Floating-Point)
+// CHECK-NEXT: d 2.2 'D' (Double-Precision Floating-Point)
+// CHECK-NEXT: c 2.0 'C' (Compressed Instructions)
+// CHECK-NEXT: b 1.0 'B' (the collection of the Zba, Zbb, Zbs extensions)
+// CHECK-NEXT: v 1.0 'V' (Vector Extension for Application Processors)
+// CHECK-NEXT: zic64b 1.0 'Zic64b' (Cache Block Size Is 64 Bytes)
+// CHECK-NEXT: zicbom 1.0 'Zicbom' (Cache-Block Management Instructions)
+// CHECK-NEXT: zicbop 1.0 'Zicbop' (Cache-Block Prefetch Instructions)
+// CHECK-NEXT: zicboz 1.0 'Zicboz' (Cache-Block Zero Instructions)
+// CHECK-NEXT: ziccamoa 1.0 'Ziccamoa' (Main Memory Supports All Atomics in A)
+// CHECK-NEXT: ziccif 1.0 'Ziccif' (Main Memory Supports Instruction Fetch with Atomicity Requirement)
+// CHECK-NEXT: ziccrse 1.0 'Ziccrse' (Main Memory Supports Forward Progress on LR/SC Sequences)
+// CHECK-NEXT: zicond 1.0 'Zicond' (Integer Conditional Operations)
+// CHECK-NEXT: zicsr 2.0 'Zicsr' (CSRs)
+// CHECK-NEXT: zifencei 2.0 'Zifencei' (fence.i)
+// CHECK-NEXT: zihintntl 1.0 'Zihintntl' (Non-Temporal Locality Hints)
+// CHECK-NEXT: zihintpause 2.0 'Zihintpause' (Pause Hint)
+// CHECK-NEXT: zihpm 2.0 'Zihpm' (Hardware Performance Counters)
+// CHECK-NEXT: zimop 1.0 'Zimop' (May-Be-Operations)
+// CHECK-NEXT: zmmul 1.0 'Zmmul' (Integer Multiplication)
+// CHECK-NEXT: za64rs 1.0 'Za64rs' (Reservation Set Size of at Most 64 Bytes)
+// CHECK-NEXT: zaamo 1.0 'Zaamo' (Atomic Memory Operations)
+// CHECK-NEXT: zalrsc 1.0 'Zalrsc' (Load-Reserved/Store-Conditional)
+// CHECK-NEXT: zawrs 1.0 'Zawrs' (Wait on Reservation Set)
+// CHECK-NEXT: zfa 1.0 'Zfa' (Additional Floating-Point)
+// CHECK-NEXT: zfbfmin 1.0 'Zfbfmin' (Scalar BF16 Converts)
+// CHECK-NEXT: zfh 1.0 'Zfh' (Half-Precision Floating-Point)
+// CHECK-NEXT: zfhmin 1.0 'Zfhmin' (Half-Precision Floating-Point Minimal)
+// CHECK-NEXT: zca 1.0 'Zca' (part of the C extension, excluding compressed floating point loads/stores)
+// CHECK-NEXT: zcb 1.0 'Zcb' (Compressed basic bit manipulation instructions)
+// CHECK-NEXT: zcd 1.0 'Zcd' (Compressed Double-Precision Floating-Point Instructions)
+// CHECK-NEXT: zcmop 1.0 'Zcmop' (Compressed May-Be-Operations)
+// CHECK-NEXT: zba 1.0 'Zba' (Address Generation Instructions)
+// CHECK-NEXT: zbb 1.0 'Zbb' (Basic Bit-Manipulation)
+// CHECK-NEXT: zbs 1.0 'Zbs' (Single-Bit Instructions)
+// CHECK-NEXT: zkt 1.0 'Zkt' (Data Independent Execution Latency)
+// CHECK-NEXT: zvbb 1.0 'Zvbb' (Vector basic bit-manipulation instructions)
+// CHECK-NEXT: zve32f 1.0 'Zve32f' (Vector Extensions for Embedded Processors with maximal 32 EEW and F extension)
+// CHECK-NEXT: zve32x 1.0 'Zve32x' (Vector Extensions for Embedded Processors with maximal 32 EEW)
+// CHECK-NEXT: zve64d 1.0 'Zve64d' (Vector Extensions for Embedded Processors with maximal 64 EEW, F and D extension)
+// CHECK-NEXT: zve64f 1.0 'Zve64f' (Vector Extensions for Embedded Processors with maximal 64 EEW and F extension)
+// CHECK-NEXT: zve64x 1.0 'Zve64x' (Vector Extensions for Embedded Processors with maximal 64 EEW)
+// CHECK-NEXT: zvfbfmin 1.0 'Zvfbfmin' (Vector BF16 Converts)
+// CHECK-NEXT: zvfbfwma 1.0 'Zvfbfwma' (Vector BF16 widening mul-add)
+// CHECK-NEXT: zvfh 1.0 'Zvfh' (Vector Half-Precision Floating-Point)
+// CHECK-NEXT: zvfhmin 1.0 'Zvfhmin' (Vector Half-Precision Floating-Point Minimal)
+// CHECK-NEXT: zvkb 1.0 'Zvkb' (Vector Bit-manipulation used in Cryptography)
+// CHECK-NEXT: zvkt 1.0 'Zvkt' (Vector Data-Independent Execution Latency)
+// CHECK-NEXT: zvl128b 1.0 'Zvl128b' (Minimum Vector Length 128)
+// CHECK-NEXT: zvl32b 1.0 'Zvl32b' (Minimum Vector Length 32)
+// CHECK-NEXT: zvl64b 1.0 'Zvl64b' (Minimum Vector Length 64)
+// CHECK-NEXT: xsfcease 1.0 'XSfcease' (SiFive sf.cease Instruction)
+// CHECK-NEXT: xsfvfbfexp16e 0.5 'XSfvfbfexp16e' (SiFive Vector Floating-Point Exponential Function Instruction, BFloat16)
+// CHECK-NEXT: xsfvfexp16e 0.5 'XSfvfexp16e' (SiFive Vector Floating-Point Exponential Function Instruction, Half Precision)
+// CHECK-NEXT: xsfvfexp32e 0.5 'XSfvfexp32e' (SiFive Vector Floating-Point Exponential Function Instruction, Single Precision)
+// CHECK-NEXT: xsfvfexpa 0.2 'XSfvfexpa' (SiFive Vector Floating-Point Exponential Approximation Instruction)
+// CHECK-NEXT: xsifivecflushdlone 1.0 'XSiFivecflushdlone' (SiFive sf.cflush.d.l1 Instruction)
+// CHECK-EMPTY:
+// CHECK-NEXT: Experimental extensions
+// CHECK-NEXT: zicfilp 1.0 'Zicfilp' (Landing pad)
+// CHECK-NEXT: zvdot4a8i 0.1 'Zvdot4a8i' (Vector 4-element Dot Product of packed 8-bit Integers)
+// CHECK-NEXT: zvfbfa 0.1 'Zvfbfa' (Additional BF16 vector compute support)
+// CHECK-EMPTY:
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_v1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_ziccrse1p0_zicfilp1p0_zicond1p0_zicsr2p0_zifencei2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zaamo1p0_zalrsc1p0_zawrs1p0_zfa1p0_zfbfmin1p0_zfh1p0_zfhmin1p0_zca1p0_zcb1p0_zcd1p0_zcmop1p0_zba1p0_zbb1p0_zbs1p0_zkt1p0_zvbb1p0_zvdot4a8i0p1_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfbfa0p1_zvfbfmin1p0_zvfbfwma1p0_zvfh1p0_zvfhmin1p0_zvkb1p0_zvkt1p0_zvl128b1p0_zvl32b1p0_zvl64b1p0_xsfcease1p0_xsfvfbfexp16e0p5_xsfvfexp16e0p5_xsfvfexp32e0p5_xsfvfexpa0p2_xsifivecflushdlone1p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-spacemit-x100.c b/clang/test/Driver/print-enabled-extensions/riscv-spacemit-x100.c
index e1d7ac6f2d781..8c814cfbdf8a6 100644
--- a/clang/test/Driver/print-enabled-extensions/riscv-spacemit-x100.c
+++ b/clang/test/Driver/print-enabled-extensions/riscv-spacemit-x100.c
@@ -12,6 +12,7 @@
// CHECK-NEXT: c 2.0 'C' (Compressed Instructions)
// CHECK-NEXT: b 1.0 'B' (the collection of the Zba, Zbb, Zbs extensions)
// CHECK-NEXT: v 1.0 'V' (Vector Extension for Application Processors)
+// CHECK-NEXT: h 1.0 'H' (Hypervisor)
// CHECK-NEXT: zic64b 1.0 'Zic64b' (Cache Block Size Is 64 Bytes)
// CHECK-NEXT: zicbom 1.0 'Zicbom' (Cache-Block Management Instructions)
// CHECK-NEXT: zicbop 1.0 'Zicbop' (Cache-Block Prefetch Instructions)
@@ -23,6 +24,7 @@
// CHECK-NEXT: zicntr 2.0 'Zicntr' (Base Counters and Timers)
// CHECK-NEXT: zicond 1.0 'Zicond' (Integer Conditional Operations)
// CHECK-NEXT: zicsr 2.0 'Zicsr' (CSRs)
+// CHECK-NEXT: zifencei 2.0 'Zifencei' (fence.i)
// CHECK-NEXT: zihintntl 1.0 'Zihintntl' (Non-Temporal Locality Hints)
// CHECK-NEXT: zihintpause 2.0 'Zihintpause' (Pause Hint)
// CHECK-NEXT: zihpm 2.0 'Zihpm' (Hardware Performance Counters)
@@ -75,9 +77,34 @@
// CHECK-NEXT: zvl256b 1.0 'Zvl256b' (Minimum Vector Length 256)
// CHECK-NEXT: zvl32b 1.0 'Zvl32b' (Minimum Vector Length 32)
// CHECK-NEXT: zvl64b 1.0 'Zvl64b' (Minimum Vector Length 64)
+// CHECK-NEXT: sha 1.0 'Sha' (Augmented Hypervisor)
+// CHECK-NEXT: shcounterenw 1.0 'Shcounterenw' (Support writeable hcounteren enable bit for any hpmcounter that is not read-only zero)
+// CHECK-NEXT: shgatpa 1.0 'Shgatpa' (SvNNx4 mode supported for all modes supported by satp, as well as Bare)
+// CHECK-NEXT: shtvala 1.0 'Shtvala' (htval provides all needed values)
+// CHECK-NEXT: shvsatpa 1.0 'Shvsatpa' (vsatp supports all modes supported by satp)
+// CHECK-NEXT: shvstvala 1.0 'Shvstvala' (vstval provides all needed values)
+// CHECK-NEXT: shvstvecd 1.0 'Shvstvecd' (vstvec supports Direct mode)
+// CHECK-NEXT: smepmp 1.0 'Smepmp' (Enhanced Physical Memory Protection)
+// CHECK-NEXT: smnpm 1.0 'Smnpm' (Machine-level Pointer Masking for next lower privilege mode)
+// CHECK-NEXT: smstateen 1.0 'Smstateen' (Machine-mode view of the state-enable extension)
+// CHECK-NEXT: ssccptr 1.0 'Ssccptr' (Main memory supports page table reads)
+// CHECK-NEXT: sscofpmf 1.0 'Sscofpmf' (Count Overflow and Mode-Based Filtering)
+// CHECK-NEXT: sscounterenw 1.0 'Sscounterenw' (Support writeable scounteren enable bit for any hpmcounter that is not read-only zero)
+// CHECK-NEXT: ssnpm 1.0 'Ssnpm' (Supervisor-level Pointer Masking for next lower privilege mode)
+// CHECK-NEXT: sspm 1.0 'Sspm' (Indicates Supervisor-mode Pointer Masking)
+// CHECK-NEXT: ssstateen 1.0 'Ssstateen' (Supervisor-mode view of the state-enable extension)
+// CHECK-NEXT: sstc 1.0 'Sstc' (Supervisor-mode timer interrupts)
+// CHECK-NEXT: sstvala 1.0 'Sstvala' (stval provides all needed values)
+// CHECK-NEXT: sstvecd 1.0 'Sstvecd' (stvec supports Direct mode)
+// CHECK-NEXT: ssu64xl 1.0 'Ssu64xl' (UXLEN=64 supported)
// CHECK-NEXT: supm 1.0 'Supm' (Indicates User-mode Pointer Masking)
+// CHECK-NEXT: svade 1.0 'Svade' (Raise exceptions on improper A/D bits)
+// CHECK-NEXT: svbare 1.0 'Svbare' (satp mode Bare supported)
+// CHECK-NEXT: svinval 1.0 'Svinval' (Fine-Grained Address-Translation Cache Invalidation)
+// CHECK-NEXT: svnapot 1.0 'Svnapot' (NAPOT Translation Contiguity)
+// CHECK-NEXT: svpbmt 1.0 'Svpbmt' (Page-Based Memory Types)
// CHECK-NEXT: xsmtvdot 1.0 'XSMTVDot' (SpacemiT Vector Dot Product Extension)
// CHECK-EMPTY:
// CHECK-NEXT: Experimental extensions
// CHECK-EMPTY:
-// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_v1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zaamo1p0_zalrsc1p0_zawrs1p0_zfa1p0_zfbfmin1p0_zfh1p0_zfhmin1p0_zca1p0_zcb1p0_zcd1p0_zcmop1p0_zba1p0_zbb1p0_zbc1p0_zbkc1p0_zbs1p0_zkt1p0_zvbb1p0_zvbc1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfbfmin1p0_zvfbfwma1p0_zvfh1p0_zvfhmin1p0_zvkb1p0_zvkg1p0_zvkn1p0_zvknc1p0_zvkned1p0_zvkng1p0_zvknha1p0_zvknhb1p0_zvks1p0_zvksc1p0_zvksed1p0_zvksg1p0_zvksh1p0_zvkt1p0_zvl128b1p0_zvl256b1p0_zvl32b1p0_zvl64b1p0_supm1p0_xsmtvdot1p0
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_v1p0_h1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zifencei2p0_zihintntl1p0_zihintpause2p0_zihpm2p0_zimop1p0_zmmul1p0_za64rs1p0_zaamo1p0_zalrsc1p0_zawrs1p0_zfa1p0_zfbfmin1p0_zfh1p0_zfhmin1p0_zca1p0_zcb1p0_zcd1p0_zcmop1p0_zba1p0_zbb1p0_zbc1p0_zbkc1p0_zbs1p0_zkt1p0_zvbb1p0_zvbc1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfbfmin1p0_zvfbfwma1p0_zvfh1p0_zvfhmin1p0_zvkb1p0_zvkg1p0_zvkn1p0_zvknc1p0_zvkned1p0_zvkng1p0_zvknha1p0_zvknhb1p0_zvks1p0_zvksc1p0_zvksed1p0_zvksg1p0_zvksh1p0_zvkt1p0_zvl128b1p0_zvl256b1p0_zvl32b1p0_zvl64b1p0_sha1p0_shcounterenw1p0_shgatpa1p0_shtvala1p0_shvsatpa1p0_shvstvala1p0_shvstvecd1p0_smepmp1p0_smnpm1p0_smstateen1p0_ssccptr1p0_sscofpmf1p0_sscounterenw1p0_ssnpm1p0_sspm1p0_ssstateen1p0_sstc1p0_sstvala1p0_sstvecd1p0_ssu64xl1p0_supm1p0_svade1p0_svbare1p0_svinval1p0_svnapot1p0_svpbmt1p0_xsmtvdot1p0
diff --git a/clang/test/Driver/print-enabled-extensions/riscv-spacemit-x60.c b/clang/test/Driver/print-enabled-extensions/riscv-spacemit-x60.c
new file mode 100644
index 0000000000000..b2cdf2d0e58c1
--- /dev/null
+++ b/clang/test/Driver/print-enabled-extensions/riscv-spacemit-x60.c
@@ -0,0 +1,70 @@
+// REQUIRES: riscv-registered-target
+// RUN: %clang --target=riscv64 -mcpu=spacemit-x60 --print-enabled-extensions | FileCheck %s
+
+// CHECK: Extensions enabled for the given RISC-V target
+// CHECK-EMPTY:
+// CHECK-NEXT: Name Version Description
+// CHECK-NEXT: i 2.1 'I' (Base Integer Instruction Set)
+// CHECK-NEXT: m 2.0 'M' (Integer Multiplication and Division)
+// CHECK-NEXT: a 2.1 'A' (Atomic Instructions)
+// CHECK-NEXT: f 2.2 'F' (Single-Precision Floating-Point)
+// CHECK-NEXT: d 2.2 'D' (Double-Precision Floating-Point)
+// CHECK-NEXT: c 2.0 'C' (Compressed Instructions)
+// CHECK-NEXT: b 1.0 'B' (the collection of the Zba, Zbb, Zbs extensions)
+// CHECK-NEXT: v 1.0 'V' (Vector Extension for Application Processors)
+// CHECK-NEXT: zic64b 1.0 'Zic64b' (Cache Block Size Is 64 Bytes)
+// CHECK-NEXT: zicbom 1.0 'Zicbom' (Cache-Block Management Instructions)
+// CHECK-NEXT: zicbop 1.0 'Zicbop' (Cache-Block Prefetch Instructions)
+// CHECK-NEXT: zicboz 1.0 'Zicboz' (Cache-Block Zero Instructions)
+// CHECK-NEXT: ziccamoa 1.0 'Ziccamoa' (Main Memory Supports All Atomics in A)
+// CHECK-NEXT: ziccif 1.0 'Ziccif' (Main Memory Supports Instruction Fetch with Atomicity Requirement)
+// CHECK-NEXT: zicclsm 1.0 'Zicclsm' (Main Memory Supports Misaligned Loads/Stores)
+// CHECK-NEXT: ziccrse 1.0 'Ziccrse' (Main Memory Supports Forward Progress on LR/SC Sequences)
+// CHECK-NEXT: zicntr 2.0 'Zicntr' (Base Counters and Timers)
+// CHECK-NEXT: zicond 1.0 'Zicond' (Integer Conditional Operations)
+// CHECK-NEXT: zicsr 2.0 'Zicsr' (CSRs)
+// CHECK-NEXT: zifencei 2.0 'Zifencei' (fence.i)
+// CHECK-NEXT: zihintpause 2.0 'Zihintpause' (Pause Hint)
+// CHECK-NEXT: zihpm 2.0 'Zihpm' (Hardware Performance Counters)
+// CHECK-NEXT: zmmul 1.0 'Zmmul' (Integer Multiplication)
+// CHECK-NEXT: za64rs 1.0 'Za64rs' (Reservation Set Size of at Most 64 Bytes)
+// CHECK-NEXT: zaamo 1.0 'Zaamo' (Atomic Memory Operations)
+// CHECK-NEXT: zalrsc 1.0 'Zalrsc' (Load-Reserved/Store-Conditional)
+// CHECK-NEXT: zfh 1.0 'Zfh' (Half-Precision Floating-Point)
+// CHECK-NEXT: zfhmin 1.0 'Zfhmin' (Half-Precision Floating-Point Minimal)
+// CHECK-NEXT: zca 1.0 'Zca' (part of the C extension, excluding compressed floating point loads/stores)
+// CHECK-NEXT: zcd 1.0 'Zcd' (Compressed Double-Precision Floating-Point Instructions)
+// CHECK-NEXT: zba 1.0 'Zba' (Address Generation Instructions)
+// CHECK-NEXT: zbb 1.0 'Zbb' (Basic Bit-Manipulation)
+// CHECK-NEXT: zbc 1.0 'Zbc' (Carry-Less Multiplication)
+// CHECK-NEXT: zbkc 1.0 'Zbkc' (Carry-less multiply instructions for Cryptography)
+// CHECK-NEXT: zbs 1.0 'Zbs' (Single-Bit Instructions)
+// CHECK-NEXT: zkt 1.0 'Zkt' (Data Independent Execution Latency)
+// CHECK-NEXT: zve32f 1.0 'Zve32f' (Vector Extensions for Embedded Processors with maximal 32 EEW and F extension)
+// CHECK-NEXT: zve32x 1.0 'Zve32x' (Vector Extensions for Embedded Processors with maximal 32 EEW)
+// CHECK-NEXT: zve64d 1.0 'Zve64d' (Vector Extensions for Embedded Processors with maximal 64 EEW, F and D extension)
+// CHECK-NEXT: zve64f 1.0 'Zve64f' (Vector Extensions for Embedded Processors with maximal 64 EEW and F extension)
+// CHECK-NEXT: zve64x 1.0 'Zve64x' (Vector Extensions for Embedded Processors with maximal 64 EEW)
+// CHECK-NEXT: zvfh 1.0 'Zvfh' (Vector Half-Precision Floating-Point)
+// CHECK-NEXT: zvfhmin 1.0 'Zvfhmin' (Vector Half-Precision Floating-Point Minimal)
+// CHECK-NEXT: zvkt 1.0 'Zvkt' (Vector Data-Independent Execution Latency)
+// CHECK-NEXT: zvl128b 1.0 'Zvl128b' (Minimum Vector Length 128)
+// CHECK-NEXT: zvl256b 1.0 'Zvl256b' (Minimum Vector Length 256)
+// CHECK-NEXT: zvl32b 1.0 'Zvl32b' (Minimum Vector Length 32)
+// CHECK-NEXT: zvl64b 1.0 'Zvl64b' (Minimum Vector Length 64)
+// CHECK-NEXT: ssccptr 1.0 'Ssccptr' (Main memory supports page table reads)
+// CHECK-NEXT: sscofpmf 1.0 'Sscofpmf' (Count Overflow and Mode-Based Filtering)
+// CHECK-NEXT: sscounterenw 1.0 'Sscounterenw' (Support writeable scounteren enable bit for any hpmcounter that is not read-only zero)
+// CHECK-NEXT: sstc 1.0 'Sstc' (Supervisor-mode timer interrupts)
+// CHECK-NEXT: sstvala 1.0 'Sstvala' (stval provides all needed values)
+// CHECK-NEXT: sstvecd 1.0 'Sstvecd' (stvec supports Direct mode)
+// CHECK-NEXT: svade 1.0 'Svade' (Raise exceptions on improper A/D bits)
+// CHECK-NEXT: svbare 1.0 'Svbare' (satp mode Bare supported)
+// CHECK-NEXT: svinval 1.0 'Svinval' (Fine-Grained Address-Translation Cache Invalidation)
+// CHECK-NEXT: svnapot 1.0 'Svnapot' (NAPOT Translation Contiguity)
+// CHECK-NEXT: svpbmt 1.0 'Svpbmt' (Page-Based Memory Types)
+// CHECK-NEXT: xsmtvdot 1.0 'XSMTVDot' (SpacemiT Vector Dot Product Extension)
+// CHECK-EMPTY:
+// CHECK-NEXT: Experimental extensions
+// CHECK-EMPTY:
+// CHECK-NEXT: ISA String: rv64i2p1_m2p0_a2p1_f2p2_d2p2_c2p0_b1p0_v1p0_zic64b1p0_zicbom1p0_zicbop1p0_zicboz1p0_ziccamoa1p0_ziccif1p0_zicclsm1p0_ziccrse1p0_zicntr2p0_zicond1p0_zicsr2p0_zifencei2p0_zihintpause2p0_zihpm2p0_zmmul1p0_za64rs1p0_zaamo1p0_zalrsc1p0_zfh1p0_zfhmin1p0_zca1p0_zcd1p0_zba1p0_zbb1p0_zbc1p0_zbkc1p0_zbs1p0_zkt1p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvfh1p0_zvfhmin1p0_zvkt1p0_zvl128b1p0_zvl256b1p0_zvl32b1p0_zvl64b1p0_ssccptr1p0_sscofpmf1p0_sscounterenw1p0_sstc1p0_sstvala1p0_sstvecd1p0_svade1p0_svbare1p0_svinval1p0_svnapot1p0_svpbmt1p0_xsmtvdot1p0
diff --git a/clang/test/Driver/riscv-cpus.c b/clang/test/Driver/riscv-cpus.c
index 64b8cc27974fd..5ca33d91452e0 100644
--- a/clang/test/Driver/riscv-cpus.c
+++ b/clang/test/Driver/riscv-cpus.c
@@ -112,62 +112,13 @@
// MTUNE-SPACEMIT-A100: "-tune-cpu" "spacemit-a100"
// RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=spacemit-x60 | FileCheck -check-prefix=MCPU-SPACEMIT-X60 %s
-// MCPU-SPACEMIT-X60: "-nostdsysteminc" "-target-cpu" "spacemit-x60"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+m"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+a"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+f"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+d"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+c"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+v"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zic64b"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zicbom"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zicbop"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zicboz"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+ziccamoa"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+ziccif"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zicclsm"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+ziccrse"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zicntr"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zicond"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zicsr"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zifencei"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zihintpause"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zihpm"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+za64rs"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zfh"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zfhmin"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zba"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zbb"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zbc"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zbkc"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zbs"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zkt"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zve32f"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zve32x"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zve64d"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zve64f"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zve64x"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zvfh"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zvfhmin"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zvkt"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zvl128b"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zvl256b"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zvl32b"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+zvl64b"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+ssccptr"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+sscofpmf"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+sscounterenw"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+sstc"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+sstvala"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+sstvecd"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+svade"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+svbare"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+svinval"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+svnapot"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+svpbmt"
-// MCPU-SPACEMIT-X60-SAME: "-target-feature" "+xsmtvdot"
+// MCPU-SPACEMIT-X60: "-target-cpu" "spacemit-x60"
+// COM: The list of extensions are tested in `test/Driver/print-enabled-extensions/riscv-spacemit-x60.c`
// MCPU-SPACEMIT-X60-SAME: "-target-abi" "lp64d"
+// RUN: %clang --target=riscv64 -### -c %s 2>&1 -mtune=spacemit-x60 | FileCheck -check-prefix=MTUNE-SPACEMIT-X60 %s
+// MTUNE-SPACEMIT-X60: "-tune-cpu" "spacemit-x60"
+
// RUN: %clang --target=riscv64 -### -c %s 2>&1 -mcpu=spacemit-x100 | FileCheck -check-prefix=MCPU-SPACEMIT-X100 %s
// MCPU-SPACEMIT-X100: "-target-cpu" "spacemit-x100"
// COM: The list of extensions are tested in `test/Driver/print-enabled-extensions/riscv-spacemit-x100.c`
@@ -471,6 +422,16 @@
// COM: The list of extensions are tested in `test/Driver/print-enabled-extensions/riscv-sifive-x390.c`
// MCPU-SIFIVE-X390-SAME: "-target-abi" "lp64d"
+// RUN: %clang -target riscv32 -### -c %s 2>&1 -menable-experimental-extensions -mcpu=sifive-x160 | FileCheck -check-prefix=MCPU-SIFIVE-X160 %s
+// MCPU-SIFIVE-X160: "-target-cpu" "sifive-x160"
+// COM: The list of extensions are tested in `test/Driver/print-enabled-extensions/riscv-sifive-x160.c`
+// MCPU-SIFIVE-X160-SAME: "-target-abi" "ilp32f"
+
+// RUN: %clang -target riscv64 -### -c %s 2>&1 -menable-experimental-extensions -mcpu=sifive-x180 | FileCheck -check-prefix=MCPU-SIFIVE-X180 %s
+// MCPU-SIFIVE-X180: "-target-cpu" "sifive-x180"
+// COM: The list of extensions are tested in `test/Driver/print-enabled-extensions/riscv-sifive-x180.c`
+// MCPU-SIFIVE-X180-SAME: "-target-abi" "lp64d"
+
// RUN: %clang -target riscv64 -### -c %s 2>&1 -mcpu=sifive-p450 | FileCheck -check-prefix=MCPU-SIFIVE-P450 %s
// MCPU-SIFIVE-P450: "-nostdsysteminc" "-target-cpu" "sifive-p450"
// MCPU-SIFIVE-P450-SAME: "-target-feature" "+m"
diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c
index 99eef8e4da172..b6ca38c92003f 100644
--- a/clang/test/Driver/x86-target-features.c
+++ b/clang/test/Driver/x86-target-features.c
@@ -444,8 +444,8 @@
// RUN: %clang --target=x86_64-unknown-linux-gnu -mno-apxf -mapxf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=APXF %s
// RUN: %clang --target=x86_64-unknown-linux-gnu -mapxf -mno-apxf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-APXF %s
//
-// APXF: "-target-feature" "+egpr" "-target-feature" "+ndd" "-target-feature" "+ccmp" "-target-feature" "+nf" "-target-feature" "+zu" "-target-feature" "+push2pop2" "-target-feature" "+ppx"
-// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-ndd" "-target-feature" "-ccmp" "-target-feature" "-nf" "-target-feature" "-zu" "-target-feature" "-push2pop2" "-target-feature" "-ppx"
+// APXF: "-target-feature" "+egpr" "-target-feature" "+ndd" "-target-feature" "+ccmp" "-target-feature" "+nf" "-target-feature" "+zu" "-target-feature" "+jmpabs" "-target-feature" "+push2pop2" "-target-feature" "+ppx"
+// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-ndd" "-target-feature" "-ccmp" "-target-feature" "-nf" "-target-feature" "-zu" "-target-feature" "-jmpabs" "-target-feature" "-push2pop2" "-target-feature" "-ppx"
// RUN: %clang --target=x86_64-unknown-linux-gnu -mapx-features=egpr %s -### -o %t.o 2>&1 | FileCheck -check-prefix=EGPR %s
// RUN: %clang --target=x86_64-unknown-linux-gnu -mapx-features=push2pop2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=PUSH2POP2 %s
@@ -455,6 +455,7 @@
// RUN: %clang --target=x86_64-unknown-linux-gnu -mapx-features=nf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NF %s
// RUN: %clang --target=x86_64-unknown-linux-gnu -mapx-features=cf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=CF %s
// RUN: %clang --target=x86_64-unknown-linux-gnu -mapx-features=zu %s -### -o %t.o 2>&1 | FileCheck -check-prefix=ZU %s
+// RUN: %clang --target=x86_64-unknown-linux-gnu -mapx-features=jmpabs %s -### -o %t.o 2>&1 | FileCheck -check-prefix=JMPABS %s
// EGPR: "-target-feature" "+egpr"
// PUSH2POP2: "-target-feature" "+push2pop2"
// PPX: "-target-feature" "+ppx"
@@ -463,6 +464,7 @@
// NF: "-target-feature" "+nf"
// CF: "-target-feature" "+cf"
// ZU: "-target-feature" "+zu"
+// JMPABS: "-target-feature" "+jmpabs"
// RUN: %clang --target=x86_64-unknown-linux-gnu -mapx-features=egpr,ndd %s -### -o %t.o 2>&1 | FileCheck -check-prefix=EGPR-NDD %s
// RUN: %clang --target=x86_64-unknown-linux-gnu -mapx-features=egpr -mapx-features=ndd %s -### -o %t.o 2>&1 | FileCheck -check-prefix=EGPR-NDD %s
diff --git a/clang/test/Driver/xcselect.c b/clang/test/Driver/xcselect.c
new file mode 100644
index 0000000000000..01cd4aca5ec23
--- /dev/null
+++ b/clang/test/Driver/xcselect.c
@@ -0,0 +1,5 @@
+// REQUIRES: xcselect
+// RUN: %clang -target arm64-apple-macosx -c -### %s 2> %t.log
+// RUN: FileCheck %s <%t.log
+
+// CHECK: "-isysroot" "{{.*}}/SDKs/MacOSX{{([0-9]+(\.[0-9]+)?)?}}.sdk"
diff --git a/clang/test/Misc/target-invalid-cpu-note/riscv.c b/clang/test/Misc/target-invalid-cpu-note/riscv.c
index 07513c0a5bc6b..5223d1f968b8f 100644
--- a/clang/test/Misc/target-invalid-cpu-note/riscv.c
+++ b/clang/test/Misc/target-invalid-cpu-note/riscv.c
@@ -17,6 +17,7 @@
// RISCV32-SAME: {{^}}, sifive-e31
// RISCV32-SAME: {{^}}, sifive-e34
// RISCV32-SAME: {{^}}, sifive-e76
+// RISCV32-SAME: {{^}}, sifive-x160
// RISCV32-SAME: {{^}}, syntacore-scr1-base
// RISCV32-SAME: {{^}}, syntacore-scr1-max
// RISCV32-SAME: {{^}}, syntacore-scr3-rv32
@@ -47,6 +48,7 @@
// RISCV64-SAME: {{^}}, sifive-s76
// RISCV64-SAME: {{^}}, sifive-u54
// RISCV64-SAME: {{^}}, sifive-u74
+// RISCV64-SAME: {{^}}, sifive-x180
// RISCV64-SAME: {{^}}, sifive-x280
// RISCV64-SAME: {{^}}, sifive-x390
// RISCV64-SAME: {{^}}, spacemit-a100
@@ -79,6 +81,7 @@
// TUNE-RISCV32-SAME: {{^}}, sifive-e31
// TUNE-RISCV32-SAME: {{^}}, sifive-e34
// TUNE-RISCV32-SAME: {{^}}, sifive-e76
+// TUNE-RISCV32-SAME: {{^}}, sifive-x160
// TUNE-RISCV32-SAME: {{^}}, syntacore-scr1-base
// TUNE-RISCV32-SAME: {{^}}, syntacore-scr1-max
// TUNE-RISCV32-SAME: {{^}}, syntacore-scr3-rv32
@@ -114,6 +117,7 @@
// TUNE-RISCV64-SAME: {{^}}, sifive-s76
// TUNE-RISCV64-SAME: {{^}}, sifive-u54
// TUNE-RISCV64-SAME: {{^}}, sifive-u74
+// TUNE-RISCV64-SAME: {{^}}, sifive-x180
// TUNE-RISCV64-SAME: {{^}}, sifive-x280
// TUNE-RISCV64-SAME: {{^}}, sifive-x390
// TUNE-RISCV64-SAME: {{^}}, spacemit-a100
diff --git a/clang/test/Modules/pr170099.cppm b/clang/test/Modules/pr170099.cppm
new file mode 100644
index 0000000000000..ca3b4c10fe729
--- /dev/null
+++ b/clang/test/Modules/pr170099.cppm
@@ -0,0 +1,20 @@
+// RUN: %clang_cc1 -triple %itanium_abi_triple -std=c++26 -O3 -emit-llvm %s -o - | FileCheck %s
+module;
+
+struct A {};
+
+struct B {
+ int x;
+ A a;
+ constexpr B(char *) { x = int(); }
+ ~B();
+};
+
+struct C {
+ B b = "";
+} inline c{};
+
+export module foo;
+
+// Just to make sure it won't crash
+// CHECK: @_ZGIW3foo
diff --git a/clang/test/Modules/pr186603.cppm b/clang/test/Modules/pr186603.cppm
new file mode 100644
index 0000000000000..199efde757ffc
--- /dev/null
+++ b/clang/test/Modules/pr186603.cppm
@@ -0,0 +1,22 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// Test that duplicate inline function definitions in different partitions
+// of the same named module are diagnosed as ODR violations.
+// See https://github.com/llvm/llvm-project/issues/186603
+//
+// Case 1: Module interface imports partition and redefines inline function
+// RUN: %clang_cc1 -std=c++20 -emit-reduced-module-interface %t/partition1.cppm -o %t/A-P1.pcm
+// RUN: %clang_cc1 -std=c++20 -emit-reduced-module-interface %t/module1.cppm \
+// RUN: -fmodule-file=A:P1=%t/A-P1.pcm -o %t/A.pcm -verify
+
+//--- partition1.cppm
+export module A:P1;
+export inline void x() {}
+
+//--- module1.cppm
+export module A;
+import :P1;
+export inline void x() {} // expected-error {{redefinition of 'x'}}
+// expected-note@partition1.cppm:* {{previous definition is here}}
\ No newline at end of file
diff --git a/clang/test/OpenMP/declare_target_ast_print.cpp b/clang/test/OpenMP/declare_target_ast_print.cpp
index 68f73d5433595..3ebe261cf79f0 100644
--- a/clang/test/OpenMP/declare_target_ast_print.cpp
+++ b/clang/test/OpenMP/declare_target_ast_print.cpp
@@ -4,7 +4,7 @@
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -I %S/Inputs -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
// RUN: %clang_cc1 -verify -fopenmp -I %S/Inputs -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=52 -I %S/Inputs -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=60 -I %S/Inputs -ast-print %s | FileCheck %s --check-prefix=CHECK
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=60 -I %S/Inputs -ast-print %s | FileCheck %s --check-prefix=CHECK --check-prefix=OMP60
// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -I %S/Inputs -emit-pch -o %t %s
// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -I %S/Inputs -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP50
@@ -12,6 +12,8 @@
// RUN: %clang_cc1 -fopenmp -std=c++11 -include-pch %t -I %S/Inputs -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP51
// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -x c++ -std=c++11 -I %S/Inputs -emit-pch -o %t %s
// RUN: %clang_cc1 -fopenmp -fopenmp-version=52 -std=c++11 -include-pch %t -I %S/Inputs -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP52
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=60 -x c++ -std=c++11 -I %S/Inputs -emit-pch -o %t %s
+// RUN: %clang_cc1 -fopenmp -fopenmp-version=60 -std=c++11 -include-pch %t -I %S/Inputs -verify %s -ast-print | FileCheck %s --check-prefix=CHECK --check-prefix=OMP60
// RUN: %clang_cc1 -verify -fopenmp-simd -I %S/Inputs -ast-print %s | FileCheck %s
// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -I %S/Inputs -emit-pch -o %t %s
@@ -123,6 +125,41 @@ void xoo();
}
#endif // _OPENMP
+#if _OPENMP == 202411
+int l1;
+#pragma omp declare target local(l1) device_type(any)
+// OMP60: #pragma omp declare target local
+// OMP60: int l1;
+// OMP60: #pragma omp end declare target
+
+int l2;
+#pragma omp declare target device_type(nohost) local(l2)
+// OMP60: #pragma omp declare target device_type(nohost) local
+// OMP60: int l2;
+// OMP60: #pragma omp end declare target
+
+int l3;
+int a = 0;
+#pragma omp declare target local(l3) device_type(nohost) local(a)
+// OMP60: #pragma omp declare target device_type(nohost) local
+// OMP60: int l3;
+// OMP60: #pragma omp end declare target
+// OMP60: #pragma omp declare target device_type(nohost) local
+// OMP60: int a = 0;
+// OMP60: #pragma omp end declare target
+
+int b = 0, c = 0;
+#pragma omp declare target local(b,c)
+void zoo();
+// OMP60: #pragma omp declare target local
+// OMP60: int b = 0;
+// OMP60: #pragma omp end declare target
+// OMP60: #pragma omp declare target local
+// OMP60: int c = 0;
+// OMP60: #pragma omp end declare target
+// OMP60: void zoo();
+#endif // _OPENMP
+
int out_decl_target = 0;
#pragma omp declare target (out_decl_target)
@@ -373,6 +410,13 @@ int x;
// CHECK-NEXT: int x;
// CHECK-NEXT: #pragma omp end declare target
+#if _OPENMP >= 202411
+int x_local;
+// OMP60: #pragma omp declare target local
+// OMP60-NEXT: int x_local;
+// OMP60-NEXT: #pragma omp end declare target
+#endif // _OPENMP
+
int main (int argc, char **argv) {
foo();
foo_c();
@@ -388,6 +432,10 @@ int main (int argc, char **argv) {
#endif
#pragma omp declare target link(x)
+
+#if _OPENMP >= 202411
+ #pragma omp declare target local(x_local)
+#endif // _OPENMP
return (0);
}
diff --git a/clang/test/OpenMP/declare_target_messages.cpp b/clang/test/OpenMP/declare_target_messages.cpp
index 26c47ecfb36b0..9875bd95141fd 100644
--- a/clang/test/OpenMP/declare_target_messages.cpp
+++ b/clang/test/OpenMP/declare_target_messages.cpp
@@ -13,7 +13,7 @@
// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp5,ompvar,omp45-to-51,omp5-and-51,omp5-or-later,omp5-or-later-var,omp45-to-51-var,omp45-to-51-clause,host5,host-5-and-51,no-host5-and-51 %{openmp50} %{target_mac} %{limit} -o - %s
// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp60,omp52-or-later,ompvar,omp5-or-later,omp5-or-later-var %{openmp60} %{target_mac} %{limit} -o - %s
// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp5,ompvar,omp45-to-51,omp5-and-51,omp5-or-later,omp5-or-later-var,omp45-to-51-var,omp45-to-51-clause,host-5-and-51,no-host5-and-51,dev5 %{openmp50} -fopenmp-is-target-device %{target_mac} %{aux_triple} %{limit} -o - %s
-// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp60,omp52-or-later,ompvar,omp5-or-later,omp5-or-later-var %{openmp60} -fopenmp-is-target-device %{target_mac} %{aux_triple} %{limit} -o - %s
+// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp60,dev60,omp52-or-later,ompvar,omp5-or-later,omp5-or-later-var %{openmp60} -fopenmp-is-target-device %{target_mac} %{aux_triple} %{limit} -o - %s
// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp5,ompvar,omp45-to-51,omp5-and-51,omp5-or-later,omp5-or-later-var,omp45-to-51-var,omp45-to-51-clause,host5,host-5-and-51,no-host5-and-51 %{openmp50_simd} %{target_mac} %{limit} -o - %s
// RUN: %clang_cc1 %{common_opts_mac} -verify=expected,omp60,omp52-or-later,ompvar,omp5-or-later,omp5-or-later-var %{openmp60_simd} %{target_mac} %{limit} -o - %s
@@ -61,8 +61,8 @@ void f();
// omp45-to-51-warning at +1 {{extra tokens at the end of '#pragma omp end declare target' are ignored}}
#pragma omp end declare target shared(a)
-// omp60-error@+10 {{unexpected 'map' clause, only 'enter', 'link', 'device_type' or 'indirect' clauses expected}}
-// omp60-error@+9 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+// omp60-error@+10 {{unexpected 'map' clause, only 'enter', 'link', 'device_type', 'indirect' or 'local' clauses expected}}
+// omp60-error@+9 {{expected at least one 'enter', 'link', 'indirect' or 'local' clause}}
// omp52-error@+8 {{unexpected 'map' clause, only 'enter', 'link', 'device_type' or 'indirect' clauses expected}}
// omp52-error@+7 {{expected at least one 'enter', 'link' or 'indirect' clause}}
// omp51-error@+6 {{unexpected 'map' clause, only 'to', 'link', 'device_type' or 'indirect' clauses expected}}
@@ -74,7 +74,7 @@ void f();
#pragma omp declare target map(a)
// omp60-error@+5 {{unexpected 'to' clause, use 'enter' instead}}
-// omp60-error@+4 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+// omp60-error@+4 {{expected at least one 'enter', 'link', 'indirect' or 'local' clause}}
// omp52-error@+3 {{unexpected 'to' clause, use 'enter' instead}}
// omp52-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}}
// omp45-to-51-error@+1 {{use of undeclared identifier 'foo1'}}
@@ -83,8 +83,13 @@ void f();
// expected-error@+1 {{use of undeclared identifier 'foo2'}}
#pragma omp declare target link(foo2)
+#if _OPENMP == 202411
+// omp60-error@+1 {{use of undeclared identifier 'foo3'}}
+#pragma omp declare target local(foo3)
+#endif // _OPENMP
+
// omp60-error@+6 {{unexpected 'to' clause, use 'enter' instead}}
-// omp60-error@+5 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+// omp60-error@+5 {{expected at least one 'enter', 'link', 'indirect' or 'local' clause}}
// omp52-error@+4 {{unexpected 'to' clause, use 'enter' instead}}
// omp52-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}}
// dev5-note@+2 {{marked as 'device_type(host)' here}}
@@ -92,8 +97,10 @@ void f();
#pragma omp declare target to(f) device_type(host)
void q();
-// omp52-or-later-error@+4 {{unexpected 'to' clause, use 'enter' instead}}
-// omp52-or-later-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+// omp60-error@+6 {{unexpected 'to' clause, use 'enter' instead}}
+// omp60-error@+5 {{expected at least one 'enter', 'link', 'indirect' or 'local' clause}}
+// omp52-error@+4 {{unexpected 'to' clause, use 'enter' instead}}
+// omp52-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}}
// omp5-and-51-warning@+2 {{more than one 'device_type' clause is specified}}
// omp45-error@+1 {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}}
#pragma omp declare target to(q) device_type(any) device_type(any) device_type(host)
@@ -133,13 +140,24 @@ void c();
// expected-note@+1 {{'func' defined here}}
void func() {}
-// omp52-or-later-error@+5 {{unexpected 'allocate' clause, only 'enter', 'link', 'device_type' or 'indirect' clauses expected}}
+// omp60-error@+6 {{unexpected 'allocate' clause, only 'enter', 'link', 'device_type', 'indirect' or 'local' clauses expected}}
+// omp52-error@+5 {{unexpected 'allocate' clause, only 'enter', 'link', 'device_type' or 'indirect' clauses expected}}
// omp51-error@+4 {{unexpected 'allocate' clause, only 'to', 'link', 'device_type' or 'indirect' clauses expected}}
// omp5-error@+3 {{unexpected 'allocate' clause, only 'to', 'link' or 'device_type' clauses expected}}
// expected-error@+2 {{function name is not allowed in 'link' clause}}
// omp45-error@+1 {{unexpected 'allocate' clause, only 'to' or 'link' clauses expected}}
#pragma omp declare target link(func) allocate(a)
+#if _OPENMP == 202411
+// expected-note@+1 {{'func_local' defined here}}
+void func_local() {}
+
+// dev60-warning@+3 {{'local' clause on 'declare_target' directive is not yet fully implemented; variable will be treated as 'enter'}}
+// omp60-error@+2 {{unexpected 'allocate' clause, only 'enter', 'link', 'device_type', 'indirect' or 'local' clauses expected}}
+// expected-error@+1 {{function name is not allowed in 'local' clause}}
+#pragma omp declare target local(func_local) allocate(a)
+#endif // _OPENMP
+
void bar();
void baz() {bar();}
// omp5-or-later-warning@+1 {{declaration marked as declare target after first use, it may lead to incorrect results}}
@@ -282,11 +300,13 @@ int main (int argc, char **argv) {
#pragma omp end declare target
foo(v);
- // omp52-or-later-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+ // omp60-error@+3 {{expected at least one 'enter', 'link', 'indirect' or 'local' clause}}
+ // omp52-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}}
// omp52-or-later-error@+1 {{unexpected 'to' clause, use 'enter' instead}}
#pragma omp declare target to(foo3) link(w)
+ // omp60-error@+4 {{expected at least one 'enter', 'link', 'indirect' or 'local' clause}}
// omp52-or-later-error@+3 {{unexpected 'to' clause, use 'enter' instead}}
- // omp52-or-later-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+ // omp52-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}}
// omp45-to-51-var-error@+1 {{local variable 'a' should not be used in 'declare target' directive}}
#pragma omp declare target to(a)
return (0);
@@ -301,50 +321,114 @@ namespace {
// expected-error@+1 {{'S' used in declare target directive is not a variable or a function name}}
#pragma omp declare target link(S)
+#if _OPENMP == 202411
+// omp60-error@+1 {{'S' used in declare target directive is not a variable or a function name}}
+#pragma omp declare target local(S)
+
+int x_local;
+// expected-error@+1 {{'x_local' appears multiple times in clauses on the same declare target directive}}
+#pragma omp declare target enter(x_local) local(x_local)
+
+int y_enter_local;
+#pragma omp declare target enter(y_enter_local)
+// expected-error@+1 {{'y_enter_local' must not appear in both clauses 'enter' and 'local'}}
+#pragma omp declare target local(y_enter_local)
+
+int y_local_enter;
+// dev60-warning@+1 {{'local' clause on 'declare_target' directive is not yet fully implemented; variable will be treated as 'enter'}}
+#pragma omp declare target local(y_local_enter)
+// expected-error@+1 {{'y_local_enter' must not appear in both clauses 'local' and 'enter'}}
+#pragma omp declare target enter(y_local_enter)
+
+int y_link_local;
+#pragma omp declare target link(y_link_local)
+// expected-error@+1 {{'y_link_local' must not appear in both clauses 'link' and 'local'}}
+#pragma omp declare target local(y_link_local)
+
+int y_local_link;
+// dev60-warning@+1 {{'local' clause on 'declare_target' directive is not yet fully implemented; variable will be treated as 'enter'}}
+#pragma omp declare target local(y_local_link)
+// expected-error@+1 {{'y_local_link' must not appear in both clauses 'local' and 'link'}}
+#pragma omp declare target link(y_local_link)
+
+int y_local_dev;
+// omp60-error@+1 {{'local' clause is incompatible with 'device_type(host)'; local variables exist only on the device}}
+#pragma omp declare_target local(y_local_dev) device_type(host)
+#endif // _OPENMP
+
// expected-error@+1 {{'x' appears multiple times in clauses on the same declare target directive}}
#pragma omp declare target (x, x)
+
+// omp52-or-later-error@+2 {{unexpected clause after an implicit 'enter' clause}}
+// omp45-to-51-error@+1 {{unexpected clause after an implicit 'to' clause}}
+#pragma omp declare target (x) local(x)
+
+// omp60-error@+4 {{expected at least one 'enter', 'link', 'indirect' or 'local' clause}}
// omp52-or-later-error@+3 {{unexpected 'to' clause, use 'enter' instead}}
-// omp52-or-later-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+// omp52-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}}
// omp45-to-51-clause-error@+1 {{'x' appears multiple times in clauses on the same declare target directive}}
#pragma omp declare target to(x) to(x)
// expected-error@+1 {{'x' must not appear in both clauses 'to' and 'link'}}
#pragma omp declare target link(x)
+#if defined(_OPENMP) && _OPENMP < 202111
+int y_link_to;
+#pragma omp declare target link(y_link_to)
+// omp45-to-51-error@+1 {{'y_link_to' must not appear in both clauses 'link' and 'to'}}
+#pragma omp declare target to(y_link_to)
+#elif defined(_OPENMP) && _OPENMP >= 202111
+int y_link_enter;
+#pragma omp declare target link(y_link_enter)
+// omp52-or-later-error@+1 {{'y_link_enter' must not appear in both clauses 'link' and 'enter'}}
+#pragma omp declare target enter(y_link_enter)
+
+int y_enter_link;
+#pragma omp declare target enter(y_enter_link)
+// omp52-or-later-error@+1 {{'y_enter_link' must not appear in both clauses 'enter' and 'link'}}
+#pragma omp declare target link(y_enter_link)
+#endif
+
void bazz() {}
+// omp60-error@+5 {{expected at least one 'enter', 'link', 'indirect' or 'local' clause}}
// omp52-or-later-error@+4 {{unexpected 'to' clause, use 'enter' instead}}
-// omp52-or-later-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+// omp52-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}}
// host5-note@+2 3 {{marked as 'device_type(nohost)' here}}
// omp45-error@+1 {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}}
#pragma omp declare target to(bazz) device_type(nohost)
void bazzz() {bazz();}
+// omp60-error@+4 {{expected at least one 'enter', 'link', 'indirect' or 'local' clause}}
// omp52-or-later-error@+3 {{unexpected 'to' clause, use 'enter' instead}}
-// omp52-or-later-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+// omp52-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}}
// omp45-error@+1 {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}}
#pragma omp declare target to(bazzz) device_type(nohost)
// host5-error@+1 {{function with 'device_type(nohost)' is not available on host}}
void any() {bazz();}
// host5-error@+1 {{function with 'device_type(nohost)' is not available on host}}
void host1() {bazz();}
+// omp60-error@+5 {{expected at least one 'enter', 'link', 'indirect' or 'local' clause}}
// omp52-or-later-error@+4 {{unexpected 'to' clause, use 'enter' instead}}
-// omp52-or-later-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+// omp52-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}}
// dev5-note@+2 3 {{marked as 'device_type(host)' here}}
// omp45-error@+1 {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}}
#pragma omp declare target to(host1) device_type(host)
//host5-error@+1 {{function with 'device_type(nohost)' is not available on host}}
void host2() {bazz();}
+// omp60-error@+3 {{expected at least one 'enter', 'link', 'indirect' or 'local' clause}}
// omp52-or-later-error@+2 {{unexpected 'to' clause, use 'enter' instead}}
-// omp52-or-later-error@+1 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+// omp52-error@+1 {{expected at least one 'enter', 'link' or 'indirect' clause}}
#pragma omp declare target to(host2)
// dev5-error@+1 {{function with 'device_type(host)' is not available on device}}
void device() {host1();}
+// omp60-error@+5 {{expected at least one 'enter', 'link', 'indirect' or 'local' clause}}
// omp52-or-later-error@+4 {{unexpected 'to' clause, use 'enter' instead}}
-// omp52-or-later-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+// omp52-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}}
// host5-note@+2 2 {{marked as 'device_type(nohost)' here}}
// omp45-error@+1 {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}}
#pragma omp declare target to(device) device_type(nohost)
void host3() {host1();} // dev5-error {{function with 'device_type(host)' is not available on device}}
+// omp60-error@+3 {{expected at least one 'enter', 'link', 'indirect' or 'local' clause}}
// omp52-or-later-error@+2 {{unexpected 'to' clause, use 'enter' instead}}
-// omp52-or-later-error@+1 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+// omp52-error@+1 {{expected at least one 'enter', 'link' or 'indirect' clause}}
#pragma omp declare target to(host3)
#pragma omp begin declare target
@@ -363,21 +447,36 @@ void any7() {device();}
void any8() {any2();}
int MultiDevTy;
+// omp60-error@+4 {{expected at least one 'enter', 'link', 'indirect' or 'local' clause}}
// omp52-or-later-error@+3 {{unexpected 'to' clause, use 'enter' instead}}
-// omp52-or-later-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+// omp52-error@+2 {{expected at least one 'enter', 'link' or 'indirect' clause}}
// omp45-error@+1 {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}}
#pragma omp declare target to(MultiDevTy) device_type(any)
+// omp60-error@+5 {{expected at least one 'enter', 'link', 'indirect' or 'local' clause}}
// omp52-or-later-error@+4 {{unexpected 'to' clause, use 'enter' instead}}
-// omp52-or-later-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+// omp52-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}}
// host-5-and-51-error@+2 {{'device_type(host)' does not match previously specified 'device_type(any)' for the same declaration}}
// omp45-error@+1 {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}}
#pragma omp declare target to(MultiDevTy) device_type(host)
+// omp60-error@+5 {{expected at least one 'enter', 'link', 'indirect' or 'local' clause}}
// omp52-or-later-error@+4 {{unexpected 'to' clause, use 'enter' instead}}
-// omp52-or-later-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+// omp52-error@+3 {{expected at least one 'enter', 'link' or 'indirect' clause}}
// no-host5-and-51-error@+2 {{'device_type(nohost)' does not match previously specified 'device_type(any)' for the same declaration}}
// omp45-error@+1 {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}}
#pragma omp declare target to(MultiDevTy) device_type(nohost)
+int counter = 0;
+// dev60-warning@+9 {{'local' clause on 'declare_target' directive is not yet fully implemented; variable will be treated as 'enter'}}
+// omp52-error@+8 {{unexpected 'local' clause, only 'enter', 'link', 'device_type' or 'indirect' clauses expected}}
+// omp52-error@+7 {{expected at least one 'enter', 'link' or 'indirect' clause}}
+// omp51-error@+6 {{unexpected 'local' clause, only 'to', 'link', 'device_type' or 'indirect' clauses expected}}
+// omp51-error@+5 {{expected at least one 'to', 'link' or 'indirect' clause}}
+// omp5-error@+4 {{expected at least one 'to' or 'link' clause}}
+// omp5-error@+3 {{unexpected 'local' clause, only 'to', 'link' or 'device_type' clauses expected}}
+// omp45-error@+2 {{expected at least one 'to' or 'link' clause}}
+// omp45-error@+1 {{unexpected 'local' clause, only 'to' or 'link' clauses expected}}
+#pragma omp declare target local(counter)
+
// expected-warning@+1 {{declaration is not declared in any declare target region}}
static int variable = 100;
static float variable1 = 200;
diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c
index 8cab87e80624b..cb2d13d59d8bf 100644
--- a/clang/test/Preprocessor/predefined-arch-macros.c
+++ b/clang/test/Preprocessor/predefined-arch-macros.c
@@ -1876,6 +1876,7 @@
// CHECK_GNR_M32: #define __FMA__ 1
// CHECK_GNR_M32: #define __GFNI__ 1
// CHECK_GNR_M32: #define __INVPCID__ 1
+// CHECK_DMR_M32: #define __JMPABS__ 1
// CHECK_GNR_M32: #define __LZCNT__ 1
// CHECK_GNR_M32: #define __MMX__ 1
// CHECK_GNR_M32: #define __MOVBE__ 1
@@ -1981,6 +1982,7 @@
// CHECK_GNR_M64: #define __FMA__ 1
// CHECK_GNR_M64: #define __GFNI__ 1
// CHECK_GNR_M64: #define __INVPCID__ 1
+// CHECK_DMR_M64: #define __JMPABS__ 1
// CHECK_GNR_M64: #define __LZCNT__ 1
// CHECK_GNR_M64: #define __MMX__ 1
// CHECK_GNR_M64: #define __MOVBE__ 1
@@ -2576,6 +2578,7 @@
// CHECK_ARL_M32: #define __GFNI__ 1
// CHECK_ARL_M32: #define __HRESET__ 1
// CHECK_ARL_M32: #define __INVPCID__ 1
+// CHECK_NVL_M32: #define __JMPABS__ 1
// CHECK_KL_M32: #define __KL__ 1
// CHECK_NKL_M32-NOT: __KL__
// CHECK_ARL_M32: #define __LZCNT__ 1
@@ -2709,6 +2712,7 @@
// CHECK_ARL_M64: #define __GFNI__ 1
// CHECK_ARL_M64: #define __HRESET__ 1
// CHECK_ARL_M64: #define __INVPCID__ 1
+// CHECK_NVL_M64: #define __JMPABS__ 1
// CHECK_KL_M64: #define __KL__ 1
// CHECK_NKL_M64-NOT: __KL__
// CHECK_ARL_M64: #define __LZCNT__ 1
diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
index 78f8b19459c2f..daf81d71e41c0 100644
--- a/clang/test/Preprocessor/x86_target_features.c
+++ b/clang/test/Preprocessor/x86_target_features.c
@@ -738,11 +738,13 @@
// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=nf -x c -E -dM -o - %s | FileCheck --check-prefix=NF %s
// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=cf -x c -E -dM -o - %s | FileCheck --check-prefix=CF %s
// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=zu -x c -E -dM -o - %s | FileCheck --check-prefix=ZU %s
-// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapxf -x c -E -dM -o - %s | FileCheck --check-prefixes=EGPR,PUSH2POP2,PPX,NDD,CCMP,NF,ZU,APXF %s
+// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=jmpabs -x c -E -dM -o - %s | FileCheck --check-prefix=JMPABS %s
+// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapxf -x c -E -dM -o - %s | FileCheck --check-prefixes=EGPR,PUSH2POP2,PPX,NDD,CCMP,NF,ZU,JMPABS,APXF %s
// APXF: #define __APX_F__ 1
// CCMP: #define __CCMP__ 1
// CF: #define __CF__ 1
// EGPR: #define __EGPR__ 1
+// JMPABS: #define __JMPABS__ 1
// NDD: #define __NDD__ 1
// NF: #define __NF__ 1
// PPX: #define __PPX__ 1
diff --git a/clang/test/Sema/Inputs/lifetime-analysis.h b/clang/test/Sema/Inputs/lifetime-analysis.h
index 56cacdd964f79..0b6bdaef83f9d 100644
--- a/clang/test/Sema/Inputs/lifetime-analysis.h
+++ b/clang/test/Sema/Inputs/lifetime-analysis.h
@@ -176,6 +176,9 @@ struct basic_string {
basic_string& operator+=(const basic_string&);
basic_string& operator+=(const T*);
void push_back(T);
+
+ template<class StringViewLike> basic_string& insert(size_t index, const StringViewLike&);
+
void clear();
const T *c_str() const;
operator basic_string_view<T> () const;
diff --git a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
index 3305e9e270d86..4996664d26452 100644
--- a/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
+++ b/clang/test/Sema/warn-lifetime-analysis-nocfg.cpp
@@ -1287,3 +1287,8 @@ void test() {
const auto ptrTSC = StringTemplateSpecC<char>().data(); // Both have attribute // expected-warning {{temporary whose address is used}}
}
} // namespace GH175391
+
+void string_insert_GH_186817() {
+ std::string msg;
+ msg.insert(0, std::string_view(std::string("a temporary")));
+}
diff --git a/clang/test/SemaCXX/attr-exclude_from_explicit_instantiation.ignore-dllattr.cpp b/clang/test/SemaCXX/attr-exclude_from_explicit_instantiation.ignore-dllattr.cpp
index 5943f99e14cad..00e1c74c34bc3 100644
--- a/clang/test/SemaCXX/attr-exclude_from_explicit_instantiation.ignore-dllattr.cpp
+++ b/clang/test/SemaCXX/attr-exclude_from_explicit_instantiation.ignore-dllattr.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -triple x86_64-win32 -fms-extensions -fsyntax-only -verify %s
-// RUN: %clang_cc1 -triple x86_64-mingw -fsyntax-only -verify %s
-// RUN: %clang_cc1 -triple x86_64-cygwin -fsyntax-only -verify %s
+// RUN: %clang_cc1 -triple x86_64-win32 -fms-extensions -fsyntax-only -verify=expected,msc %s
+// RUN: %clang_cc1 -triple x86_64-mingw -fsyntax-only -verify=expected,gnu %s
+// RUN: %clang_cc1 -triple x86_64-cygwin -fsyntax-only -verify=expected,gnu %s
// Test that attaching the exclude_from_explicit_instantiation attribute and
// either the dllexport or dllimport attribute together causes a warning.
@@ -92,30 +92,30 @@ template struct __declspec(dllexport) class_tmpl_explicit_inst<int>;
// Test that exclude_from_explicit_instantiation is ignored in a non-template context.
struct class_nontmpl {
- EXCLUDE_ATTR __declspec(dllexport) void fn_excluded_exported(); // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored in a non-template context}}
- EXCLUDE_ATTR __declspec(dllimport) void fn_excluded_imported(); // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored in a non-template context}}
- __declspec(dllexport) EXCLUDE_ATTR void fn_exported_excluded(); // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored in a non-template context}}
- __declspec(dllimport) EXCLUDE_ATTR void fn_imported_excluded(); // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored in a non-template context}}
+ EXCLUDE_ATTR __declspec(dllexport) void fn_excluded_exported(); // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+ EXCLUDE_ATTR __declspec(dllimport) void fn_excluded_imported(); // expected-warning{{'dllimport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+ __declspec(dllexport) EXCLUDE_ATTR void fn_exported_excluded(); // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+ __declspec(dllimport) EXCLUDE_ATTR void fn_imported_excluded(); // expected-warning{{'dllimport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
- EXCLUDE_ATTR __declspec(dllexport) static int var_excluded_exported; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored in a non-template context}}
- EXCLUDE_ATTR __declspec(dllimport) static int var_excluded_imported; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored in a non-template context}}
- __declspec(dllexport) EXCLUDE_ATTR static int var_exported_excluded; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored in a non-template context}}
- __declspec(dllimport) EXCLUDE_ATTR static int var_imported_excluded; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored in a non-template context}}
+ EXCLUDE_ATTR __declspec(dllexport) static int var_excluded_exported; // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+ EXCLUDE_ATTR __declspec(dllimport) static int var_excluded_imported; // expected-warning{{'dllimport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+ __declspec(dllexport) EXCLUDE_ATTR static int var_exported_excluded; // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+ __declspec(dllimport) EXCLUDE_ATTR static int var_imported_excluded; // expected-warning{{'dllimport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
- struct EXCLUDE_ATTR __declspec(dllexport) nested_excluded_exported {}; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored in a non-template context}}
- struct EXCLUDE_ATTR __declspec(dllimport) nested_excluded_imported {}; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored in a non-template context}}
- struct __declspec(dllexport) EXCLUDE_ATTR nested_exported_excluded {}; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored in a non-template context}}
- struct __declspec(dllimport) EXCLUDE_ATTR nested_imported_excluded {}; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored in a non-template context}}
+ struct EXCLUDE_ATTR __declspec(dllexport) nested_excluded_exported {}; // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+ struct EXCLUDE_ATTR __declspec(dllimport) nested_excluded_imported {}; // expected-warning{{'dllimport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+ struct __declspec(dllexport) EXCLUDE_ATTR nested_exported_excluded {}; // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+ struct __declspec(dllimport) EXCLUDE_ATTR nested_imported_excluded {}; // expected-warning{{'dllimport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
template <class T>
- struct EXCLUDE_ATTR __declspec(dllexport) class_template_excluded {}; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored in a non-template context}}
+ struct EXCLUDE_ATTR __declspec(dllexport) class_template_excluded {}; // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
template <class T>
- EXCLUDE_ATTR __declspec(dllexport) static T var_template_excluded; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored in a non-template context}}
+ EXCLUDE_ATTR __declspec(dllexport) static T var_template_excluded; // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
template <class T>
- EXCLUDE_ATTR __declspec(dllexport) void fn_template_excluded(); // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored in a non-template context}}
+ EXCLUDE_ATTR __declspec(dllexport) void fn_template_excluded(); // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
struct nested {
- EXCLUDE_ATTR __declspec(dllexport) void fn_excluded_exported(); // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored in a non-template context}}
+ EXCLUDE_ATTR __declspec(dllexport) void fn_excluded_exported(); // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
};
struct EXCLUDE_ATTR nested_excluded {
@@ -135,27 +135,27 @@ struct class_nontmpl {
};
// Test that exclude_from_explicit_instantiation is ignored on a non-member entity.
-EXCLUDE_ATTR __declspec(dllexport) void fn_excluded_exported(); // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored on a non-member declaration}}
-EXCLUDE_ATTR __declspec(dllimport) void fn_excluded_imported(); // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored on a non-member declaration}}
-__declspec(dllexport) EXCLUDE_ATTR void fn_exported_excluded(); // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored on a non-member declaration}}
-__declspec(dllimport) EXCLUDE_ATTR void fn_imported_excluded(); // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored on a non-member declaration}}
+EXCLUDE_ATTR __declspec(dllexport) void fn_excluded_exported(); // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+EXCLUDE_ATTR __declspec(dllimport) void fn_excluded_imported(); // expected-warning{{'dllimport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+__declspec(dllexport) EXCLUDE_ATTR void fn_exported_excluded(); // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+__declspec(dllimport) EXCLUDE_ATTR void fn_imported_excluded(); // expected-warning{{'dllimport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
-EXCLUDE_ATTR __declspec(dllexport) int var_excluded_exported; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored on a non-member declaration}}
-EXCLUDE_ATTR __declspec(dllimport) int var_excluded_imported; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored on a non-member declaration}}
-__declspec(dllexport) EXCLUDE_ATTR int var_exported_excluded; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored on a non-member declaration}}
-__declspec(dllimport) EXCLUDE_ATTR int var_imported_excluded; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored on a non-member declaration}}
+EXCLUDE_ATTR __declspec(dllexport) int var_excluded_exported; // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+EXCLUDE_ATTR __declspec(dllimport) int var_excluded_imported; // expected-warning{{'dllimport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+__declspec(dllexport) EXCLUDE_ATTR int var_exported_excluded; // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+__declspec(dllimport) EXCLUDE_ATTR int var_imported_excluded; // expected-warning{{'dllimport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
-struct EXCLUDE_ATTR __declspec(dllexport) class_excluded_exported {}; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored on a non-member declaration}}
-struct EXCLUDE_ATTR __declspec(dllimport) class_excluded_imported {}; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored on a non-member declaration}}
-struct __declspec(dllexport) EXCLUDE_ATTR class_exported_excluded {}; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored on a non-member declaration}}
-struct __declspec(dllimport) EXCLUDE_ATTR class_imported_excluded {}; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored on a non-member declaration}}
+struct EXCLUDE_ATTR __declspec(dllexport) class_excluded_exported {}; // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+struct EXCLUDE_ATTR __declspec(dllimport) class_excluded_imported {}; // expected-warning{{'dllimport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+struct __declspec(dllexport) EXCLUDE_ATTR class_exported_excluded {}; // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
+struct __declspec(dllimport) EXCLUDE_ATTR class_imported_excluded {}; // expected-warning{{'dllimport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
template <class T>
-struct EXCLUDE_ATTR __declspec(dllexport) class_tmpl_excluded_exported {}; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored on a non-member declaration}}
+struct EXCLUDE_ATTR __declspec(dllexport) class_tmpl_excluded_exported {}; // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
template <class T>
-EXCLUDE_ATTR __declspec(dllexport) T var_template_excluded; // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored on a non-member declaration}}
+EXCLUDE_ATTR __declspec(dllexport) T var_template_excluded; // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
template <class T>
-EXCLUDE_ATTR __declspec(dllexport) void fn_template_excluded(); // expected-warning{{'exclude_from_explicit_instantiation' attribute ignored on a non-member declaration}}
+EXCLUDE_ATTR __declspec(dllexport) void fn_template_excluded(); // expected-warning{{'dllexport' attribute ignored; 'exclude_from_explicit_instantiation' takes precedence}}
EXCLUDE_ATTR void fn_excluded();
@@ -167,6 +167,24 @@ struct EXCLUDE_ATTR class_excluded {
struct __declspec(dllexport) class_exported {
EXCLUDE_ATTR void fn_excluded();
};
-struct __declspec(dllimport) class_imported {
+
+// Test that an excluded member in an imported class can have its definition without any warning.
+struct __declspec(dllimport) class_imported { // #import
+ void fn(); // expected-note{{previous declaration is here}}
EXCLUDE_ATTR void fn_excluded();
+ static int var;
+ EXCLUDE_ATTR static int var_excluded;
};
+
+void class_imported::fn() {}
+ // msc-warning@-1{{'class_imported::fn' redeclared without 'dllimport' attribute: 'dllexport' attribute added}}
+ // gnu-warning@-2{{'class_imported::fn' redeclared without 'dllimport' attribute: previous 'dllimport' ignored}}
+ // gnu-note@#import {{previous attribute is here}}
+
+void class_imported::fn_excluded() {}
+
+int class_imported::var = 0;
+ // expected-error@-1{{definition of dllimport static field not allowed}}
+ // expected-note@#import{{attribute is here}}
+
+int class_imported::var_excluded = 0;
diff --git a/clang/test/SemaCXX/cxx2b-deducing-this.cpp b/clang/test/SemaCXX/cxx2b-deducing-this.cpp
index 995397f65b20c..efa5e06f8a32c 100644
--- a/clang/test/SemaCXX/cxx2b-deducing-this.cpp
+++ b/clang/test/SemaCXX/cxx2b-deducing-this.cpp
@@ -1257,13 +1257,13 @@ void f() {
(&A::e)(a, a);
// expected-error@-1 {{no matching function for call to 'e'}} \
// expected-note@#tpl-address-e{{candidate template ignored: constraints not satisfied [with T = A, U = A]}} \
- // expected-note@#tpl-address-e{{because '__is_same(A, int)' evaluated to false}}
+ // expected-note@#tpl-address-e{{because '__is_same(tpl_address::A, int)' evaluated to false}}
(&A::e<A>)(a, 0);
(&A::e<A>)(a, a);
// expected-error@-1 {{no matching function for call to 'e'}} \
// expected-note@#tpl-address-e{{candidate template ignored: constraints not satisfied [with T = A, U = A]}} \
- // expected-note@#tpl-address-e{{because '__is_same(A, int)' evaluated to false}}
+ // expected-note@#tpl-address-e{{because '__is_same(tpl_address::A, int)' evaluated to false}}
(&A::e<A, int>)(a, 0);
@@ -1273,12 +1273,12 @@ void f() {
(&A::f<A>)(a);
// expected-error@-1 {{no matching function for call to 'f'}} \
// expected-note@#tpl-address-f{{candidate template ignored: constraints not satisfied [with T = A]}} \
- // expected-note@#tpl-address-f{{because '__is_same(A, int)' evaluated to false}}
+ // expected-note@#tpl-address-f{{because '__is_same(tpl_address::A, int)' evaluated to false}}
(&A::f)(a);
// expected-error@-1 {{no matching function for call to 'f'}} \
// expected-note@#tpl-address-f{{candidate template ignored: constraints not satisfied [with T = A]}} \
- // expected-note@#tpl-address-f{{because '__is_same(A, int)' evaluated to false}}
+ // expected-note@#tpl-address-f{{because '__is_same(tpl_address::A, int)' evaluated to false}}
(&A::g)(a);
(&A::g)(a, 0);
diff --git a/clang/test/SemaCXX/deleted-template-spec-diag.cpp b/clang/test/SemaCXX/deleted-template-spec-diag.cpp
new file mode 100644
index 0000000000000..fe224b83c808d
--- /dev/null
+++ b/clang/test/SemaCXX/deleted-template-spec-diag.cpp
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -std=c++11 -verify %s
+
+// https://github.com/llvm/llvm-project/issues/185693
+// Explicitly deleted function template specializations were incorrectly
+// reported as "implicitly deleted" in overload resolution diagnostics.
+
+template <typename T> void fred(const T &x);
+template <> void fred(const double &) = delete; // expected-note {{explicitly deleted}}
+
+int main() {
+ fred(8.0); // expected-error {{call to deleted function 'fred'}}
+}
diff --git a/clang/test/SemaCXX/dllexport-constrained-inherited-ctor.cpp b/clang/test/SemaCXX/dllexport-constrained-inherited-ctor.cpp
new file mode 100644
index 0000000000000..019f0a17bdf1e
--- /dev/null
+++ b/clang/test/SemaCXX/dllexport-constrained-inherited-ctor.cpp
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -triple x86_64-windows-msvc -fsyntax-only -fms-extensions -verify -std=c++20 %s
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -fsyntax-only -fms-extensions -verify -std=c++20 %s
+
+// expected-no-diagnostics
+
+// Regression test for https://github.com/llvm/llvm-project/issues/185924
+// dllexport should not attempt to instantiate inherited constructors whose
+// requires clause is not satisfied.
+//
+// This exercises two paths in checkClassLevelDLLAttribute:
+// 1) findInheritingConstructor must skip constrained-out base ctors
+// 2) dllexport propagated to the base template specialization must not
+// export members whose requires clause is not satisfied
+//
+// The constructor/method bodies are intentionally ill-formed when the
+// constraint is not satisfied, so that forced instantiation via dllexport
+// would produce an error without the correct fix.
+
+template <bool B>
+struct ConstrainedBase {
+ struct Enabler {};
+ ConstrainedBase(Enabler) requires(B) {}
+ ConstrainedBase() requires(B) : ConstrainedBase(Enabler{}) {}
+ ConstrainedBase(int);
+};
+
+struct __declspec(dllexport) ConstrainedChild : ConstrainedBase<false> {
+ using ConstrainedBase::ConstrainedBase;
+};
+
+// Non-constructor constrained method on a base template specialization.
+// When dllexport propagates to the base, methods whose requires clause
+// is not satisfied must be skipped.
+template <typename T>
+struct BaseWithConstrainedMethod {
+ void foo() requires(sizeof(T) > 100) { T::nonexistent(); }
+ void bar() {}
+};
+
+struct __declspec(dllexport) MethodChild : BaseWithConstrainedMethod<int> {};
diff --git a/clang/test/SemaCXX/gh134265.cpp b/clang/test/SemaCXX/gh134265.cpp
index 790165411c938..421197cf3d7f7 100644
--- a/clang/test/SemaCXX/gh134265.cpp
+++ b/clang/test/SemaCXX/gh134265.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 %s -verify=expected -fsyntax-only -triple=x86_64-unknown-linux-gnu
-// RUN: %clang_cc1 %s -verify=expected -fsyntax-only -triple=x86_64-unknown-linux-gnu -std=c++20
+// RUN: %clang_cc1 %s -verify=expected,noms -fsyntax-only -triple=x86_64-unknown-linux-gnu
+// RUN: %clang_cc1 %s -verify=expected,noms -fsyntax-only -triple=x86_64-unknown-linux-gnu -std=c++20
// RUN: %clang_cc1 %s -verify=expected,ms -fms-extensions -fms-compatibility -triple=x86_64-pc-windows-msvc -DMS
// Verify that clang doesn't emit additional errors when searching for
@@ -56,7 +56,34 @@ struct Final1 : BaseDelete1, BaseDelete2, BaseDestructor {
};
#endif // MS
+// Make sure there is no duplicate diagnostic for declared-only destructors
+// combined with new[].
+struct DeclaredOnly {
+ virtual ~DeclaredOnly(); // ms-error {{attempt to use a deleted function}}
+ static void operator delete(void* ptr) = delete; // ms-note {{explicitly marked deleted here}}
+};
+
+struct DeclaredOnlyArr {
+ virtual ~DeclaredOnlyArr();
+ static void operator delete[](void* ptr) = delete;
+};
+
void foo() {
Final* a = new Final[10]();
FinalExplicit* b = new FinalExplicit[10]();
+ DeclaredOnly *d = new DeclaredOnly[5]();
+ DeclaredOnlyArr *e = new DeclaredOnlyArr[5]();
}
+
+// Make sure there is no duplicate diagnostic for forward-declared destructors
+// and new[].
+namespace std { struct destroying_delete_t {}; }
+struct A {
+ void operator delete(
+ A*, //expected-error {{cannot cast 'D' to its private base class 'A'}}
+ std::destroying_delete_t);
+};
+struct B : private A { using A::operator delete; }; //expected-note {{declared private here}}
+struct D : B { virtual ~D(); }; //ms-note {{while checking implicit 'delete this' for virtual destructor}}
+void f() { new D[5]; }
+D::~D() {} // noms-note {{while checking implicit 'delete this' for virtual destructor}}
diff --git a/clang/test/SemaTemplate/GH176152.cpp b/clang/test/SemaTemplate/GH176152.cpp
new file mode 100644
index 0000000000000..7d61aa292982d
--- /dev/null
+++ b/clang/test/SemaTemplate/GH176152.cpp
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify %s
+
+template <class T> int f(T) {
+ struct MyClass {
+ static int staticField;
+ // expected-error@-1 {{static data member 'staticField' not allowed in local struct 'MyClass'}}
+ };
+ int MyClass::staticField = 42;
+ return 0;
+}
+
+int x = f(0);
diff --git a/clang/test/SemaTemplate/concepts-recursive-inst.cpp b/clang/test/SemaTemplate/concepts-recursive-inst.cpp
index f270e7b4e7912..029d15ff2cecf 100644
--- a/clang/test/SemaTemplate/concepts-recursive-inst.cpp
+++ b/clang/test/SemaTemplate/concepts-recursive-inst.cpp
@@ -73,8 +73,8 @@ auto it = begin(rng); // #BEGIN_CALL
// expected-note@#INF_BEGIN_EXPR {{while substituting deduced template arguments into function template 'begin'}}
// expected-note@#INF_BEGIN_EXPR {{in instantiation of requirement here}}
// expected-note@#INF_REQ {{while substituting template arguments into constraint expression here}}
-// expected-note@#INF_BEGIN {{while checking the satisfaction of concept 'Inf<struct my_range>' requested here}}
-// expected-note@#BEGIN_CALL {{while checking constraint satisfaction for template 'begin<struct my_range>' required here}}
+// expected-note@#INF_BEGIN {{while checking the satisfaction of concept 'Inf<DirectRecursiveCheck::my_range>' requested here}}
+// expected-note@#BEGIN_CALL {{while checking constraint satisfaction for template 'begin<DirectRecursiveCheck::my_range>' required here}}
// expected-note@#BEGIN_CALL {{while substituting deduced template arguments into function template}}
// Fallout of the failure is failed lookup, which is necessary to stop odd
diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp
index 321b69f79f2c8..429df756c1c4f 100644
--- a/clang/test/SemaTemplate/concepts.cpp
+++ b/clang/test/SemaTemplate/concepts.cpp
@@ -1659,6 +1659,25 @@ void foo() { call(""); }
}
+namespace GH186624 {
+
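+// Passing the constrained template 'encoder_interface' as a template template
+// argument, with T = unsigned satisfying C, should compile without diagnostics.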
+template <class T>
+concept C = __is_unsigned(T);
+
+template <C T>
+struct encoder_interface {};
+
+template <template <C> class CodecInterface, C T>
+CodecInterface<T>* create_codec() {
+ return nullptr;
+}
+
+encoder_interface<unsigned>* create_encoder() {
+ return create_codec<encoder_interface, unsigned>();
+}
+
+}
+
namespace GH170856 {
template <unsigned N, unsigned M> struct symbol_text {
@@ -1753,14 +1772,37 @@ void f() = delete;
struct Bar {};
-template <typename T> using Foo = Bar;
+template <typename> using Foo = Bar;
+
+template <int T>
+ requires true
+void f2() {}
+
+template <int T>
+ requires false
+void f2() = delete;
+
+template <int> constexpr auto Value = 1;
-template <typename T> void use() {
+template <template <typename> class> using FooTemp = Bar;
+
+template <typename T, int N, template <typename> class C> void use() {
f<Foo<T>>();
+ f2<Value<N>>();
+ f<FooTemp<C>>();
}
}
+namespace instantiation_dependent {
+ template <class T> concept C = sizeof(T) >= 1;
+ template <class U> using X = int;
+ template <class V> requires C<X<V&>> struct Y {};
+ Y<void> y;
+ // expected-error@-1 {{constraints not satisfied for class template 'Y' [with V = void]}}
+ // expected-note@-3 {{because substituted constraint expression is ill-formed: cannot form a reference to 'void'}}
+} // namespace instantiation_dependent
+
namespace GH174667 {
template<class T, class, class U>
diff --git a/clang/test/SemaTemplate/deduction-crash.cpp b/clang/test/SemaTemplate/deduction-crash.cpp
index 99ca0b365ff6f..e7018fd0d8338 100644
--- a/clang/test/SemaTemplate/deduction-crash.cpp
+++ b/clang/test/SemaTemplate/deduction-crash.cpp
@@ -172,3 +172,8 @@ namespace PR51872_part1 {
// expected-error@-1 {{no viable constructor or deduction guide for deduction of template arguments of 'T1'}}
// expected-note@-7 {{candidate template ignored: could not match 'PR51872_part1::T1<value-parameter-0-0>' against 'int'}}
}
+
+namespace GH177545 {
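+ // The malformed non-type template parameter below should be diagnosed rather
+ // than crash (this file collects deduction crash regressions).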
+ template<decltype(auto)()() volatile throw() -> char> // expected-error {{'decltype(auto)' can only be used as a return type in a function declaration}}
+ struct T2; // expected-error@* {{function cannot return function type 'auto () volatile throw() -> decltype(auto)'}}
+}
diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py
index 183b674dcddae..3b0b3092d424e 100644
--- a/clang/test/lit.cfg.py
+++ b/clang/test/lit.cfg.py
@@ -429,6 +429,8 @@ def calculate_arch_features(arch_string):
if config.clang_enable_cir:
config.available_features.add("cir-enabled")
+if config.use_xcselect:
+ config.available_features.add("xcselect")
# Tests that rely on chmod to restrict file permissions (e.g. write-permission
# checks) are unreliable when run as root, since root bypasses file permissions.
diff --git a/clang/test/lit.site.cfg.py.in b/clang/test/lit.site.cfg.py.in
index 3bdff42262164..cb35118167d99 100644
--- a/clang/test/lit.site.cfg.py.in
+++ b/clang/test/lit.site.cfg.py.in
@@ -47,6 +47,7 @@ config.ppc_linux_default_ieeelongdouble = @PPC_LINUX_DEFAULT_IEEELONGDOUBLE@
config.have_llvm_driver = @LLVM_TOOL_LLVM_DRIVER_BUILD@
config.spirv_tools_tests = @LLVM_INCLUDE_SPIRV_TOOLS_TESTS@
config.substitutions.append(("%llvm-version-major", "@LLVM_VERSION_MAJOR@"))
+config.use_xcselect = @CLANG_USE_XCSELECT@
import lit.llvm
lit.llvm.initialize(lit_config, config)
diff --git a/clang/tools/CMakeLists.txt b/clang/tools/CMakeLists.txt
index 6523471776cf1..891043ec31f77 100644
--- a/clang/tools/CMakeLists.txt
+++ b/clang/tools/CMakeLists.txt
@@ -15,6 +15,8 @@ add_clang_subdirectory(clang-linker-wrapper)
add_clang_subdirectory(clang-nvlink-wrapper)
add_clang_subdirectory(clang-offload-bundler)
add_clang_subdirectory(clang-scan-deps)
+add_clang_subdirectory(clang-ssaf-format)
+add_clang_subdirectory(clang-ssaf-linker)
add_clang_subdirectory(clang-sycl-linker)
add_clang_subdirectory(clang-installapi)
if(HAVE_CLANG_REPL_SUPPORT)
@@ -53,5 +55,3 @@ add_llvm_external_project(clang-tools-extra extra)
add_clang_subdirectory(libclang)
add_clang_subdirectory(offload-arch)
-add_clang_subdirectory(ssaf-format)
-add_clang_subdirectory(ssaf-linker)
diff --git a/clang/tools/ssaf-format/CMakeLists.txt b/clang/tools/clang-ssaf-format/CMakeLists.txt
similarity index 100%
rename from clang/tools/ssaf-format/CMakeLists.txt
rename to clang/tools/clang-ssaf-format/CMakeLists.txt
diff --git a/clang/tools/ssaf-format/SSAFFormat.cpp b/clang/tools/clang-ssaf-format/SSAFFormat.cpp
similarity index 98%
rename from clang/tools/ssaf-format/SSAFFormat.cpp
rename to clang/tools/clang-ssaf-format/SSAFFormat.cpp
index 9b5312e0e085b..497d7437a08ae 100644
--- a/clang/tools/ssaf-format/SSAFFormat.cpp
+++ b/clang/tools/clang-ssaf-format/SSAFFormat.cpp
@@ -15,6 +15,7 @@
#include "clang/ScalableStaticAnalysisFramework/Core/EntityLinker/TUSummaryEncoding.h"
#include "clang/ScalableStaticAnalysisFramework/Core/Serialization/JSONFormat.h"
#include "clang/ScalableStaticAnalysisFramework/Core/Serialization/SerializationFormatRegistry.h"
+#include "clang/ScalableStaticAnalysisFramework/SSAFForceLinker.h" // IWYU pragma: keep
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/CommandLine.h"
@@ -152,14 +153,11 @@ SerializationFormat *getFormatForExtension(llvm::StringRef Extension) {
return It->second.get();
}
- // SerializationFormats are uppercase while file extensions are lowercase.
- std::string CapitalizedExtension = Extension.upper();
-
- if (!isFormatRegistered(CapitalizedExtension)) {
+ if (!isFormatRegistered(Extension)) {
return nullptr;
}
- auto Format = makeFormat(CapitalizedExtension);
+ auto Format = makeFormat(Extension);
SerializationFormat *Result = Format.get();
assert(Result);
@@ -470,8 +468,6 @@ int main(int argc, const char **argv) {
loadPlugins();
- initializeJSONFormat();
-
if (ListFormats) {
listFormats();
} else {
diff --git a/clang/tools/ssaf-linker/CMakeLists.txt b/clang/tools/clang-ssaf-linker/CMakeLists.txt
similarity index 100%
rename from clang/tools/ssaf-linker/CMakeLists.txt
rename to clang/tools/clang-ssaf-linker/CMakeLists.txt
diff --git a/clang/tools/ssaf-linker/SSAFLinker.cpp b/clang/tools/clang-ssaf-linker/SSAFLinker.cpp
similarity index 96%
rename from clang/tools/ssaf-linker/SSAFLinker.cpp
rename to clang/tools/clang-ssaf-linker/SSAFLinker.cpp
index e0b1cfdc02160..3c57a2fe2cf98 100644
--- a/clang/tools/ssaf-linker/SSAFLinker.cpp
+++ b/clang/tools/clang-ssaf-linker/SSAFLinker.cpp
@@ -1,4 +1,4 @@
-//===--- tools/ssaf-linker/SSAFLinker.cpp - SSAF Linker -------------------===//
+//===- SSAFLinker.cpp - SSAF Linker ---------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -14,9 +14,9 @@
#include "clang/ScalableStaticAnalysisFramework/Core/EntityLinker/EntityLinker.h"
#include "clang/ScalableStaticAnalysisFramework/Core/EntityLinker/TUSummaryEncoding.h"
#include "clang/ScalableStaticAnalysisFramework/Core/Model/BuildNamespace.h"
-#include "clang/ScalableStaticAnalysisFramework/Core/Serialization/JSONFormat.h"
#include "clang/ScalableStaticAnalysisFramework/Core/Serialization/SerializationFormatRegistry.h"
#include "clang/ScalableStaticAnalysisFramework/Core/Support/ErrorBuilder.h"
+#include "clang/ScalableStaticAnalysisFramework/SSAFForceLinker.h" // IWYU pragma: keep
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/CommandLine.h"
@@ -133,14 +133,11 @@ SerializationFormat *getFormatForExtension(llvm::StringRef Extension) {
return It->second.get();
}
- // SerializationFormats are uppercase while file extensions are lowercase.
- std::string CapitalizedExtension = Extension.upper();
-
- if (!isFormatRegistered(CapitalizedExtension)) {
+ if (!isFormatRegistered(Extension)) {
return nullptr;
}
- auto Format = makeFormat(CapitalizedExtension);
+ auto Format = makeFormat(Extension);
SerializationFormat *Result = Format.get();
assert(Result);
@@ -304,8 +301,6 @@ int main(int argc, const char **argv) {
// Parse command-line arguments and exit with an error if they are invalid.
cl::ParseCommandLineOptions(argc, argv, "SSAF Linker\n");
- initializeJSONFormat();
-
llvm::TimerGroup LinkerTimers(ToolName, "SSAF Linker");
LinkerInput LI;
diff --git a/clang/unittests/AST/ByteCode/toAPValue.cpp b/clang/unittests/AST/ByteCode/toAPValue.cpp
index 3571dcc41ad27..702a07a638915 100644
--- a/clang/unittests/AST/ByteCode/toAPValue.cpp
+++ b/clang/unittests/AST/ByteCode/toAPValue.cpp
@@ -151,7 +151,8 @@ TEST(ToAPValue, FunctionPointers) {
const ValueDecl *D = getDecl("nullp");
ASSERT_NE(D, nullptr);
const Pointer &GP = getGlobalPtr("nullp");
- const auto &P = GP.deref<FunctionPointer>();
+ const auto &P = GP.deref<Pointer>();
+ ASSERT_TRUE(P.isZero());
APValue A = P.toAPValue(ASTCtx);
ASSERT_TRUE(A.isLValue());
ASSERT_TRUE(A.getLValueBase().isNull());
diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp
index 511db573cdcf3..f13c68c3002e2 100644
--- a/clang/unittests/Format/ConfigParseTest.cpp
+++ b/clang/unittests/Format/ConfigParseTest.cpp
@@ -231,6 +231,9 @@ TEST(ConfigParseTest, ParsesConfigurationBools) {
CHECK_PARSE_NESTED_BOOL(AlignConsecutiveShortCaseStatements, AcrossComments);
CHECK_PARSE_NESTED_BOOL(AlignConsecutiveShortCaseStatements, AlignCaseArrows);
CHECK_PARSE_NESTED_BOOL(AlignConsecutiveShortCaseStatements, AlignCaseColons);
+ CHECK_PARSE_NESTED_BOOL(AllowShortFunctionsOnASingleLine, Empty);
+ CHECK_PARSE_NESTED_BOOL(AllowShortFunctionsOnASingleLine, Inline);
+ CHECK_PARSE_NESTED_BOOL(AllowShortFunctionsOnASingleLine, Other);
CHECK_PARSE_NESTED_BOOL(BraceWrapping, AfterCaseLabel);
CHECK_PARSE_NESTED_BOOL(BraceWrapping, AfterClass);
CHECK_PARSE_NESTED_BOOL(BraceWrapping, AfterEnum);
@@ -685,20 +688,29 @@ TEST(ConfigParseTest, ParsesConfiguration) {
CHECK_PARSE("AllowShortBlocksOnASingleLine: true",
AllowShortBlocksOnASingleLine, FormatStyle::SBS_Always);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Inline;
CHECK_PARSE("AllowShortFunctionsOnASingleLine: None",
- AllowShortFunctionsOnASingleLine, FormatStyle::SFS_None);
+ AllowShortFunctionsOnASingleLine,
+ FormatStyle::ShortFunctionStyle());
CHECK_PARSE("AllowShortFunctionsOnASingleLine: Inline",
- AllowShortFunctionsOnASingleLine, FormatStyle::SFS_Inline);
+ AllowShortFunctionsOnASingleLine,
+ FormatStyle::ShortFunctionStyle::setEmptyAndInline());
CHECK_PARSE("AllowShortFunctionsOnASingleLine: Empty",
- AllowShortFunctionsOnASingleLine, FormatStyle::SFS_Empty);
+ AllowShortFunctionsOnASingleLine,
+ FormatStyle::ShortFunctionStyle::setEmptyOnly());
CHECK_PARSE("AllowShortFunctionsOnASingleLine: All",
- AllowShortFunctionsOnASingleLine, FormatStyle::SFS_All);
+ AllowShortFunctionsOnASingleLine,
+ FormatStyle::ShortFunctionStyle::setAll());
+ CHECK_PARSE("AllowShortFunctionsOnASingleLine: InlineOnly",
+ AllowShortFunctionsOnASingleLine,
+ FormatStyle::ShortFunctionStyle::setInlineOnly());
+
// For backward compatibility:
CHECK_PARSE("AllowShortFunctionsOnASingleLine: false",
- AllowShortFunctionsOnASingleLine, FormatStyle::SFS_None);
+ AllowShortFunctionsOnASingleLine,
+ FormatStyle::ShortFunctionStyle());
CHECK_PARSE("AllowShortFunctionsOnASingleLine: true",
- AllowShortFunctionsOnASingleLine, FormatStyle::SFS_All);
+ AllowShortFunctionsOnASingleLine,
+ FormatStyle::ShortFunctionStyle::setAll());
Style.AllowShortLambdasOnASingleLine = FormatStyle::SLS_All;
CHECK_PARSE("AllowShortLambdasOnASingleLine: None",
@@ -715,6 +727,16 @@ TEST(ConfigParseTest, ParsesConfiguration) {
CHECK_PARSE("AllowShortLambdasOnASingleLine: true",
AllowShortLambdasOnASingleLine, FormatStyle::SLS_All);
+ Style.AllowShortRecordOnASingleLine = FormatStyle::SRS_EmptyAndAttached;
+ CHECK_PARSE("AllowShortRecordOnASingleLine: Never",
+ AllowShortRecordOnASingleLine, FormatStyle::SRS_Never);
+ CHECK_PARSE("AllowShortRecordOnASingleLine: EmptyAndAttached",
+ AllowShortRecordOnASingleLine, FormatStyle::SRS_EmptyAndAttached);
+ CHECK_PARSE("AllowShortRecordOnASingleLine: Empty",
+ AllowShortRecordOnASingleLine, FormatStyle::SRS_Empty);
+ CHECK_PARSE("AllowShortRecordOnASingleLine: Always",
+ AllowShortRecordOnASingleLine, FormatStyle::SRS_Always);
+
Style.SpaceAroundPointerQualifiers = FormatStyle::SAPQ_Both;
CHECK_PARSE("SpaceAroundPointerQualifiers: Default",
SpaceAroundPointerQualifiers, FormatStyle::SAPQ_Default);
@@ -1021,13 +1043,6 @@ TEST(ConfigParseTest, ParsesConfiguration) {
StatementAttributeLikeMacros,
std::vector<std::string>({"emit", "Q_EMIT"}));
- Style.Macros.clear();
- CHECK_PARSE("{Macros: [foo]}", Macros, std::vector<std::string>({"foo"}));
- std::vector<std::string> GoogleMacros;
- GoogleMacros.push_back("ASSIGN_OR_RETURN(a, b)=a = (b)");
- GoogleMacros.push_back("ASSIGN_OR_RETURN(a, b, c)=a = (b); if (x) return c");
- CHECK_PARSE("BasedOnStyle: Google", Macros, GoogleMacros);
-
Style.StatementMacros.clear();
CHECK_PARSE("StatementMacros: [QUNUSED]", StatementMacros,
std::vector<std::string>{"QUNUSED"});
@@ -1035,6 +1050,7 @@ TEST(ConfigParseTest, ParsesConfiguration) {
std::vector<std::string>({"QUNUSED", "QT_REQUIRE_VERSION"}));
CHECK_PARSE_LIST(JavaImportGroups);
+ CHECK_PARSE_LIST(Macros);
CHECK_PARSE_LIST(MacrosSkippedByRemoveParentheses);
CHECK_PARSE_LIST(NamespaceMacros);
CHECK_PARSE_LIST(ObjCPropertyAttributeOrder);
diff --git a/clang/unittests/Format/DefinitionBlockSeparatorTest.cpp b/clang/unittests/Format/DefinitionBlockSeparatorTest.cpp
index 1f4245b703fb7..b1b4b1d047523 100644
--- a/clang/unittests/Format/DefinitionBlockSeparatorTest.cpp
+++ b/clang/unittests/Format/DefinitionBlockSeparatorTest.cpp
@@ -540,7 +540,7 @@ TEST_F(DefinitionBlockSeparatorTest, Leave) {
TEST_F(DefinitionBlockSeparatorTest, CSharp) {
FormatStyle Style = getLLVMStyle(FormatStyle::LK_CSharp);
Style.SeparateDefinitionBlocks = FormatStyle::SDS_Always;
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
+ Style.AllowShortFunctionsOnASingleLine = FormatStyle::ShortFunctionStyle();
Style.AllowShortEnumsOnASingleLine = false;
verifyFormat("namespace {\r\n"
"public class SomeTinyClass {\r\n"
@@ -586,7 +586,7 @@ TEST_F(DefinitionBlockSeparatorTest, CSharp) {
TEST_F(DefinitionBlockSeparatorTest, JavaScript) {
FormatStyle Style = getLLVMStyle(FormatStyle::LK_JavaScript);
Style.SeparateDefinitionBlocks = FormatStyle::SDS_Always;
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
+ Style.AllowShortFunctionsOnASingleLine = FormatStyle::ShortFunctionStyle();
Style.AllowShortEnumsOnASingleLine = false;
verifyFormat("export const enum Foo {\n"
" A = 1,\n"
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index d066b3f482b21..4be9b3ea42930 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -388,7 +388,8 @@ TEST_F(FormatTest, RemovesEmptyLines) {
"} // namespace");
FormatStyle Style = getLLVMStyle();
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_All;
+ Style.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setAll();
Style.MaxEmptyLinesToKeep = 2;
Style.BreakBeforeBraces = FormatStyle::BS_Custom;
Style.BraceWrapping.AfterClass = true;
@@ -3514,13 +3515,14 @@ TEST_F(FormatTest, MultiLineControlStatements) {
Style.BraceWrapping.AfterFunction = true;
Style.BraceWrapping.AfterStruct = false;
Style.BraceWrapping.AfterControlStatement = FormatStyle::BWACS_MultiLine;
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_All;
+ Style.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setAll();
Style.ColumnLimit = 80;
verifyFormat("void shortfunction() { bar(); }", Style);
verifyFormat("struct T shortfunction() { return bar(); }", Style);
verifyFormat("struct T {};", Style);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
+ Style.AllowShortFunctionsOnASingleLine = FormatStyle::ShortFunctionStyle();
verifyFormat("void shortfunction()\n"
"{\n"
" bar();\n"
@@ -3535,7 +3537,8 @@ TEST_F(FormatTest, MultiLineControlStatements) {
Style.BraceWrapping.AfterFunction = false;
Style.BraceWrapping.AfterStruct = true;
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_All;
+ Style.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setAll();
verifyFormat("void shortfunction() { bar(); }", Style);
verifyFormat("struct T shortfunction() { return bar(); }", Style);
verifyFormat("struct T\n"
@@ -3543,7 +3546,7 @@ TEST_F(FormatTest, MultiLineControlStatements) {
"};",
Style);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
+ Style.AllowShortFunctionsOnASingleLine = FormatStyle::ShortFunctionStyle();
verifyFormat("void shortfunction() {\n"
" bar();\n"
"}",
@@ -4353,7 +4356,7 @@ TEST_F(FormatTest, FormatsNamespaces) {
FormatStyle ShortInlineFunctions = getLLVMStyle();
ShortInlineFunctions.NamespaceIndentation = FormatStyle::NI_All;
ShortInlineFunctions.AllowShortFunctionsOnASingleLine =
- FormatStyle::SFS_Inline;
+ FormatStyle::ShortFunctionStyle::setEmptyAndInline();
verifyFormat("namespace {\n"
" void f() {\n"
" return;\n"
@@ -6698,6 +6701,13 @@ TEST_F(FormatTest, IndentPreprocessorDirectives) {
"\n"
"#define FGHIJK",
Style);
+
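+ // A trailing comment after #endif should not defeat include-guard detection,
+ // so the directives inside stay unindented.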
+ verifyFormat("#ifndef FOO_H\n"
+ "#define FOO_H\n"
+ "#include <iostream>\n"
+ "#endif\n"
+ "// comment",
+ Style);
}
TEST_F(FormatTest, FormatAlignInsidePreprocessorElseBlock) {
@@ -8510,7 +8520,7 @@ TEST_F(FormatTest, BreakConstructorInitializersAfterColon) {
"};",
Style);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
+ Style.AllowShortFunctionsOnASingleLine = FormatStyle::ShortFunctionStyle();
verifyNoChange("SomeClass::Constructor() :\n"
" a(a), b(b), c(c) {\n"
"}",
@@ -8521,7 +8531,8 @@ TEST_F(FormatTest, BreakConstructorInitializersAfterColon) {
Style);
Style.ColumnLimit = 80;
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_All;
+ Style.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setAll();
Style.ConstructorInitializerIndentWidth = 2;
verifyFormat("SomeClass::Constructor() : a(a), b(b), c(c) {}", Style);
verifyFormat("SomeClass::Constructor() :\n"
@@ -12614,7 +12625,8 @@ TEST_F(FormatTest, UnderstandsAttributes) {
verifyFormat("__attr1(nodebug) ::qualified_type f();", CustomAttrs);
// Check that these are not parsed as function declarations:
- CustomAttrs.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
+ CustomAttrs.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle();
CustomAttrs.BreakBeforeBraces = FormatStyle::BS_Allman;
verifyFormat("SomeType s(InitValue);", CustomAttrs);
verifyFormat("SomeType s{InitValue};", CustomAttrs);
@@ -12722,7 +12734,8 @@ TEST_F(FormatTest, UnderstandsSquareAttributes) {
// Make sure we do not parse attributes as lambda introducers.
FormatStyle MultiLineFunctions = getLLVMStyle();
- MultiLineFunctions.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
+ MultiLineFunctions.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle();
verifyFormat("[[unused]] int b() {\n"
" return 42;\n"
"}",
@@ -14826,7 +14839,8 @@ TEST_F(FormatTest, FormatsBracedListsInColumnLayout) {
TEST_F(FormatTest, PullTrivialFunctionDefinitionsIntoSingleLine) {
FormatStyle DoNotMerge = getLLVMStyle();
- DoNotMerge.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
+ DoNotMerge.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle();
verifyFormat("void f() { return 42; }");
verifyFormat("void f() {\n"
@@ -14897,7 +14911,7 @@ TEST_F(FormatTest, PullTrivialFunctionDefinitionsIntoSingleLine) {
FormatStyle DoNotMergeNoColumnLimit = NoColumnLimit;
DoNotMergeNoColumnLimit.AllowShortFunctionsOnASingleLine =
- FormatStyle::SFS_None;
+ FormatStyle::ShortFunctionStyle();
verifyFormat("A() : b(0) {\n"
"}",
DoNotMergeNoColumnLimit);
@@ -14945,7 +14959,8 @@ TEST_F(FormatTest, PullTrivialFunctionDefinitionsIntoSingleLine) {
TEST_F(FormatTest, PullEmptyFunctionDefinitionsIntoSingleLine) {
FormatStyle MergeEmptyOnly = getLLVMStyle();
- MergeEmptyOnly.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Empty;
+ MergeEmptyOnly.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyOnly();
verifyFormat("class C {\n"
" int f() {}\n"
"};",
@@ -14974,7 +14989,8 @@ TEST_F(FormatTest, PullEmptyFunctionDefinitionsIntoSingleLine) {
TEST_F(FormatTest, PullInlineFunctionDefinitionsIntoSingleLine) {
FormatStyle MergeInlineOnly = getLLVMStyle();
- MergeInlineOnly.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Inline;
+ MergeInlineOnly.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyAndInline();
verifyFormat("class C {\n"
" int f() { return 42; }\n"
"};",
@@ -15083,10 +15099,116 @@ TEST_F(FormatTest, PullInlineFunctionDefinitionsIntoSingleLine) {
MergeInlineOnly);
}
+TEST_F(FormatTest, CustomShortFunctionOptions) {
+ FormatStyle CustomEmpty = getLLVMStyle();
+ CustomEmpty.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyOnly();
+
+ // Empty functions should be on a single line
+ verifyFormat("int f() {}", CustomEmpty);
+ verifyFormat("class C {\n"
+ " int f() {}\n"
+ "};",
+ CustomEmpty);
+
+ // Non-empty functions should be multi-line
+ verifyFormat("int f() {\n"
+ " return 42;\n"
+ "}",
+ CustomEmpty);
+ verifyFormat("class C {\n"
+ " int f() {\n"
+ " return 42;\n"
+ " }\n"
+ "};",
+ CustomEmpty);
+
+ // Test with a comment-only body
+ verifyFormat("void f3() { /* comment */ }", CustomEmpty);
+
+ // Test with AfterFunction = true
+ CustomEmpty.BreakBeforeBraces = FormatStyle::BS_Custom;
+ CustomEmpty.BraceWrapping.AfterFunction = true;
+ verifyFormat("int f() {}", CustomEmpty);
+ verifyFormat("int g()\n"
+ "{\n"
+ " return 42;\n"
+ "}",
+ CustomEmpty);
+
+ // Test with Inline = true, All = false
+ FormatStyle CustomInline = getLLVMStyle();
+ CustomInline.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setInlineOnly();
+
+ verifyFormat("class C {\n"
+ " int f() {}\n"
+ "};",
+ CustomInline);
+
+ // Non-empty inline functions should be single-line
+ verifyFormat("class C {\n"
+ " int f() { return 42; }\n"
+ "};",
+ CustomInline);
+
+ // Non-inline functions should be multi-line
+ verifyFormat("int f() {\n"
+ " return 42;\n"
+ "}",
+ CustomInline);
+ verifyFormat("int g() {\n"
+ "}",
+ CustomInline);
+
+ // Test with All = true
+ FormatStyle CustomAll = getLLVMStyle();
+ CustomAll.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setAll();
+
+ // All functions should be on a single line if they fit
+ verifyFormat("int f() { return 42; }", CustomAll);
+ verifyFormat("int g() { return f() + h(); }", CustomAll);
+ verifyFormat("class C {\n"
+ " int f() { return 42; }\n"
+ "};",
+ CustomAll);
+
+ verifyFormat("int f() {}", CustomAll);
+ verifyFormat("class C {\n"
+ " int f() {}\n"
+ "};",
+ CustomAll);
+
+ // Test various combinations
+ FormatStyle CustomMixed = getLLVMStyle();
+ CustomMixed.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyAndInline();
+
+ // Empty functions should be on a single line
+ verifyFormat("int f() {}", CustomMixed);
+ verifyFormat("class C {\n"
+ " int f() {}\n"
+ "};",
+ CustomMixed);
+
+ // Inline non-empty functions should be on a single line
+ verifyFormat("class C {\n"
+ " int f() { return 42; }\n"
+ "};",
+ CustomMixed);
+
+ // Non-inline non-empty functions should be multi-line
+ verifyFormat("int f() {\n"
+ " return 42;\n"
+ "}",
+ CustomMixed);
+}
+
TEST_F(FormatTest, PullInlineOnlyFunctionDefinitionsIntoSingleLine) {
FormatStyle MergeInlineOnly = getLLVMStyle();
MergeInlineOnly.AllowShortFunctionsOnASingleLine =
- FormatStyle::SFS_InlineOnly;
+ FormatStyle::ShortFunctionStyle::setInlineOnly();
verifyFormat("class C {\n"
" int f() { return 42; }\n"
"};",
@@ -15138,7 +15260,7 @@ TEST_F(FormatTest, PullInlineOnlyFunctionDefinitionsIntoSingleLine) {
TEST_F(FormatTest, SplitEmptyFunction) {
FormatStyle Style = getLLVMStyleWithColumns(40);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
+ Style.AllowShortFunctionsOnASingleLine = FormatStyle::ShortFunctionStyle();
Style.BreakBeforeBraces = FormatStyle::BS_Custom;
Style.BraceWrapping.AfterFunction = true;
Style.BraceWrapping.SplitEmptyFunction = false;
@@ -15157,7 +15279,8 @@ TEST_F(FormatTest, SplitEmptyFunction) {
"}",
Style);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Empty;
+ Style.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyOnly();
verifyFormat("int f() {}", Style);
verifyFormat("int aaaaaaaaaaaaaa(int bbbbbbbbbbbbbb)\n"
"{}",
@@ -15168,7 +15291,8 @@ TEST_F(FormatTest, SplitEmptyFunction) {
"}",
Style);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Inline;
+ Style.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyAndInline();
verifyFormat("class Foo {\n"
" int f() {}\n"
"};",
@@ -15177,6 +15301,10 @@ TEST_F(FormatTest, SplitEmptyFunction) {
" int f() { return 0; }\n"
"};",
Style);
+ verifyFormat("class Foo {\n"
+ " int f() { return 0; }\n"
+ "};",
+ Style);
verifyFormat("class Foo {\n"
" int aaaaaaaaaaaaaa(int bbbbbbbbbbbbbb)\n"
" {}\n"
@@ -15190,7 +15318,8 @@ TEST_F(FormatTest, SplitEmptyFunction) {
"};",
Style);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_All;
+ Style.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setAll();
verifyFormat("int f() {}", Style);
verifyFormat("int f() { return 0; }", Style);
verifyFormat("int aaaaaaaaaaaaaa(int bbbbbbbbbbbbbb)\n"
@@ -15205,7 +15334,7 @@ TEST_F(FormatTest, SplitEmptyFunction) {
TEST_F(FormatTest, SplitEmptyFunctionButNotRecord) {
FormatStyle Style = getLLVMStyleWithColumns(40);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
+ Style.AllowShortFunctionsOnASingleLine = FormatStyle::ShortFunctionStyle();
Style.BreakBeforeBraces = FormatStyle::BS_Custom;
Style.BraceWrapping.AfterFunction = true;
Style.BraceWrapping.SplitEmptyFunction = true;
@@ -15235,7 +15364,7 @@ TEST_F(FormatTest, SplitEmptyFunctionButNotRecord) {
TEST_F(FormatTest, MergeShortFunctionBody) {
auto Style = getLLVMStyle();
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
+ Style.AllowShortFunctionsOnASingleLine = FormatStyle::ShortFunctionStyle();
Style.AllowShortBlocksOnASingleLine = FormatStyle::SBS_Always;
Style.BreakBeforeBraces = FormatStyle::BS_Custom;
Style.BraceWrapping.AfterFunction = true;
@@ -15247,7 +15376,8 @@ TEST_F(FormatTest, MergeShortFunctionBody) {
TEST_F(FormatTest, KeepShortFunctionAfterPPElse) {
FormatStyle Style = getLLVMStyle();
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_All;
+ Style.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setAll();
verifyFormat("#ifdef A\n"
"int f() {}\n"
"#else\n"
@@ -15514,6 +15644,101 @@ TEST_F(FormatTest, NeverMergeShortRecords) {
Style);
}
+TEST_F(FormatTest, AllowShortRecordOnASingleLine) {
+ auto Style = getLLVMStyle();
+ EXPECT_EQ(Style.AllowShortRecordOnASingleLine,
+ FormatStyle::SRS_EmptyAndAttached);
+
+ Style.AllowShortRecordOnASingleLine = FormatStyle::SRS_Never;
+ verifyFormat("class foo {\n"
+ "};\n"
+ "class bar {\n"
+ " int i;\n"
+ "};",
+ Style);
+ Style.BreakBeforeBraces = FormatStyle::BS_Custom;
+ Style.BraceWrapping.AfterClass = true;
+ verifyFormat("class foo\n"
+ "{\n"
+ "};\n"
+ "class bar\n"
+ "{\n"
+ " int i;\n"
+ "};",
+ Style);
+ Style.BraceWrapping.SplitEmptyRecord = false;
+ verifyFormat("class foo\n"
+ "{};",
+ Style);
+
+ Style = getLLVMStyle();
+ Style.AllowShortRecordOnASingleLine = FormatStyle::SRS_Empty;
+ verifyFormat("class foo {};\n"
+ "class bar {\n"
+ " int i;\n"
+ "};",
+ Style);
+ Style.BreakBeforeBraces = FormatStyle::BS_Custom;
+ Style.BraceWrapping.AfterClass = true;
+ verifyFormat("class foo\n"
+ "{\n"
+ "};\n"
+ "class bar\n"
+ "{\n"
+ " int i;\n"
+ "};",
+ Style);
+ Style.BraceWrapping.SplitEmptyRecord = false;
+ verifyFormat("class foo {};", Style);
+
+ Style = getLLVMStyle();
+ Style.AllowShortRecordOnASingleLine = FormatStyle::SRS_Always;
+ verifyFormat("class foo {};\n"
+ "class bar { int i; };",
+ Style);
+ Style.BreakBeforeBraces = FormatStyle::BS_Custom;
+ Style.BraceWrapping.AfterClass = true;
+ verifyFormat("class foo\n"
+ "{\n"
+ "};\n"
+ "class bar { int i; };",
+ Style);
+ Style.BraceWrapping.SplitEmptyRecord = false;
+ verifyFormat("class foo {};", Style);
+
+ Style = getLLVMStyle();
+ Style.AllowShortBlocksOnASingleLine = FormatStyle::SBS_Always;
+ Style.BreakBeforeBraces = FormatStyle::BS_Custom;
+ Style.BraceWrapping.AfterClass = true;
+
+ Style.AllowShortRecordOnASingleLine = FormatStyle::SRS_Never;
+ verifyFormat("class foo\n"
+ "{ int i; };",
+ Style);
+ Style.AllowShortRecordOnASingleLine = FormatStyle::SRS_Empty;
+ verifyFormat("class foo\n"
+ "{ int i; };",
+ Style);
+ Style.AllowShortRecordOnASingleLine = FormatStyle::SRS_Always;
+ verifyFormat("class foo\n"
+ "{\n"
+ "};\n"
+ "class foo { int i; };",
+ Style);
+
+ Style = getLLVMStyle();
+ Style.BraceWrapping.SplitEmptyRecord = false;
+ Style.BreakBeforeBraces = FormatStyle::BS_Custom;
+ Style.BraceWrapping.AfterClass = true;
+ Style.AllowShortRecordOnASingleLine = FormatStyle::SRS_Always;
+ verifyFormat("class foo\n"
+ "{\n"
+ " int i;\n"
+ " int j;\n"
+ "};",
+ Style);
+}
+
TEST_F(FormatTest, UnderstandContextOfRecordTypeKeywords) {
// Elaborate type variable declarations.
verifyFormat("struct foo a = {bar};\nint n;");
@@ -17125,6 +17350,16 @@ TEST_F(FormatTest, ConfigurableUseOfTab) {
verifyFormat("int aaaaaaaaaa = bbbbbbbbbbbbbbbbbbbb\n"
" + cccccccccccccccccccc;",
Tab);
+
+ Tab.BreakBeforeBraces = FormatStyle::BS_Custom;
+ Tab.BraceWrapping.BeforeLambdaBody = true;
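+ // With BreakBeforeBraces: Custom and BeforeLambdaBody, the tab-indented
+ // lambda body should be left unchanged.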
+ verifyNoChange("example(\n"
+ "\t[]\n"
+ "\t{\n"
+ "\t\t// foo\n"
+ "\t\t// bar\n"
+ "\t});",
+ Tab);
}
TEST_F(FormatTest, ZeroTabWidth) {
@@ -21603,7 +21838,7 @@ TEST_F(FormatTest, WhitesmithsBraceBreaking) {
// Make a few changes to the style for testing purposes
WhitesmithsBraceStyle.AllowShortFunctionsOnASingleLine =
- FormatStyle::SFS_Empty;
+ FormatStyle::ShortFunctionStyle::setEmptyOnly();
WhitesmithsBraceStyle.AllowShortLambdasOnASingleLine = FormatStyle::SLS_None;
// FIXME: this test case can't decide whether there should be a blank line
@@ -22694,6 +22929,68 @@ TEST_F(FormatTest, CatchAlignArrayOfStructuresLeftAlignment) {
Style);
}
+TEST_F(FormatTest, AlignArrayOfStructuresGithubIssues) {
+ // https://github.com/llvm/llvm-project/issues/138151
+ // Summary: Aligning arrays of structures with UseTab: AlignWithSpaces does
+ // not use spaces to align columns
+ FormatStyle Style = getGoogleStyle();
+ Style.AlignArrayOfStructures = FormatStyle::AIAS_Left;
+ Style.UseTab = FormatStyle::UT_AlignWithSpaces;
+ Style.IndentWidth = 4;
+ Style.TabWidth = 4;
+
+ verifyFormat(
+ "std::vector<Foo> foos = {\n"
+ "\t{LONG_NAME, 0, i | j},\n"
+ "\t{LONG_NAME, 0, i | j},\n"
+ "\t{LONGER_NAME, 0, i | j},\n"
+ "\t{LONGER_NAME, 0, i },\n"
+ "\t{THIS_IS_A_VERY_LONG_NAME, 0, j },\n"
+ "\t{LONGER_NAME, THIS_IS_A_VERY_LONG_NAME, i },\n"
+ "\t{LONG_NAME, THIS_IS_A_VERY_LONG_NAME, j }\n"
+ "};\n",
+ Style);
+
+ // https://github.com/llvm/llvm-project/issues/85937
+ // Summary: Macro escaped newlines are not aligned properly when both
+ // AlignEscapedNewLines and AlignArrayOfStructures are used
+ Style = getLLVMStyleWithColumns(80);
+ Style.AlignEscapedNewlines = FormatStyle::ENAS_Left;
+ Style.AlignArrayOfStructures = FormatStyle::AIAS_Left;
+
+ verifyFormat(R"(
+#define DEFINE_COMMAND_PROCESS_TABLE(Enum) \
+ const STExample TCommand::EXPL_MAIN[] = { \
+ {Enum::GetName(), " shows help " }, \
+ {Enum::GetAttribute(), " do something "}, \
+ {Enum::GetState(), " do whatever " }, \
+ };
+)",
+ Style);
+
+ // https://github.com/llvm/llvm-project/issues/53442
+ // Summary: alignment of columns does not use spaces when UseTab:
+ // AlignWithSpaces
+ Style = getLLVMStyle();
+ Style.AlignArrayOfStructures = FormatStyle::AIAS_Left;
+ Style.IndentWidth = 4;
+ Style.TabWidth = 4;
+ Style.UseTab = FormatStyle::UT_AlignWithSpaces;
+ Style.BreakBeforeBraces = FormatStyle::BS_Allman;
+
+ verifyFormat(
+ "const map<string, int64_t> CoreReport::GetGameCountersRolloverInfo()\n"
+ "{\n"
+ "\tstatic map<string, int64_t> counterRolloverInfo{\n"
+ "\t\t{\"CashIn\", 4000000000},\n"
+ "\t\t{\"CoinIn\", 4000000000},\n"
+ "\t\t{\"QuantityMultiProgressive\", 65535 },\n"
+ "\t};\n"
+ "\treturn counterRolloverInfo;\n"
+ "}\n",
+ Style);
+}
+
TEST_F(FormatTest, UnderstandsPragmas) {
verifyFormat("#pragma omp reduction(| : var)");
verifyFormat("#pragma omp reduction(+ : var)");
@@ -23161,7 +23458,7 @@ TEST_F(FormatTest, BreakConstructorInitializersBeforeComma) {
"}",
Style);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
+ Style.AllowShortFunctionsOnASingleLine = FormatStyle::ShortFunctionStyle();
verifyFormat("SomeClass::Constructor()\n"
" : a(a)\n"
" , b(b)\n"
@@ -23172,7 +23469,8 @@ TEST_F(FormatTest, BreakConstructorInitializersBeforeComma) {
Style);
Style.ColumnLimit = 80;
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_All;
+ Style.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setAll();
Style.ConstructorInitializerIndentWidth = 2;
verifyFormat("SomeClass::Constructor()\n"
" : a(a)\n"
@@ -27533,7 +27831,7 @@ TEST_F(FormatTest, FormatDecayCopy) {
TEST_F(FormatTest, Cpp20ModulesSupport) {
FormatStyle Style = getLLVMStyle();
Style.AllowShortBlocksOnASingleLine = FormatStyle::SBS_Never;
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
+ Style.AllowShortFunctionsOnASingleLine = FormatStyle::ShortFunctionStyle();
verifyFormat("export import foo;", Style);
verifyFormat("export import foo:bar;", Style);
@@ -29352,6 +29650,14 @@ TEST_F(FormatTest, UnbalancedAngleBrackets) {
verifyNoCrash("typename foo<bar>::value, const String &>::type f();",
getLLVMStyleWithColumns(50));
+
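+ // Heavily malformed input with unbalanced brackets must not crash.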
+ verifyNoCrash(
+ ">\n"
+ " f({\n"
+ " {}inner> () __attribute __attribute__((foo())) int foo(void)\n"
+ " {};\n"
+ " }, );",
+ getLLVMStyleWithColumns(70));
}
TEST_F(FormatTest, LambdaArrowAsTrailingReturnArrow) {
diff --git a/clang/unittests/Format/FormatTestCSharp.cpp b/clang/unittests/Format/FormatTestCSharp.cpp
index 042e223b9ee84..805fe6f9bd3c2 100644
--- a/clang/unittests/Format/FormatTestCSharp.cpp
+++ b/clang/unittests/Format/FormatTestCSharp.cpp
@@ -1673,7 +1673,8 @@ TEST_F(FormatTestCSharp, EmptyShortBlock) {
TEST_F(FormatTestCSharp, ShortFunctions) {
FormatStyle Style = getLLVMStyle(FormatStyle::LK_CSharp);
Style.NamespaceIndentation = FormatStyle::NI_All;
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Inline;
+ Style.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyAndInline();
verifyFormat("interface Interface {\n"
" void f() { return; }\n"
"};",
diff --git a/clang/unittests/Format/FormatTestJS.cpp b/clang/unittests/Format/FormatTestJS.cpp
index 4847151c14b33..b405a73bce086 100644
--- a/clang/unittests/Format/FormatTestJS.cpp
+++ b/clang/unittests/Format/FormatTestJS.cpp
@@ -1019,7 +1019,8 @@ TEST_F(FormatTestJS, TrailingCommaInsertion) {
TEST_F(FormatTestJS, FunctionLiterals) {
FormatStyle Style = getGoogleStyle(FormatStyle::LK_JavaScript);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Inline;
+ Style.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyAndInline();
verifyFormat("doFoo(function() {});");
verifyFormat("doFoo(function() { return 1; });", Style);
verifyFormat("var func = function() {\n"
@@ -1132,7 +1133,8 @@ TEST_F(FormatTestJS, DontWrapEmptyLiterals) {
TEST_F(FormatTestJS, InliningFunctionLiterals) {
FormatStyle Style = getGoogleStyle(FormatStyle::LK_JavaScript);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Inline;
+ Style.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyAndInline();
verifyFormat("var func = function() {\n"
" return 1;\n"
"};",
@@ -1147,7 +1149,8 @@ TEST_F(FormatTestJS, InliningFunctionLiterals) {
"}",
Style);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_All;
+ Style.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setAll();
verifyFormat("var func = function() { return 1; };", Style);
verifyFormat("var func = doSomething(function() { return 1; });", Style);
verifyFormat(
@@ -1158,7 +1161,7 @@ TEST_F(FormatTestJS, InliningFunctionLiterals) {
"}",
Style);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_None;
+ Style.AllowShortFunctionsOnASingleLine = FormatStyle::ShortFunctionStyle();
verifyFormat("var func = function() {\n"
" return 1;\n"
"};",
@@ -1180,7 +1183,8 @@ TEST_F(FormatTestJS, InliningFunctionLiterals) {
"}",
Style);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Empty;
+ Style.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyOnly();
verifyFormat("var func = function() {\n"
" return 1;\n"
"};",
@@ -1189,7 +1193,8 @@ TEST_F(FormatTestJS, InliningFunctionLiterals) {
TEST_F(FormatTestJS, MultipleFunctionLiterals) {
FormatStyle Style = getGoogleStyle(FormatStyle::LK_JavaScript);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_All;
+ Style.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setAll();
verifyFormat("promise.then(\n"
" function success() {\n"
" doFoo();\n"
diff --git a/clang/unittests/Format/FormatTestJava.cpp b/clang/unittests/Format/FormatTestJava.cpp
index 29890aa863569..fa51e0421d714 100644
--- a/clang/unittests/Format/FormatTestJava.cpp
+++ b/clang/unittests/Format/FormatTestJava.cpp
@@ -596,7 +596,8 @@ TEST_F(FormatTestJava, RetainsLogicalShifts) {
TEST_F(FormatTestJava, ShortFunctions) {
FormatStyle Style = getLLVMStyle(FormatStyle::LK_Java);
- Style.AllowShortFunctionsOnASingleLine = FormatStyle::SFS_Inline;
+ Style.AllowShortFunctionsOnASingleLine =
+ FormatStyle::ShortFunctionStyle::setEmptyAndInline();
verifyFormat("enum Enum {\n"
" E1,\n"
" E2;\n"
diff --git a/clang/unittests/Format/FormatTestMacroExpansion.cpp b/clang/unittests/Format/FormatTestMacroExpansion.cpp
index c00607f0b7115..d391fe3d715c3 100644
--- a/clang/unittests/Format/FormatTestMacroExpansion.cpp
+++ b/clang/unittests/Format/FormatTestMacroExpansion.cpp
@@ -58,18 +58,10 @@ TEST_F(FormatTestMacroExpansion, UnexpandConfiguredMacros) {
verifyFormat("ASSIGN_OR_RETURN(MySomewhatLongType *variable,\n"
" MySomewhatLongFunction(SomethingElse()));",
Style);
- verifyFormat(
- "ASSIGN_OR_RETURN(MySomewhatLongType *variable,\n"
- " MySomewhatLongFunction(SomethingElse()), RetMe());",
- Style);
-
- verifyFormat(
- "void f() {\n"
- " ASSIGN_OR_RETURN(MySomewhatLongType* variable,\n"
- " MySomewhatLongFunction(SomethingElse()));\n"
- " ASSIGN_OR_RETURN(MySomewhatLongType* variable,\n"
- " MySomewhatLongFunction(SomethingElse()), RetMe());",
- getGoogleStyle());
+ verifyFormat("ASSIGN_OR_RETURN(MySomewhatLongType *variable,\n"
+ " MySomewhatLongFunction(SomethingElse()), "
+ "ReturnMe());",
+ Style);
verifyFormat(R"(
#define MACRO(a, b) ID(a + b)
diff --git a/clang/unittests/Format/SortImportsTestJava.cpp b/clang/unittests/Format/SortImportsTestJava.cpp
index 26674c75e97b1..4e7111e7e7dff 100644
--- a/clang/unittests/Format/SortImportsTestJava.cpp
+++ b/clang/unittests/Format/SortImportsTestJava.cpp
@@ -349,6 +349,37 @@ TEST_F(SortImportsTestJava, NoReplacementsForValidImportsWindows) {
sortIncludes(FmtStyle, Code, GetCodeRange(Code), "input.java").empty());
}
+TEST_F(SortImportsTestJava, DoNotSortImportsInBlockComment) {
+ constexpr StringRef Code("/* import org.d;\n"
+ "import org.c;\n"
+ "import org.b; */\n"
+ "import org.a;");
+ EXPECT_EQ(Code, sort(Code));
+}
+
+TEST_F(SortImportsTestJava, StopAtClassDeclaration) {
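+ // Import-like lines inside the class body (here in a text block) must be
+ // left untouched.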
+ constexpr StringRef Code("import org.a;\n"
+ "\n"
+ "class Foo {\n"
+ " String code = \"\"\"\n"
+ " import org.c;\n"
+ " import org.b;\n"
+ " \"\"\";\n"
+ "}");
+ EXPECT_EQ(Code, sort(Code));
+}
+
+TEST_F(SortImportsTestJava, SortImportsAfterPackageStatement) {
+ EXPECT_EQ("package org.a;\n"
+ "\n"
+ "import org.a;\n"
+ "import org.b;",
+ sort("package org.a;\n"
+ "\n"
+ "import org.b;\n"
+ "import org.a;"));
+}
+
} // end namespace
} // end namespace format
} // end namespace clang
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index defd78aedfd70..4081b9c9b4994 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -416,6 +416,12 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) {
EXPECT_TOKEN(Tokens[16], tok::star, TT_BinaryOperator);
EXPECT_TOKEN(Tokens[22], tok::star, TT_BinaryOperator);
+ Tokens = annotate("Foo foo{bar, bar * bar};");
+ ASSERT_EQ(Tokens.size(), 11u) << Tokens;
+ EXPECT_TOKEN(Tokens[6], tok::star, TT_BinaryOperator);
+ // Not TT_StartOfName.
+ EXPECT_TOKEN(Tokens[7], tok::identifier, TT_Unknown);
+
Tokens = annotate("NSError *__autoreleasing *foo;",
getLLVMStyle(FormatStyle::LK_ObjC));
ASSERT_EQ(Tokens.size(), 7u) << Tokens;
diff --git a/clang/unittests/ScalableStaticAnalysisFramework/CMakeLists.txt b/clang/unittests/ScalableStaticAnalysisFramework/CMakeLists.txt
index 6a1751182a2a4..871d9e6b0c02c 100644
--- a/clang/unittests/ScalableStaticAnalysisFramework/CMakeLists.txt
+++ b/clang/unittests/ScalableStaticAnalysisFramework/CMakeLists.txt
@@ -8,6 +8,7 @@ add_distinct_clang_unittest(ClangScalableAnalysisTests
EntityLinkerTest.cpp
EntityNameTest.cpp
ErrorBuilderTest.cpp
+ Frontend/TUSummaryExtractorFrontendActionTest.cpp
ModelStringConversionsTest.cpp
Registries/FancyAnalysisData.cpp
Registries/MockSerializationFormat.cpp
@@ -29,6 +30,7 @@ add_distinct_clang_unittest(ClangScalableAnalysisTests
clangBasic
clangFrontend
clangScalableStaticAnalysisFrameworkCore
+ clangScalableStaticAnalysisFrameworkFrontend
clangSerialization
clangTooling
diff --git a/clang/unittests/ScalableStaticAnalysisFramework/Frontend/TUSummaryExtractorFrontendActionTest.cpp b/clang/unittests/ScalableStaticAnalysisFramework/Frontend/TUSummaryExtractorFrontendActionTest.cpp
new file mode 100644
index 0000000000000..d684366ed53ce
--- /dev/null
+++ b/clang/unittests/ScalableStaticAnalysisFramework/Frontend/TUSummaryExtractorFrontendActionTest.cpp
@@ -0,0 +1,366 @@
+//===- TUSummaryExtractorFrontendActionTest.cpp ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/ScalableStaticAnalysisFramework/Frontend/TUSummaryExtractorFrontendAction.h"
+#include "clang/AST/ASTConsumer.h"
+#include "clang/AST/ASTContext.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Frontend/FrontendOptions.h"
+#include "clang/Frontend/TextDiagnosticBuffer.h"
+#include "clang/Lex/PreprocessorOptions.h"
+#include "clang/ScalableStaticAnalysisFramework/Core/Serialization/SerializationFormat.h"
+#include "clang/ScalableStaticAnalysisFramework/Core/Serialization/SerializationFormatRegistry.h"
+#include "clang/ScalableStaticAnalysisFramework/Core/TUSummary/ExtractorRegistry.h"
+#include "clang/ScalableStaticAnalysisFramework/Core/TUSummary/TUSummaryExtractor.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace clang;
+using namespace ssaf;
+using ::testing::Contains;
+using ::testing::UnorderedElementsAre;
+
+static auto errorsMsgsOf(const TextDiagnosticBuffer &Diags) {
+ auto Errors = llvm::make_range(Diags.err_begin(), Diags.err_end());
+ return llvm::make_second_range(Errors);
+}
+namespace {
+
+/// A no-op TUSummaryExtractor suitable for use with a real TUSummaryBuilder.
+class NoOpExtractor : public TUSummaryExtractor {
+public:
+ using TUSummaryExtractor::TUSummaryExtractor;
+ void HandleTranslationUnit(ASTContext &Ctx) override {}
+};
+} // namespace
+
+// NOLINTNEXTLINE(misc-use-internal-linkage)
+volatile int SSAFNoOpExtractorAnchorSource = 0;
+static TUSummaryExtractorRegistry::Add<NoOpExtractor>
+ RegisterNoOp("NoOpExtractor", "No-op extractor for frontend action tests");
+
+namespace {
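+/// A SerializationFormat whose every read and write operation returns an error.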
+class FailingSerializationFormat final : public SerializationFormat {
+public:
+ static llvm::Error failing(llvm::StringRef Component) {
+ return llvm::createStringError(
+ "error from always failing serialization format: " + Component);
+ }
+
+ llvm::Expected<TUSummary> readTUSummary(llvm::StringRef Path) override {
+ return failing("readTUSummary");
+ }
+
+ llvm::Error writeTUSummary(const TUSummary &Summary,
+ llvm::StringRef Path) override {
+ return failing("writeTUSummary");
+ }
+
+ llvm::Expected<TUSummaryEncoding>
+ readTUSummaryEncoding(llvm::StringRef Path) override {
+ return failing("readTUSummaryEncoding");
+ }
+
+ llvm::Error writeTUSummaryEncoding(const TUSummaryEncoding &SummaryEncoding,
+ llvm::StringRef Path) override {
+ return failing("writeTUSummaryEncoding");
+ }
+
+ llvm::Expected<LUSummary> readLUSummary(llvm::StringRef Path) override {
+ return failing("readLUSummary");
+ }
+
+ llvm::Error writeLUSummary(const LUSummary &Summary,
+ llvm::StringRef Path) override {
+ return failing("writeLUSummary");
+ }
+
+ llvm::Expected<LUSummaryEncoding>
+ readLUSummaryEncoding(llvm::StringRef Path) override {
+ return failing("readLUSummaryEncoding");
+ }
+
+ llvm::Error writeLUSummaryEncoding(const LUSummaryEncoding &SummaryEncoding,
+ llvm::StringRef Path) override {
+ return failing("writeLUSummaryEncoding");
+ }
+
+ void forEachRegisteredAnalysis(
+ llvm::function_ref<void(llvm::StringRef Name, llvm::StringRef Desc)>
+ Callback) const override {}
+};
+} // namespace
+
+// NOLINTNEXTLINE(misc-use-internal-linkage)
+volatile int SSAFFailingSerializationFormatAnchorSource = 0;
+static SerializationFormatRegistry::Add<FailingSerializationFormat>
+ RegisterFormat(
+ "FailingSerializationFormat",
+ "A serialization format that fails on every possible operation.");
+
+using EventLog = std::vector<std::string>;
+
+namespace {
+
+/// An ASTConsumer that logs callback invocations into a shared log.
+class RecordingASTConsumer : public ASTConsumer {
+public:
+ RecordingASTConsumer(EventLog &Log, std::string Tag)
+ : Log(Log), Tag(std::move(Tag)) {}
+
+ void Initialize(ASTContext &Ctx) override {
+ Log.push_back(Tag + "::Initialize");
+ }
+ bool HandleTopLevelDecl(DeclGroupRef D) override {
+ Log.push_back(Tag + "::HandleTopLevelDecl");
+ return true;
+ }
+ void HandleTranslationUnit(ASTContext &Ctx) override {
+ Log.push_back(Tag + "::HandleTranslationUnit");
+ }
+
+private:
+ EventLog &Log;
+ std::string Tag;
+};
+
+/// A FrontendAction that returns a RecordingASTConsumer with the tag "Wrapped".
+class RecordingAction : public ASTFrontendAction {
+public:
+ EventLog &getLog() { return Log; }
+ std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &,
+ StringRef) override {
+ return std::make_unique<RecordingASTConsumer>(Log, /*Tag=*/"Wrapped");
+ }
+
+private:
+ EventLog Log;
+};
+
+class FailingAction : public ASTFrontendAction {
+public:
+ std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &,
+ StringRef) override {
+ return nullptr;
+ }
+};
+
+/// Creates a CompilerInstance configured with an in-memory "test.cc" file
+/// containing "int x = 42;".
+static std::unique_ptr<CompilerInstance>
+makeCompiler(TextDiagnosticBuffer &DiagBuf) {
+ auto Invocation = std::make_shared<CompilerInvocation>();
+ Invocation->getPreprocessorOpts().addRemappedFile(
+ "test.cc", llvm::MemoryBuffer::getMemBuffer("int x = 42;").release());
+ Invocation->getFrontendOpts().Inputs.push_back(
+ FrontendInputFile("test.cc", Language::CXX));
+ Invocation->getFrontendOpts().ProgramAction = frontend::ParseSyntaxOnly;
+ Invocation->getTargetOpts().Triple = "i386-unknown-linux-gnu";
+ auto Compiler = std::make_unique<CompilerInstance>(std::move(Invocation));
+ Compiler->setVirtualFileSystem(llvm::vfs::getRealFileSystem());
+ Compiler->createDiagnostics(&DiagBuf, /*ShouldOwnClient=*/false);
+ return Compiler;
+}
+
+struct TUSummaryExtractorFrontendActionTest : testing::Test {
+ using PathString = llvm::SmallString<128>;
+ PathString TestDir;
+ TextDiagnosticBuffer DiagBuf;
+ std::unique_ptr<CompilerInstance> Compiler = makeCompiler(DiagBuf);
+
+ void SetUp() override {
+ std::error_code EC = llvm::sys::fs::createUniqueDirectory(
+ "ssaf-frontend-action-test", TestDir);
+ ASSERT_FALSE(EC) << "Failed to create temp directory: " << EC.message();
+ }
+
+ void TearDown() override { llvm::sys::fs::remove_directories(TestDir); }
+
+ std::string makePath(llvm::StringRef FileOrDirectoryName) const {
+ PathString FullPath = TestDir;
+ llvm::sys::path::append(FullPath, FileOrDirectoryName);
+ return FullPath.str().str();
+ }
+};
+
+TEST_F(TUSummaryExtractorFrontendActionTest,
+ WrappedActionFailsToCreateConsumer) {
+ // Configure valid SSAF options so the failure is purely from the wrapped
+ // action, not from runner creation.
+ std::string Output = makePath("output.MockSerializationFormat");
+ Compiler->getFrontendOpts().SSAFTUSummaryFile = Output;
+ Compiler->getFrontendOpts().SSAFExtractSummaries = {"NoOpExtractor"};
+
+ TUSummaryExtractorFrontendAction ExtractorAction(
+ std::make_unique<FailingAction>());
+ Compiler->ExecuteAction(ExtractorAction);
+
+ // If the wrapped action fails, the ExtractorAction should not produce output.
+ EXPECT_FALSE(llvm::sys::fs::exists(Output));
+}
+
+TEST_F(TUSummaryExtractorFrontendActionTest,
+ RunnerFailsWithInvalidFormat_WrappedConsumerStillRuns) {
+ // Use an unregistered format extension so TUSummaryRunner::create fails.
+ std::string Output = makePath("output.xyz");
+ Compiler->getFrontendOpts().SSAFTUSummaryFile = Output;
+ Compiler->getFrontendOpts().SSAFExtractSummaries = {"NoOpExtractor"};
+
+ auto Wrapped = std::make_unique<RecordingAction>();
+ const EventLog &Log = Wrapped->getLog();
+ TUSummaryExtractorFrontendAction ExtractorAction(std::move(Wrapped));
+
+ // The runner fails, so ExecuteAction should return false due to the fatal
+ // diagnostic.
+ EXPECT_FALSE(Compiler->ExecuteAction(ExtractorAction));
+
+ // The wrapped consumer should still have run.
+ EXPECT_THAT(Log, Contains("Wrapped::Initialize"));
+ EXPECT_THAT(Log, Contains("Wrapped::HandleTranslationUnit"));
+
+ // Exactly one error about the unknown format.
+ EXPECT_THAT(errorsMsgsOf(DiagBuf),
+ UnorderedElementsAre(
+ "unknown output summary file format 'xyz' specified by "
+ "'--ssaf-tu-summary-file=" +
+ Output + "'"));
+
+ // No output should have been created due to the failure.
+ EXPECT_FALSE(llvm::sys::fs::exists(Output));
+}
+
+TEST_F(TUSummaryExtractorFrontendActionTest,
+ RunnerFailsWithUnknownExtractor_WrappedConsumerStillRuns) {
+ std::string Output = makePath("output.MockSerializationFormat");
+ Compiler->getFrontendOpts().SSAFTUSummaryFile = Output;
+ Compiler->getFrontendOpts().SSAFExtractSummaries = {"NonExistentExtractor"};
+
+ auto Wrapped = std::make_unique<RecordingAction>();
+ const EventLog &Log = Wrapped->getLog();
+ TUSummaryExtractorFrontendAction ExtractorAction(std::move(Wrapped));
+ EXPECT_FALSE(Compiler->ExecuteAction(ExtractorAction));
+
+ // The wrapped consumer should still have run.
+ EXPECT_THAT(Log, Contains("Wrapped::Initialize"));
+ EXPECT_THAT(Log, Contains("Wrapped::HandleTranslationUnit"));
+
+ // Exactly one error about the unknown extractor.
+ EXPECT_THAT(errorsMsgsOf(DiagBuf),
+ UnorderedElementsAre("no summary extractor was registered with "
+ "name: NonExistentExtractor"));
+
+ // No output should have been created due to the failure.
+ EXPECT_FALSE(llvm::sys::fs::exists(Output));
+}
+
+TEST_F(TUSummaryExtractorFrontendActionTest,
+ RunnerSucceeds_ASTConsumerCallbacksPropagate) {
+ std::string Output = makePath("output.MockSerializationFormat");
+ Compiler->getFrontendOpts().SSAFTUSummaryFile = Output;
+ Compiler->getFrontendOpts().SSAFExtractSummaries = {"NoOpExtractor"};
+
+ auto Wrapped = std::make_unique<RecordingAction>();
+ const EventLog &Log = Wrapped->getLog();
+ TUSummaryExtractorFrontendAction ExtractorAction(std::move(Wrapped));
+ EXPECT_TRUE(Compiler->ExecuteAction(ExtractorAction));
+
+ // All wrapped ASTConsumer callbacks should have fired, not just
+ // HandleTranslationUnit.
+ EXPECT_THAT(Log, Contains("Wrapped::Initialize"));
+ EXPECT_THAT(Log, Contains("Wrapped::HandleTopLevelDecl"));
+ EXPECT_THAT(Log, Contains("Wrapped::HandleTranslationUnit"));
+ EXPECT_EQ(DiagBuf.getNumErrors(), 0U);
+
+ // The runner should have written output.
+ EXPECT_TRUE(llvm::sys::fs::exists(Output));
+}
+
+// Use a custom action that checks whether the output path exists during
+// HandleTranslationUnit — it should not, because the wrapped consumer runs
+// before the runner.
+struct OrderCheckingAction : public ASTFrontendAction {
+ EventLog Log;
+ std::string OutputPath;
+
+ std::unique_ptr<ASTConsumer> CreateASTConsumer(CompilerInstance &CI,
+ StringRef InFile) override {
+ struct Consumer : public ASTConsumer {
+ Consumer(EventLog &Log, std::string OutputPath)
+ : Log(Log), OutputPath(std::move(OutputPath)) {}
+ void Initialize(ASTContext &) override {
+ Log.push_back("Wrapped::Initialize");
+ }
+ bool HandleTopLevelDecl(DeclGroupRef) override {
+ Log.push_back("Wrapped::HandleTopLevelDecl");
+ return true;
+ }
+ void HandleTranslationUnit(ASTContext &) override {
+ bool Exists = llvm::sys::fs::exists(OutputPath);
+ Log.push_back(std::string("OutputExistsDuringWrappedHTU=") +
+ (Exists ? "true" : "false"));
+ Log.push_back("Wrapped::HandleTranslationUnit");
+ }
+
+ EventLog &Log;
+ std::string OutputPath;
+ };
+ return std::make_unique<Consumer>(Log, OutputPath);
+ }
+};
+TEST_F(TUSummaryExtractorFrontendActionTest,
+ RunnerSucceeds_WrappedRunsBeforeRunner) {
+ std::string Output = makePath("output.MockSerializationFormat");
+ Compiler->getFrontendOpts().SSAFTUSummaryFile = Output;
+ Compiler->getFrontendOpts().SSAFExtractSummaries = {"NoOpExtractor"};
+
+ auto Wrapped = std::make_unique<OrderCheckingAction>();
+ Wrapped->OutputPath = Output;
+ const EventLog &Log = Wrapped->Log;
+ TUSummaryExtractorFrontendAction Action(std::move(Wrapped));
+
+ EXPECT_TRUE(Compiler->ExecuteAction(Action));
+ EXPECT_EQ(DiagBuf.getNumErrors(), 0U);
+
+ // The output should NOT have existed when the wrapped consumer's
+ // HandleTranslationUnit ran (wrapped is at index 0, runner at index 1).
+ EXPECT_THAT(Log, Contains("OutputExistsDuringWrappedHTU=false"));
+
+ // After ExecuteAction, the output should exist.
+ EXPECT_TRUE(llvm::sys::fs::exists(Output));
+}
+
+TEST_F(TUSummaryExtractorFrontendActionTest, RunnerFailsToWrite) {
+ std::string Output = makePath("output.FailingSerializationFormat");
+ Compiler->getFrontendOpts().SSAFTUSummaryFile = Output;
+ Compiler->getFrontendOpts().SSAFExtractSummaries = {"NoOpExtractor"};
+
+ TUSummaryExtractorFrontendAction Action(std::make_unique<RecordingAction>());
+
+ // This should fail because the summary writing fails and emits an error
+ // diagnostic.
+ EXPECT_FALSE(Compiler->ExecuteAction(Action));
+ EXPECT_THAT(
+ errorsMsgsOf(DiagBuf),
+ UnorderedElementsAre(
+ "failed to write TU summary to '" + Output +
+ "': error from always failing serialization format: writeTUSummary"));
+
+ // No output should have been created due to the failure.
+ EXPECT_FALSE(llvm::sys::fs::exists(Output));
+}
+
+} // namespace
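
For reference, a minimal standalone sketch (hypothetical names, not the SSAF
implementation) of the ordering property these tests assert: the wrapped
consumer sits at index 0 and the summary runner at index 1, so the wrapped
HandleTranslationUnit observes the output as not yet written.

  #include <functional>
  #include <iostream>
  #include <string>
  #include <vector>

  int main() {
    std::vector<std::string> log;
    bool outputWritten = false;

    // Stand-ins for the wrapped ASTConsumer and the TU summary runner.
    std::function<void()> wrappedHTU = [&] {
      log.push_back(std::string("OutputExistsDuringWrappedHTU=") +
                    (outputWritten ? "true" : "false"));
    };
    std::function<void()> runnerHTU = [&] { outputWritten = true; };

    // Wrapped first, runner second, mirroring the order the test expects.
    for (const auto &consumer : {wrappedHTU, runnerHTU})
      consumer();

    for (const auto &entry : log)
      std::cout << entry << "\n"; // prints OutputExistsDuringWrappedHTU=false
  }
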
diff --git a/clang/unittests/ScalableStaticAnalysisFramework/Registries/FancyAnalysisData.cpp b/clang/unittests/ScalableStaticAnalysisFramework/Registries/FancyAnalysisData.cpp
index 084835190f7bd..313c53518dfe8 100644
--- a/clang/unittests/ScalableStaticAnalysisFramework/Registries/FancyAnalysisData.cpp
+++ b/clang/unittests/ScalableStaticAnalysisFramework/Registries/FancyAnalysisData.cpp
@@ -54,6 +54,8 @@ struct FancyAnalysisFormatInfo final : FormatInfo {
};
} // namespace
+// NOLINTNEXTLINE(misc-use-internal-linkage)
+volatile int SSAFFancyAnalysisDataAnchorSource = 0;
static llvm::Registry<FormatInfo>::Add<FancyAnalysisFormatInfo>
RegisterFormatInfo("FancyAnalysisData",
"Format info for FancyAnalysisData for the "
diff --git a/clang/unittests/ScalableStaticAnalysisFramework/Registries/MockSerializationFormat.cpp b/clang/unittests/ScalableStaticAnalysisFramework/Registries/MockSerializationFormat.cpp
index e7a3e90e0bb31..535b5fced0da6 100644
--- a/clang/unittests/ScalableStaticAnalysisFramework/Registries/MockSerializationFormat.cpp
+++ b/clang/unittests/ScalableStaticAnalysisFramework/Registries/MockSerializationFormat.cpp
@@ -160,6 +160,8 @@ llvm::Error MockSerializationFormat::writeTUSummary(const TUSummary &Summary,
return llvm::Error::success();
}
+// NOLINTNEXTLINE(misc-use-internal-linkage)
+volatile int SSAFMockSerializationFormatAnchorSource = 0;
static SerializationFormatRegistry::Add<MockSerializationFormat>
RegisterFormat("MockSerializationFormat",
"A serialization format for testing");
diff --git a/clang/unittests/ScalableStaticAnalysisFramework/Registries/MockSummaryExtractor1.cpp b/clang/unittests/ScalableStaticAnalysisFramework/Registries/MockSummaryExtractor1.cpp
index 7f4e9a91febbb..1bce78c8b1030 100644
--- a/clang/unittests/ScalableStaticAnalysisFramework/Registries/MockSummaryExtractor1.cpp
+++ b/clang/unittests/ScalableStaticAnalysisFramework/Registries/MockSummaryExtractor1.cpp
@@ -38,7 +38,9 @@ class MockSummaryExtractor1 : public TUSummaryExtractor {
}
};
+} // namespace
+
+// NOLINTNEXTLINE(misc-use-internal-linkage)
+volatile int SSAFMockSummaryExtractor1AnchorSource = 0;
static TUSummaryExtractorRegistry::Add<MockSummaryExtractor1>
RegisterExtractor("MockSummaryExtractor1", "Mock summary extractor 1");
-
-} // namespace
diff --git a/clang/unittests/ScalableStaticAnalysisFramework/Registries/MockSummaryExtractor2.cpp b/clang/unittests/ScalableStaticAnalysisFramework/Registries/MockSummaryExtractor2.cpp
index 640228e2b7e2c..242f427f5e346 100644
--- a/clang/unittests/ScalableStaticAnalysisFramework/Registries/MockSummaryExtractor2.cpp
+++ b/clang/unittests/ScalableStaticAnalysisFramework/Registries/MockSummaryExtractor2.cpp
@@ -38,7 +38,9 @@ class MockSummaryExtractor2 : public TUSummaryExtractor {
}
};
+} // namespace
+
+// NOLINTNEXTLINE(misc-use-internal-linkage)
+volatile int SSAFMockSummaryExtractor2AnchorSource = 0;
static TUSummaryExtractorRegistry::Add<MockSummaryExtractor2>
RegisterExtractor("MockSummaryExtractor2", "Mock summary extractor 2");
-
-} // namespace
diff --git a/clang/unittests/ScalableStaticAnalysisFramework/Registries/SummaryExtractorRegistryTest.cpp b/clang/unittests/ScalableStaticAnalysisFramework/Registries/SummaryExtractorRegistryTest.cpp
index 5211319063b60..2018beebd53da 100644
--- a/clang/unittests/ScalableStaticAnalysisFramework/Registries/SummaryExtractorRegistryTest.cpp
+++ b/clang/unittests/ScalableStaticAnalysisFramework/Registries/SummaryExtractorRegistryTest.cpp
@@ -41,6 +41,7 @@ TEST(SummaryExtractorRegistryTest, EnumeratingRegistryEntries) {
EXPECT_EQ(ActualNames, (std::set<llvm::StringRef>{
"MockSummaryExtractor1",
"MockSummaryExtractor2",
+ "NoOpExtractor",
}));
}
diff --git a/clang/unittests/ScalableStaticAnalysisFramework/SSAFBuiltinTestForceLinker.h b/clang/unittests/ScalableStaticAnalysisFramework/SSAFBuiltinTestForceLinker.h
new file mode 100644
index 0000000000000..05d96af80cb27
--- /dev/null
+++ b/clang/unittests/ScalableStaticAnalysisFramework/SSAFBuiltinTestForceLinker.h
@@ -0,0 +1,51 @@
+//===- SSAFBuiltinTestForceLinker.h -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file pulls in all test-only SSAF mock extractor and format
+/// registrations by referencing their anchor symbols.
+///
+/// Include this header (with IWYU pragma: keep) in a translation unit that
+/// is compiled into the SSAF unittest binary.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_UNITTESTS_SCALABLESTATICANALYSISFRAMEWORK_SSAFBUILTINTESTFORCELINKER_H
+#define LLVM_CLANG_UNITTESTS_SCALABLESTATICANALYSISFRAMEWORK_SSAFBUILTINTESTFORCELINKER_H
+
+// Force the linker to link NoOpExtractor registration.
+extern volatile int SSAFNoOpExtractorAnchorSource;
+[[maybe_unused]] static int SSAFNoOpExtractorAnchorDestination =
+ SSAFNoOpExtractorAnchorSource;
+
+// Force the linker to link MockSummaryExtractor1 registration.
+extern volatile int SSAFMockSummaryExtractor1AnchorSource;
+[[maybe_unused]] static int SSAFMockSummaryExtractor1AnchorDestination =
+ SSAFMockSummaryExtractor1AnchorSource;
+
+// Force the linker to link MockSummaryExtractor2 registration.
+extern volatile int SSAFMockSummaryExtractor2AnchorSource;
+[[maybe_unused]] static int SSAFMockSummaryExtractor2AnchorDestination =
+ SSAFMockSummaryExtractor2AnchorSource;
+
+// Force the linker to link FailingSerializationFormat registration.
+extern volatile int SSAFFailingSerializationFormatAnchorSource;
+[[maybe_unused]] static int SSAFFailingSerializationFormatAnchorDestination =
+ SSAFFailingSerializationFormatAnchorSource;
+
+// Force the linker to link MockSerializationFormat registration.
+extern volatile int SSAFMockSerializationFormatAnchorSource;
+[[maybe_unused]] static int SSAFMockSerializationFormatAnchorDestination =
+ SSAFMockSerializationFormatAnchorSource;
+
+// Force the linker to link FancyAnalysisData format info registration.
+extern volatile int SSAFFancyAnalysisDataAnchorSource;
+[[maybe_unused]] static int SSAFFancyAnalysisDataAnchorDestination =
+ SSAFFancyAnalysisDataAnchorSource;
+
+#endif // LLVM_CLANG_UNITTESTS_SCALABLESTATICANALYSISFRAMEWORK_SSAFBUILTINTESTFORCELINKER_H
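
A minimal sketch of the anchor idiom this header relies on, with hypothetical
names. In the TU that holds a static registry entry, an anchor with external
linkage is defined next to the registration; any TU linked into the test
binary then references the anchor, which keeps the whole object file (and
thus the registration) from being dropped when the library is a static
archive.

  // MyMockThing.cpp (hypothetical): registration plus exported anchor.
  //   volatile int MyMockThingAnchorSource = 0;
  //   static SomeRegistry::Add<MyMockThing> RegisterIt("MyMockThing", "...");

  // Force-linker header (hypothetical): reference the anchor from every
  // including TU so the linker must pull in MyMockThing.cpp's object file.
  extern volatile int MyMockThingAnchorSource;
  [[maybe_unused]] static int MyMockThingAnchorDestination =
      MyMockThingAnchorSource;
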
diff --git a/clang/unittests/ScalableStaticAnalysisFramework/SSAFTestForceLinker.h b/clang/unittests/ScalableStaticAnalysisFramework/SSAFTestForceLinker.h
new file mode 100644
index 0000000000000..dd2077569a4eb
--- /dev/null
+++ b/clang/unittests/ScalableStaticAnalysisFramework/SSAFTestForceLinker.h
@@ -0,0 +1,23 @@
+//===- SSAFTestForceLinker.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file pulls in all test-only SSAF mock extractor and format
+/// registrations by including SSAFBuiltinTestForceLinker.h, which references
+/// their anchor symbols.
+///
+/// Include this header (with IWYU pragma: keep) in a translation unit that
+/// is compiled into the SSAF unittest binary.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_UNITTESTS_SCALABLESTATICANALYSISFRAMEWORK_SSAFTESTFORCELINKER_H
+#define LLVM_CLANG_UNITTESTS_SCALABLESTATICANALYSISFRAMEWORK_SSAFTESTFORCELINKER_H
+
+#include "SSAFBuiltinTestForceLinker.h" // IWYU pragma: keep
+
+#endif // LLVM_CLANG_UNITTESTS_SCALABLESTATICANALYSISFRAMEWORK_SSAFTESTFORCELINKER_H
diff --git a/clang/unittests/ScalableStaticAnalysisFramework/TestFixture.cpp b/clang/unittests/ScalableStaticAnalysisFramework/TestFixture.cpp
index c1c41997abcf2..772eaf069a350 100644
--- a/clang/unittests/ScalableStaticAnalysisFramework/TestFixture.cpp
+++ b/clang/unittests/ScalableStaticAnalysisFramework/TestFixture.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "TestFixture.h"
+#include "SSAFBuiltinTestForceLinker.h" // IWYU pragma: keep
#include "clang/ScalableStaticAnalysisFramework/Core/Model/BuildNamespace.h"
#include "clang/ScalableStaticAnalysisFramework/Core/Model/EntityId.h"
#include "clang/ScalableStaticAnalysisFramework/Core/Model/EntityLinkage.h"
diff --git a/clang/www/c_status.html b/clang/www/c_status.html
index 6c920773aba61..aca71728e676c 100644
--- a/clang/www/c_status.html
+++ b/clang/www/c_status.html
@@ -331,6 +331,11 @@ <h2 id="c2y">C2y implementation status</h2>
<td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3457.htm">N3457</a></td>
<td class="unreleased" align="center">Clang 22</td>
</tr>
+ <tr>
+ <td>Array subscripting without decay</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3517.htm">N3517</a></td>
+ <td class="none" align="center">No</td>
+ </tr>
<tr>
<td>Chasing Ghosts I: constant expressions v2</td>
<td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3558.htm">N3558</a></td>
@@ -361,6 +366,58 @@ <h2 id="c2y">C2y implementation status</h2>
<td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3532.pdf">N3532</a></td>
<td class="full" align="center">Yes</td>
</tr>
+ <tr>
+ <td>Composite types v1.3</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3652.pdf">N3652</a></td>
+ <td class="none" align="center">No</td>
+ </tr>
+ <!-- Virtual Feb 2026 Papers -->
+ <tr>
+ <td>Retire the concept of consume operations</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3607.htm">N3607</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr>
+ <td>Allow calling static inline within extern inline</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3622.txt">N3622</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr>
+ <td>Static assertions in expressions</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3715.pdf">N3715</a></td>
+ <td class="none" align="center">No</td>
+ </tr>
+ <tr>
+ <td>Integer Sets</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3747.pdf">N3747</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr>
+ <td>Remove the imaginary I</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3786.htm">N3786</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr>
+ <td>Earthly Demon: Accessing a Member of an Atomic Structure or Union</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3653.pdf">N3653</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <!-- Virtual Mar 2026 Papers -->
+ <tr>
+ <td>Integer Constant Expression-Initialized const Integer Declarations are Implicitly constexpr</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3693.htm">N3693</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
+ <tr>
+ <td>bit-precise enum</td>
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3705.htm">N3705</a></td>
+ <td class="none" align="center">No</td>
+ </tr>
+ <tr>
+      <td>Generic replacement</td> <!-- both changes from the paper were adopted -->
+ <td><a href="https://www.open-std.org/jtc1/sc22/wg14/www/docs/n3722.pdf">N3722</a></td>
+ <td class="unknown" align="center">Unknown</td>
+ </tr>
</table>
</details>
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h
index c719e2a8ef600..49b5602d227a5 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h
@@ -36,13 +36,14 @@
#endif
# define SANITIZER_WEAK_ATTRIBUTE
# define SANITIZER_WEAK_IMPORT
-#elif SANITIZER_GO
-# define SANITIZER_INTERFACE_ATTRIBUTE
-# define SANITIZER_WEAK_ATTRIBUTE
-# define SANITIZER_WEAK_IMPORT
#else
-# define SANITIZER_INTERFACE_ATTRIBUTE __attribute__((visibility("default")))
-# define SANITIZER_WEAK_ATTRIBUTE __attribute__((weak))
+# if SANITIZER_GO
+# define SANITIZER_INTERFACE_ATTRIBUTE
+# define SANITIZER_WEAK_ATTRIBUTE
+# else
+# define SANITIZER_INTERFACE_ATTRIBUTE __attribute__((visibility("default")))
+# define SANITIZER_WEAK_ATTRIBUTE __attribute__((weak))
+# endif // SANITIZER_GO
# if SANITIZER_APPLE
# define SANITIZER_WEAK_IMPORT extern "C" __attribute((weak_import))
# else
diff --git a/flang/include/flang/Lower/CUDA.h b/flang/include/flang/Lower/CUDA.h
index 83523c21380d6..865a7c6a6fa78 100644
--- a/flang/include/flang/Lower/CUDA.h
+++ b/flang/include/flang/Lower/CUDA.h
@@ -62,14 +62,6 @@ cuf::DataAttributeAttr
translateSymbolCUFDataAttribute(mlir::MLIRContext *mlirContext,
const Fortran::semantics::Symbol &sym);
-/// Create a cuf.alloc operation with extents and length parameters elided
-/// when they are already encoded in the static type.
-mlir::Value genCUFAlloc(fir::FirOpBuilder &builder, mlir::Location loc,
- mlir::Type type, llvm::StringRef uniqName,
- llvm::StringRef bindcName,
- cuf::DataAttributeAttr dataAttr,
- mlir::ValueRange lenParams, mlir::ValueRange extents);
-
/// Check if the rhs has an implicit conversion. Return the elemental op if
/// there is a conversion. Return null otherwise.
std::pair<hlfir::ElementalOp, hlfir::ElementalOp>
diff --git a/flang/include/flang/Semantics/openmp-directive-sets.h b/flang/include/flang/Semantics/openmp-directive-sets.h
index 609a7be700c28..5e9979d032028 100644
--- a/flang/include/flang/Semantics/openmp-directive-sets.h
+++ b/flang/include/flang/Semantics/openmp-directive-sets.h
@@ -278,12 +278,14 @@ static const OmpDirectiveSet loopConstructSet{
Directive::OMPD_fuse,
Directive::OMPD_tile,
Directive::OMPD_unroll,
+ Directive::OMPD_interchange,
};
static const OmpDirectiveSet loopTransformationSet{
Directive::OMPD_tile,
Directive::OMPD_unroll,
Directive::OMPD_fuse,
+ Directive::OMPD_interchange,
};
static const OmpDirectiveSet nonPartialVarSet{
diff --git a/flang/include/flang/Semantics/openmp-utils.h b/flang/include/flang/Semantics/openmp-utils.h
index fa0528bea5114..cd599237e4182 100644
--- a/flang/include/flang/Semantics/openmp-utils.h
+++ b/flang/include/flang/Semantics/openmp-utils.h
@@ -16,6 +16,7 @@
#include "flang/Common/indirection.h"
#include "flang/Evaluate/type.h"
#include "flang/Parser/char-block.h"
+#include "flang/Parser/message.h"
#include "flang/Parser/openmp-utils.h"
#include "flang/Parser/parse-tree.h"
#include "flang/Parser/tools.h"
@@ -25,8 +26,10 @@
#include <optional>
#include <string>
+#include <tuple>
#include <type_traits>
#include <utility>
+#include <vector>
namespace Fortran::semantics {
class Scope;
@@ -109,29 +112,79 @@ bool IsPointerAssignment(const evaluate::Assignment &x);
MaybeExpr MakeEvaluateExpr(const parser::OmpStylizedInstance &inp);
+/// A representation of a "because" message.
+struct Reason {
+ parser::Messages msgs;
+
+ template <typename... Ts> Reason &Say(Ts &&...args) {
+ msgs.Say(std::forward<Ts>(args)...);
+ return *this;
+ };
+ operator bool() const { return !msgs.empty(); }
+ parser::Message &AttachTo(parser::Message &msg);
+};
+
+std::pair<std::optional<int64_t>, Reason> GetArgumentValueWithReason(
+ const parser::OmpDirectiveSpecification &spec, llvm::omp::Clause clauseId,
+ unsigned version);
+std::pair<std::optional<int64_t>, Reason> GetNumArgumentsWithReason(
+ const parser::OmpDirectiveSpecification &spec, llvm::omp::Clause clauseId,
+ unsigned version);
+
bool IsLoopTransforming(llvm::omp::Directive dir);
bool IsFullUnroll(const parser::OpenMPLoopConstruct &x);
+std::optional<int64_t> GetNumGeneratedNestsFrom(
+ const parser::ExecutionPartConstruct &epc,
+ std::optional<int64_t> nestedCount);
+
+// Return the depth of the affected nests:
+// {affected-depth, must-be-perfect-nest, reason}.
+std::tuple<std::optional<int64_t>, bool, Reason> GetAffectedNestDepthWithReason(
+ const parser::OpenMPLoopConstruct &x, unsigned version);
+// Return the range of the affected nests in the sequence:
+// {first, count, reason}.
+// If the range is "the whole sequence", the return value will be {1, -1, ...}.
+std::tuple<std::optional<int64_t>, std::optional<int64_t>, Reason>
+GetAffectedLoopRangeWithReason(
+ const parser::OpenMPLoopConstruct &x, unsigned version);
+
+// Compute the required loop count from the range. If count == -1, return -1,
+// indicating all loops in the sequence.
+std::optional<int64_t> GetRequiredCount(
+ std::optional<int64_t> first, std::optional<int64_t> count);
+
struct LoopSequence {
- LoopSequence(
- const parser::ExecutionPartConstruct &root, bool allowAllLoops = false);
+ LoopSequence(const parser::ExecutionPartConstruct &root, unsigned version,
+ bool allowAllLoops = false);
template <typename R, typename = std::enable_if_t<is_range_v<R>>>
- LoopSequence(const R &range, bool allowAllLoops = false)
- : allowAllLoops_(allowAllLoops) {
+ LoopSequence(const R &range, unsigned version, bool allowAllLoops = false)
+ : version_(version), allowAllLoops_(allowAllLoops) {
entry_ = std::make_unique<Construct>(range, nullptr);
createChildrenFromRange(entry_->location);
- length_ = calculateLength();
+ precalculate();
}
+ struct Depth {
+ // If this sequence is a nest, the depth of the Canonical Loop Nest rooted
+ // at this sequence. Otherwise unspecified.
+ std::optional<int64_t> semantic;
+ // If this sequence is a nest, the depth of the perfect Canonical Loop Nest
+ // rooted at this sequence. Otherwise unspecified.
+ std::optional<int64_t> perfect;
+ };
+
bool isNest() const { return length_ && *length_ == 1; }
std::optional<int64_t> length() const { return length_; }
+ Depth depth() const { return depth_; }
const std::vector<LoopSequence> &children() const { return children_; }
private:
using Construct = ExecutionPartIterator::Construct;
- LoopSequence(std::unique_ptr<Construct> entry, bool allowAllLoops);
+ LoopSequence(
+ std::unique_ptr<Construct> entry, unsigned version, bool allowAllLoops);
template <typename R, typename = std::enable_if_t<is_range_v<R>>>
void createChildrenFromRange(const R &range) {
@@ -145,15 +198,31 @@ struct LoopSequence {
ExecutionPartIterator::IteratorType begin,
ExecutionPartIterator::IteratorType end);
- std::optional<int64_t> calculateLength() const;
- std::optional<int64_t> sumOfChildrenLengths() const;
+ /// Precalculate length and depth.
+ void precalculate();
- // Precalculated length of the sequence. Note that this is different from
- // the number of children because a child may result in a sequence, for
- // example a fuse with a reduced loop range. The length of that sequence
- // adds to the length of the owning LoopSequence.
+ std::optional<int64_t> calculateLength() const;
+ std::optional<int64_t> getNestedLength() const;
+ Depth calculateDepths() const;
+ Depth getNestedDepths() const;
+
+  /// True if the sequence contains any code (besides transformable loops)
+  /// that is not valid intervening code.
+  bool hasInvalidIC_{false};
+  /// True if the sequence contains any code (besides transformable loops)
+  /// that is not transparent intervening code.
+  bool hasOpaqueIC_{false};
+
+ /// Precalculated length of the sequence. Note that this is different from
+ /// the number of children because a child may result in a sequence, for
+ /// example a fuse with a reduced loop range. The length of that sequence
+ /// adds to the length of the owning LoopSequence.
std::optional<int64_t> length_;
+ /// Precalculated depths. Only meaningful if the sequence is a nest.
+ Depth depth_;
+
// The core structure of the class:
+ unsigned version_; // Needed for GetXyzWithReason
bool allowAllLoops_;
std::unique_ptr<Construct> entry_;
std::vector<LoopSequence> children_;
diff --git a/flang/include/flang/Support/Fortran-features.h b/flang/include/flang/Support/Fortran-features.h
index cbcb3592f04c3..aa2c4cdc6d10b 100644
--- a/flang/include/flang/Support/Fortran-features.h
+++ b/flang/include/flang/Support/Fortran-features.h
@@ -57,7 +57,8 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines,
ForwardRefExplicitTypeDummy, InaccessibleDeferredOverride,
CudaWarpMatchFunction, DoConcurrentOffload, TransferBOZ, Coarray,
PointerPassObject, MultipleIdenticalDATA,
- DefaultStructConstructorNullPointer, AssumedRankIoItem)
+ DefaultStructConstructorNullPointer, AssumedRankIoItem,
+ MultipleProgramUnitsOnSameLine)
// Portability and suspicious usage warnings
ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
diff --git a/flang/lib/Lower/CUDA.cpp b/flang/lib/Lower/CUDA.cpp
index 8fd54bc6cfe2d..d8e2d829f9adf 100644
--- a/flang/lib/Lower/CUDA.cpp
+++ b/flang/lib/Lower/CUDA.cpp
@@ -68,24 +68,6 @@ cuf::DataAttributeAttr Fortran::lower::translateSymbolCUFDataAttribute(
return cuf::getDataAttribute(mlirContext, cudaAttr);
}
-mlir::Value Fortran::lower::genCUFAlloc(fir::FirOpBuilder &builder,
- mlir::Location loc, mlir::Type type,
- llvm::StringRef uniqName,
- llvm::StringRef bindcName,
- cuf::DataAttributeAttr dataAttr,
- mlir::ValueRange lenParams,
- mlir::ValueRange extents) {
- llvm::SmallVector<mlir::Value> elidedExtents =
- fir::factory::elideExtentsAlreadyInType(type, extents);
- llvm::SmallVector<mlir::Value> elidedLenParams =
- fir::factory::elideLengthsAlreadyInType(type, lenParams);
- auto idxTy = builder.getIndexType();
- for (mlir::Value &ext : elidedExtents)
- ext = builder.createConvert(loc, idxTy, ext);
- return cuf::AllocOp::create(builder, loc, type, uniqName, bindcName, dataAttr,
- elidedLenParams, elidedExtents);
-}
-
std::pair<hlfir::ElementalOp, hlfir::ElementalOp>
Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
auto isCopyElementalOp = [](hlfir::ElementalOp elOp) {
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index ec406c9997de2..0ededb364bfea 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -760,20 +760,21 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter,
cuf::DataAttributeAttr dataAttr =
Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
ultimateSymbol);
- if (dataAttr.getValue() == cuf::DataAttribute::Shared) {
- llvm::SmallVector<mlir::Value> elidedShape =
- fir::factory::elideExtentsAlreadyInType(ty, shape);
- auto idxTy = builder.getIndexType();
- llvm::SmallVector<mlir::Value> indices;
- for (mlir::Value sh : elidedShape)
- indices.push_back(builder.createConvert(loc, idxTy, sh));
+ llvm::SmallVector<mlir::Value> indices;
+ llvm::SmallVector<mlir::Value> elidedShape =
+ fir::factory::elideExtentsAlreadyInType(ty, shape);
+ llvm::SmallVector<mlir::Value> elidedLenParams =
+ fir::factory::elideLengthsAlreadyInType(ty, lenParams);
+ auto idxTy = builder.getIndexType();
+ for (mlir::Value sh : elidedShape)
+ indices.push_back(builder.createConvert(loc, idxTy, sh));
+ if (dataAttr.getValue() == cuf::DataAttribute::Shared)
return cuf::SharedMemoryOp::create(builder, loc, ty, nm, symNm, lenParams,
indices);
- }
if (!cuf::isCUDADeviceContext(builder.getRegion()))
- return Fortran::lower::genCUFAlloc(builder, loc, ty, nm, symNm, dataAttr,
- lenParams, shape);
+ return cuf::AllocOp::create(builder, loc, ty, nm, symNm, dataAttr,
+ lenParams, indices);
}
// Let the builder do all the heavy lifting.
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index e2328b39c180f..ae5f6f50bda09 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2245,6 +2245,16 @@ static void genCanonicalLoopNest(
firOpBuilder.setInsertionPointAfter(loops.front());
}
+static void genInterchangeOp(Fortran::lower::AbstractConverter &converter,
+ Fortran::lower::SymMap &symTable,
+ lower::StatementContext &stmtCtx,
+ Fortran::semantics::SemanticsContext &semaCtx,
+ Fortran::lower::pft::Evaluation &eval,
+ mlir::Location loc, const ConstructQueue &queue,
+ ConstructQueue::const_iterator item) {
+ TODO(converter.getCurrentLocation(), "OpenMP Interchange");
+}
+
static void genTileOp(Fortran::lower::AbstractConverter &converter,
Fortran::lower::SymMap &symTable,
lower::StatementContext &stmtCtx,
@@ -3740,6 +3750,10 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
newOp = genTeamsOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue,
item);
break;
+ case llvm::omp::Directive::OMPD_interchange:
+ genInterchangeOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue,
+ item);
+ break;
case llvm::omp::Directive::OMPD_tile:
genTileOp(converter, symTable, stmtCtx, semaCtx, eval, loc, queue, item);
break;
diff --git a/flang/lib/Lower/Support/PrivateReductionUtils.cpp b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
index d879a0b7e97aa..f63fb6ecfe43f 100644
--- a/flang/lib/Lower/Support/PrivateReductionUtils.cpp
+++ b/flang/lib/Lower/Support/PrivateReductionUtils.cpp
@@ -14,7 +14,6 @@
#include "flang/Lower/AbstractConverter.h"
#include "flang/Lower/Allocatable.h"
-#include "flang/Lower/CUDA.h"
#include "flang/Lower/ConvertVariable.h"
#include "flang/Optimizer/Builder/BoxValue.h"
#include "flang/Optimizer/Builder/Character.h"
@@ -22,16 +21,21 @@
#include "flang/Optimizer/Builder/HLFIRTools.h"
#include "flang/Optimizer/Builder/Runtime/Derived.h"
#include "flang/Optimizer/Builder/Todo.h"
-#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
#include "flang/Optimizer/HLFIR/HLFIROps.h"
#include "flang/Optimizer/Support/FatalError.h"
#include "flang/Semantics/symbol.h"
-#include "flang/Semantics/tools.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/IR/Location.h"
+#include "llvm/Support/CommandLine.h"
+
+static llvm::cl::opt<bool> enableGPUHeapAlloc(
+ "enable-gpu-heap-alloc",
+ llvm::cl::desc(
+ "Allow the use of heap allocation for dynamically sized arrays on GPU"),
+ llvm::cl::init(false));
static bool hasFinalization(const Fortran::semantics::Symbol &sym) {
if (sym.has<Fortran::semantics::ObjectEntityDetails>())
@@ -42,11 +46,11 @@ static bool hasFinalization(const Fortran::semantics::Symbol &sym) {
return false;
}
-static void createCleanupRegion(
- Fortran::lower::AbstractConverter &converter, mlir::Location loc,
- mlir::Type argType, mlir::Region &cleanupRegion,
- const Fortran::semantics::Symbol *sym, bool isDoConcurrent,
- std::optional<cuf::DataAttributeAttr> cudaDataAttr = std::nullopt) {
+static void createCleanupRegion(Fortran::lower::AbstractConverter &converter,
+ mlir::Location loc, mlir::Type argType,
+ mlir::Region &cleanupRegion,
+ const Fortran::semantics::Symbol *sym,
+ bool isDoConcurrent) {
fir::FirOpBuilder &builder = converter.getFirOpBuilder();
assert(cleanupRegion.empty());
mlir::Block *block = builder.createBlock(&cleanupRegion, cleanupRegion.end(),
@@ -105,14 +109,9 @@ static void createCleanupRegion(
fir::IfOp::create(builder, loc, isAllocated, /*withElseRegion=*/false);
builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
- if (cudaDataAttr) {
- cuf::FreeOp::create(builder, loc, addr, *cudaDataAttr);
- } else {
- mlir::Value cast = builder.createConvert(
- loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())),
- addr);
- fir::FreeMemOp::create(builder, loc, cast);
- }
+ mlir::Value cast = builder.createConvert(
+ loc, fir::HeapType::get(fir::dyn_cast_ptrEleTy(addr.getType())), addr);
+ fir::FreeMemOp::create(builder, loc, cast);
builder.setInsertionPointAfter(ifOp);
if (isDoConcurrent)
@@ -391,7 +390,7 @@ class PopulateInitAndCleanupRegionsHelper {
return loadedMoldArg;
}
- bool shouldAllocateTempOnStack() const;
+ bool shouldAllocateTempOnStack(fir::BaseBoxType boxTy) const;
};
} // namespace
@@ -454,7 +453,7 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
builder.setInsertionPointToStart(&ifUnallocated.getElseRegion().front());
}
- bool shouldAllocateOnStack = shouldAllocateTempOnStack();
+ bool shouldAllocateOnStack = shouldAllocateTempOnStack(boxTy);
mlir::Value valAlloc =
(shouldAllocateOnStack)
? builder.createTemporary(loc, innerTy, /*name=*/{},
@@ -485,12 +484,22 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedScalar(
createYield(allocatedPrivVarArg);
}
-bool PopulateInitAndCleanupRegionsHelper::shouldAllocateTempOnStack() const {
- // On the GPU, always allocate on the stack since heap allocatins are very
- // expensive.
+bool PopulateInitAndCleanupRegionsHelper::shouldAllocateTempOnStack(
+ fir::BaseBoxType boxTy) const {
auto offloadMod =
llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(*builder.getModule());
- return offloadMod && offloadMod.getIsGPU();
+  // On the GPU, always allocate on the stack, since heap allocations are very
+  // expensive, unless the user explicitly requests otherwise.
+ bool isGPU = offloadMod && offloadMod.getIsGPU();
+ if (isGPU && enableGPUHeapAlloc) {
+    // Check if it is an adjustable (dynamically sized) array.
+ if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(boxTy.getEleTy())) {
+ if (seqTy.hasUnknownShape() || seqTy.hasDynamicExtents()) {
+ return false;
+ }
+ }
+ }
+ return isGPU;
}
void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
@@ -535,34 +544,9 @@ void PopulateInitAndCleanupRegionsHelper::initAndCleanupBoxedArray(
// Allocating on the heap in case the whole reduction/privatization is nested
// inside of a loop
auto temp = [&]() {
- if (shouldAllocateTempOnStack())
+ if (shouldAllocateTempOnStack(boxTy))
return createStackTempFromMold(loc, builder, source);
- // For CUDA device arrays that require special allocation (device,
- // managed, unified, etc.), use cuf.alloc instead of fir.allocmem so
- // that the private copy lives in device memory.
- if (sym && Fortran::semantics::NeedCUDAAlloc(sym->GetUltimate())) {
- cuf::DataAttributeAttr dataAttr =
- Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
- sym->GetUltimate());
- mlir::Type sequenceType =
- hlfir::getFortranElementOrSequenceType(source.getType());
- mlir::Value shape = hlfir::genShape(loc, builder, source);
- auto extents = hlfir::getIndexExtents(loc, builder, shape);
- mlir::Value alloc = Fortran::lower::genCUFAlloc(
- builder, loc, sequenceType, /*uniqName=*/"", /*bindcName=*/".tmp",
- dataAttr, lenParams, extents);
- auto declareOp = hlfir::DeclareOp::create(
- builder, loc, alloc, ".tmp", shape, lenParams,
- /*dummy_scope=*/nullptr, /*storage=*/nullptr, /*storage_offset=*/0,
- fir::FortranVariableFlagsAttr{}, dataAttr);
- hlfir::Entity temp{declareOp.getBase()};
- mlir::OpBuilder::InsertionGuard guard(builder);
- createCleanupRegion(converter, loc, argType, cleanupRegion, sym,
- isDoConcurrent, dataAttr);
- return temp;
- }
-
auto [temp, needsDealloc] = createTempFromMold(loc, builder, source);
// if needsDealloc, add cleanup region. Always
// do this for allocatable boxes because they might have been re-allocated
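
A minimal sketch of the allocation decision introduced above, with the
MLIR/FIR queries replaced by plain booleans; isGPU, heapAllocEnabled, and
hasDynamicExtents stand in for the OffloadModuleInterface query, the
-enable-gpu-heap-alloc option, and the fir::SequenceType shape checks.

  #include <cstdio>

  // Sketch only: the real code inspects the module and the boxed type.
  static bool shouldAllocateTempOnStack(bool isGPU, bool heapAllocEnabled,
                                        bool hasDynamicExtents) {
    if (!isGPU)
      return false; // host: keep the existing heap-allocation path
    // On the GPU, default to the stack; only a user opt-in lets adjustable
    // (dynamically sized) arrays fall back to heap allocation.
    if (heapAllocEnabled && hasDynamicExtents)
      return false;
    return true;
  }

  int main() {
    std::printf("%d\n", shouldAllocateTempOnStack(true, true, true));  // 0
    std::printf("%d\n", shouldAllocateTempOnStack(true, false, true)); // 1
  }
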
diff --git a/flang/lib/Optimizer/OpenACC/Transforms/ACCUseDeviceCanonicalizer.cpp b/flang/lib/Optimizer/OpenACC/Transforms/ACCUseDeviceCanonicalizer.cpp
index 51ab796021e4b..e251a089be012 100644
--- a/flang/lib/Optimizer/OpenACC/Transforms/ACCUseDeviceCanonicalizer.cpp
+++ b/flang/lib/Optimizer/OpenACC/Transforms/ACCUseDeviceCanonicalizer.cpp
@@ -157,6 +157,8 @@ struct UseDeviceHostDataHoisting : public OpRewritePattern<acc::HostDataOp> {
acc::UseDeviceOp useDeviceOp,
acc::HostDataOp hostDataOp,
fir::BoxAddrOp boxAddr) const {
+ OpBuilder::InsertionGuard guard(rewriter);
+ rewriter.setInsertionPoint(hostDataOp);
// Create use_device on the raw pointer
acc::UseDeviceOp newUseDeviceOp = acc::UseDeviceOp::create(
rewriter, useDeviceOp.getLoc(), boxAddr.getType(), boxAddr.getResult(),
@@ -364,9 +366,14 @@ struct UseDeviceHostDataHoisting : public OpRewritePattern<acc::HostDataOp> {
for (mlir::Operation *user : usersToUpdate)
user->replaceUsesOfWith(useDeviceOp.getResult(), newBoxWithDevicePtr);
- assert(useDeviceOp.getResult().use_empty() &&
- "expected all uses of use_device to be replaced");
- rewriter.eraseOp(useDeviceOp);
+ // Remove the use_device operation if it is no longer needed.
+ if (useDeviceOp.getResult().use_empty()) {
+ LLVM_DEBUG(
+ llvm::dbgs()
+ << "ACCUseDeviceCanonicalizer: Removing dead use_device operation: "
+ << *useDeviceOp << "\n");
+ rewriter.eraseOp(useDeviceOp);
+ }
return true;
}
};
diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index 64a12c8fdec7b..3b5d43db56ec2 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -1634,7 +1634,7 @@ TYPE_PARSER( //
"SIZES" >> construct<OmpClause>(construct<OmpClause::Sizes>(
parenthesized(nonemptyList(scalarIntExpr)))) ||
"PERMUTATION" >> construct<OmpClause>(construct<OmpClause::Permutation>(
- parenthesized(nonemptyList(scalarIntExpr)))) ||
+ parenthesized(nonemptyList(scalarIntConstantExpr)))) ||
"THREADS"_id >> construct<OmpClause>(construct<OmpClause::Threads>()) ||
"THREADSET" >> construct<OmpClause>(construct<OmpClause::Threadset>(
parenthesized(Parser<OmpThreadsetClause>{}))) ||
@@ -2549,6 +2549,7 @@ static constexpr DirectiveSet GetLoopDirectives() {
unsigned(Directive::OMPD_fuse),
unsigned(Directive::OMPD_tile),
unsigned(Directive::OMPD_unroll),
+ unsigned(Directive::OMPD_interchange),
};
return loopDirectives;
}
diff --git a/flang/lib/Parser/program-parsers.cpp b/flang/lib/Parser/program-parsers.cpp
index 775810f8c3c0f..b26603b5aea45 100644
--- a/flang/lib/Parser/program-parsers.cpp
+++ b/flang/lib/Parser/program-parsers.cpp
@@ -68,9 +68,18 @@ static constexpr auto programUnit{
construct<ProgramUnit>(indirect(functionSubprogram)) ||
construct<ProgramUnit>(indirect(Parser<MainProgram>{}))};
-static constexpr auto normalProgramUnit{!consumedAllInput >>
- StartNewSubprogram{} >>
- programUnit / recovery(endOfStmt, skipToNextLineIfAny)};
+// Note: F'23 6.3.1 states that "A Fortran program unit is a sequence of one or
+// more lines, organized as Fortran statements, comments, and INCLUDE lines.",
+// which could be interpreted as implying that program units must occupy
+// mutually exclusive lines. NAG interprets it this way. We provide an
+// extension that allows multiple program units on the same line.
+static constexpr auto normalProgramUnit{
+ !consumedAllInput >> StartNewSubprogram{} >> programUnit /
+ recovery((maybe(semicolons) >> endOfLine) ||
+ (extension<LanguageFeature::MultipleProgramUnitsOnSameLine>(
+ "nonstandard usage: end of program unit not terminated by new line"_port_en_US,
+ semicolons >> not(endOfLine))),
+ skipToNextLineIfAny)};
static constexpr auto globalCompilerDirective{
construct<ProgramUnit>(indirect(compilerDirective))};
diff --git a/flang/lib/Parser/stmt-parser.h b/flang/lib/Parser/stmt-parser.h
index b2bb8dd843642..a003f6ea99d96 100644
--- a/flang/lib/Parser/stmt-parser.h
+++ b/flang/lib/Parser/stmt-parser.h
@@ -37,10 +37,10 @@ constexpr auto checkEndOfKnownStmt{recovery(atEndOfStmt, SkipTo<'\n'>{})};
constexpr auto endOfLine{consumedAllInput ||
withMessage("expected end of line"_err_en_US, "\n"_ch >> ok)};
-constexpr auto semicolons{";"_ch >> skipMany(";"_tok) / space / maybe("\n"_ch)};
+constexpr auto semicolons{";"_ch >> skipMany(";"_tok) / space};
constexpr auto endOfStmt{
space >> withMessage("expected end of statement"_err_en_US,
- semicolons || endOfLine)};
+ (semicolons / maybe(endOfLine)) || endOfLine)};
constexpr auto skipToNextLineIfAny{consumedAllInput || SkipPast<'\n'>{}};
constexpr auto forceEndOfStmt{recovery(endOfStmt, skipToNextLineIfAny)};
diff --git a/flang/lib/Semantics/check-omp-loop.cpp b/flang/lib/Semantics/check-omp-loop.cpp
index 8cb07ed352a01..01d07e30d3ce5 100644
--- a/flang/lib/Semantics/check-omp-loop.cpp
+++ b/flang/lib/Semantics/check-omp-loop.cpp
@@ -28,6 +28,7 @@
#include "flang/Semantics/tools.h"
#include "flang/Semantics/type.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/Frontend/OpenMP/OMP.h"
#include <cinttypes>
@@ -38,10 +39,6 @@
#include <tuple>
#include <variant>
-namespace Fortran::semantics {
-static std::optional<int64_t> GetNumGeneratedNests(const parser::Block &block);
-} // namespace Fortran::semantics
-
namespace {
using namespace Fortran;
@@ -245,19 +242,11 @@ void OmpStructureChecker::CheckSIMDNest(const parser::OpenMPConstruct &c) {
}
}
-static std::optional<int64_t> GetNumGeneratedNests(const parser::Block &block) {
- // Count the number of loops in the associated block. If there are any
- // malformed construct in there, getting the number may be meaningless.
- // These issues will be diagnosed elsewhere, and we should not emit any
- // messages about a potentially incorrect loop count.
- // In such cases reset the count to nullopt. Once it becomes nullopt,
- // keep it that way.
- return LoopSequence(block, true).length();
-}
-
void OmpStructureChecker::CheckNestedConstruct(
const parser::OpenMPLoopConstruct &x) {
const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()};
+ llvm::omp::Directive dir{beginSpec.DirId()};
+ unsigned version{context_.langOptions().OpenMPVersion};
// End-directive is not allowed in such cases:
// do 100 i = ...
@@ -280,8 +269,8 @@ void OmpStructureChecker::CheckNestedConstruct(
// Check constructs contained in the body of the loop construct.
auto &body{std::get<parser::Block>(x.t)};
for (auto &stmt : BlockRange(body, BlockRange::Step::Over)) {
- if (auto *dir{parser::Unwrap<parser::CompilerDirective>(stmt)}) {
- context_.Say(dir->source,
+ if (auto *d{parser::Unwrap<parser::CompilerDirective>(stmt)}) {
+ context_.Say(d->source,
"Compiler directives are not allowed inside OpenMP loop constructs"_warn_en_US);
} else if (auto *omp{parser::Unwrap<parser::OpenMPLoopConstruct>(stmt)}) {
if (!IsLoopTransforming(omp->BeginDir().DirId())) {
@@ -299,22 +288,71 @@ void OmpStructureChecker::CheckNestedConstruct(
}
}
+ LoopSequence sequence(body, version, true);
+
// Check if a loop-nest-associated construct has only one top-level loop
// in it.
- if (std::optional<int64_t> numLoops{GetNumGeneratedNests(body)}) {
+ auto [needFirst, needCount, rangeReason]{
+ GetAffectedLoopRangeWithReason(x, version)};
+
+ if (std::optional<int64_t> numLoops{sequence.length()}) {
if (*numLoops == 0) {
context_.Say(beginSpec.DirName().source,
"This construct should contain a DO-loop or a loop-nest-generating OpenMP construct"_err_en_US);
} else {
- auto assoc{llvm::omp::getDirectiveAssociation(beginSpec.DirName().v)};
+ auto assoc{llvm::omp::getDirectiveAssociation(dir)};
if (*numLoops > 1 && assoc == llvm::omp::Association::LoopNest) {
context_.Say(beginSpec.DirName().source,
"This construct applies to a loop nest, but has a loop sequence of "
"length %" PRId64 ""_err_en_US,
*numLoops);
}
+ if (assoc == llvm::omp::Association::LoopSeq) {
+ if (auto requiredCount{GetRequiredCount(needFirst, needCount)}) {
+ if (*requiredCount > 0 && *numLoops < *requiredCount) {
+ auto &msg{context_.Say(beginSpec.DirName().source,
+ "This construct requires a sequence of %" PRId64
+ " loops, but the loop sequence has a length of %" PRId64
+ ""_err_en_US,
+ *requiredCount, *numLoops)};
+ rangeReason.AttachTo(msg);
+ }
+ }
+ }
}
}
+
+ // Check requirements on nest depth.
+ auto [needDepth, needPerfect, depthReason]{
+ GetAffectedNestDepthWithReason(x, version)};
+ auto [haveSema, havePerf]{sequence.depth()};
+
+ if (dir != llvm::omp::Directive::OMPD_fuse) {
+ auto haveDepth = needPerfect ? havePerf : haveSema;
+ // If the present depth is 0, it's likely that the construct doesn't
+ // have any loops in it, which would be diagnosed above.
+ if (needDepth && haveDepth && *haveDepth > 0) {
+ if (*needDepth > *haveDepth) {
+ if (needPerfect) {
+ auto &msg{context_.Say(beginSpec.DirName().source,
+ "This construct requires a perfect nest of depth %" PRId64
+ ", but the associated nest is a perfect nest of depth %" PRId64
+ ""_err_en_US,
+ *needDepth, *haveDepth)};
+ depthReason.AttachTo(msg);
+ } else {
+ auto &msg{context_.Say(beginSpec.DirName().source,
+ "This construct requires a nest of depth %" PRId64
+ ", but the associated nest has a depth of %" PRId64 ""_err_en_US,
+ *needDepth, *haveDepth)};
+ depthReason.AttachTo(msg);
+ }
+ }
+ }
+ } else {
+ // FUSE requires a sequence of perfect nests.
+    // TODO: Implement this check; it is deferred for now.
+ }
}
void OmpStructureChecker::Enter(const parser::OpenMPLoopConstruct &x) {
@@ -514,29 +552,6 @@ void OmpStructureChecker::CheckDistLinear(
}
}
-void OmpStructureChecker::CheckLooprangeBounds(
- const parser::OpenMPLoopConstruct &x) {
- if (auto *clause{parser::omp::FindClause(
- x.BeginDir(), llvm::omp::Clause::OMPC_looprange)}) {
- auto *lrClause{parser::Unwrap<parser::OmpLooprangeClause>(clause)};
- auto first{GetIntValue(std::get<0>(lrClause->t))};
- auto count{GetIntValue(std::get<1>(lrClause->t))};
- if (!first || !count || *first <= 0 || *count <= 0) {
- return;
- }
- int64_t requiredCount{*first + *count - 1};
- if (auto loopCount{GetNumGeneratedNests(std::get<parser::Block>(x.t))}) {
- if (*loopCount < requiredCount) {
- context_.Say(clause->source,
- "The specified loop range requires %" PRId64
- " loops, but the loop sequence has a length of %" PRId64
- ""_err_en_US,
- requiredCount, *loopCount);
- }
- }
- }
-}
-
void OmpStructureChecker::CheckScanModifier(
const parser::OmpClause::Reduction &x) {
using ReductionModifier = parser::OmpReductionModifier;
@@ -585,9 +600,6 @@ void OmpStructureChecker::Leave(const parser::OpenMPLoopConstruct &x) {
CheckScanModifier(*reduction);
}
}
- if (beginSpec.DirName().v == llvm::omp::Directive::OMPD_fuse) {
- CheckLooprangeBounds(x);
- }
if (llvm::omp::allSimdSet.test(beginSpec.DirName().v)) {
ExitDirectiveNest(SIMDNest);
}
@@ -769,6 +781,40 @@ void OmpStructureChecker::Enter(const parser::OmpClause::Sizes &c) {
/*paramName=*/"parameter", /*allowZero=*/false);
}
+void OmpStructureChecker::Enter(const parser::OmpClause::Permutation &c) {
+ unsigned version{context_.langOptions().OpenMPVersion};
+ llvm::omp::Clause clause = llvm::omp::Clause::OMPC_permutation;
+ CheckAllowedClause(clause);
+ if (c.v.size() < 2)
+ context_.Say(GetContext().clauseSource,
+ "The %s clause must have a length of at least two"_err_en_US,
+ parser::omp::GetUpperName(clause, version));
+
+ llvm::BitVector found(c.v.size(), false);
+ bool cont = true;
+ for (const auto &val : c.v) {
+ if (const auto v{GetIntValue(val)}) {
+ if (*v <= 0) {
+ cont = false;
+ context_.Say(GetContext().clauseSource,
+ "The parameter of the %s clause must be a constant positive integer expression"_err_en_US,
+ parser::omp::GetUpperName(clause, version));
+ } else if ((unsigned)*v - 1 < c.v.size()) {
+ found.set(*v - 1);
+ }
+ } else
+ cont = false;
+ }
+
+ if (!cont)
+ return;
+ if (!found.all()) {
+ context_.Say(GetContext().clauseSource,
+ "Every integer from 1 must appear in the %s clause"_err_en_US,
+ parser::omp::GetUpperName(clause, version));
+ }
+}
+
void OmpStructureChecker::Enter(const parser::OmpClause::Looprange &x) {
CheckAllowedClause(llvm::omp::Clause::OMPC_looprange);
auto &[first, count]{x.v.t};
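
A standalone sketch of the PERMUTATION validation added above: the clause
needs at least two list items, every item must be a positive integer, and
every integer from 1 to the list length must appear. This is a simplification
that assumes all items are already constant-folded and returns a bool instead
of emitting diagnostics.

  #include <cstddef>
  #include <cstdint>
  #include <vector>

  static bool isValidPermutation(const std::vector<int64_t> &vals) {
    if (vals.size() < 2)
      return false; // PERMUTATION needs at least two list items
    std::vector<bool> found(vals.size(), false);
    for (int64_t v : vals) {
      if (v <= 0)
        return false; // every item must be a positive integer
      if (static_cast<std::size_t>(v - 1) < vals.size())
        found[static_cast<std::size_t>(v - 1)] = true;
    }
    // Every integer from 1 to the list length must appear.
    for (bool f : found)
      if (!f)
        return false;
    return true;
  }
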
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 431c41f443f7a..179581469e5c9 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -5785,7 +5785,6 @@ CHECK_SIMPLE_CLAUSE(OmpxAttribute, OMPC_ompx_attribute)
CHECK_SIMPLE_CLAUSE(Order, OMPC_order)
CHECK_SIMPLE_CLAUSE(Otherwise, OMPC_otherwise)
CHECK_SIMPLE_CLAUSE(Partial, OMPC_partial)
-CHECK_SIMPLE_CLAUSE(Permutation, OMPC_permutation)
CHECK_SIMPLE_CLAUSE(ProcBind, OMPC_proc_bind)
CHECK_SIMPLE_CLAUSE(Read, OMPC_read)
CHECK_SIMPLE_CLAUSE(Relaxed, OMPC_relaxed)
diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h
index cd98334ad8662..dc84c9d9ae9d8 100644
--- a/flang/lib/Semantics/check-omp-structure.h
+++ b/flang/lib/Semantics/check-omp-structure.h
@@ -326,7 +326,6 @@ class OmpStructureChecker : public OmpStructureCheckerBase {
void CheckAtomicUpdate(const parser::OpenMPAtomicConstruct &x);
void CheckScanModifier(const parser::OmpClause::Reduction &x);
- void CheckLooprangeBounds(const parser::OpenMPLoopConstruct &x);
void CheckDistLinear(const parser::OpenMPLoopConstruct &x);
void CheckSIMDNest(const parser::OpenMPConstruct &x);
void CheckNestedConstruct(const parser::OpenMPLoopConstruct &x);
diff --git a/flang/lib/Semantics/openmp-utils.cpp b/flang/lib/Semantics/openmp-utils.cpp
index 78f763c27e757..8abf008a72147 100644
--- a/flang/lib/Semantics/openmp-utils.cpp
+++ b/flang/lib/Semantics/openmp-utils.cpp
@@ -34,6 +34,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
+#include <cinttypes>
#include <optional>
#include <string>
#include <tuple>
@@ -531,6 +532,70 @@ MaybeExpr MakeEvaluateExpr(const parser::OmpStylizedInstance &inp) {
instance.u);
}
+parser::Message &Reason::AttachTo(parser::Message &msg) {
+ msgs.AttachTo(msg);
+ return msg;
+}
+
+std::pair<std::optional<int64_t>, Reason> GetArgumentValueWithReason(
+ const parser::OmpDirectiveSpecification &spec, llvm::omp::Clause clauseId,
+ unsigned version) {
+ if (auto *clause{parser::omp::FindClause(spec, clauseId)}) {
+ if (auto *expr{parser::Unwrap<parser::Expr>(clause->u)}) {
+ if (auto value{GetIntValue(*expr)}) {
+ std::string name{GetUpperName(clauseId, version)};
+ Reason reason;
+ reason.Say(clause->source,
+ "%s clause was specified with argument %" PRId64 ""_because_en_US,
+ name.c_str(), *value);
+ return {*value, std::move(reason)};
+ }
+ }
+ }
+ return {std::nullopt, Reason()};
+}
+
+template <typename T>
+static std::pair<std::optional<int64_t>, Reason>
+GetNumArgumentsWithReasonForType(
+ const parser::OmpClause &clause, const std::string &name) {
+ if (auto *args{parser::Unwrap<std::list<T>>(clause.u)}) {
+ auto num{static_cast<int64_t>(args->size())};
+ Reason reason;
+ reason.Say(clause.source,
+ "%s clause was specified with %" PRId64 " arguments"_because_en_US,
+ name.c_str(), num);
+ return {num, std::move(reason)};
+ }
+ return {std::nullopt, Reason()};
+}
+
+std::pair<std::optional<int64_t>, Reason> GetNumArgumentsWithReason(
+ const parser::OmpDirectiveSpecification &spec, llvm::omp::Clause clauseId,
+ unsigned version) {
+ if (auto *clause{parser::omp::FindClause(spec, clauseId)}) {
+ std::string name{GetUpperName(clauseId, version)};
+ // Try the types used for list items.
+ {
+ using Ty = parser::ScalarIntExpr;
+ if (auto [num, reason]{
+ GetNumArgumentsWithReasonForType<Ty>(*clause, name)};
+ num) {
+ return {num, std::move(reason)};
+ }
+ }
+ {
+ using Ty = parser::ScalarIntConstantExpr;
+ if (auto [num, reason]{
+ GetNumArgumentsWithReasonForType<Ty>(*clause, name)};
+ num) {
+ return {num, std::move(reason)};
+ }
+ }
+ }
+ return {std::nullopt, Reason()};
+}
+
bool IsLoopTransforming(llvm::omp::Directive dir) {
switch (dir) {
// TODO case llvm::omp::Directive::OMPD_flatten:
@@ -723,20 +788,156 @@ bool IsTransformableLoop(const parser::ExecutionPartConstruct &epc) {
return false;
}
-LoopSequence::LoopSequence(
- const parser::ExecutionPartConstruct &root, bool allowAllLoops)
- : allowAllLoops_(allowAllLoops) {
+// Return the depth of the affected nests:
+// {affected-depth, must-be-perfect-nest, reason}.
+std::tuple<std::optional<int64_t>, bool, Reason> GetAffectedNestDepthWithReason(
+ const parser::OpenMPLoopConstruct &x, unsigned version) {
+ const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()};
+ llvm::omp::Directive dir{beginSpec.DirId()};
+ bool allowsCollapse{llvm::omp::isAllowedClauseForDirective(
+ dir, llvm::omp::Clause::OMPC_collapse, version)};
+ bool allowsOrdered{llvm::omp::isAllowedClauseForDirective(
+ dir, llvm::omp::Clause::OMPC_ordered, version)};
+
+ if (allowsCollapse || allowsOrdered) {
+ auto [count, reason]{GetArgumentValueWithReason(
+ beginSpec, llvm::omp::Clause::OMPC_collapse, version)};
+ auto [vo, ro]{GetArgumentValueWithReason(
+ beginSpec, llvm::omp::Clause::OMPC_ordered, version)};
+ if (vo) {
+ if (!count || *count < *vo) {
+ count = vo;
+ reason = std::move(ro);
+ }
+ }
+ return {count, true, std::move(reason)};
+ }
+
+ if (IsLoopTransforming(dir)) {
+ switch (dir) {
+ case llvm::omp::Directive::OMPD_interchange: {
+ // Get the length of the argument list to PERMUTATION.
+ if (parser::omp::FindClause(
+ beginSpec, llvm::omp::Clause::OMPC_permutation)) {
+ auto [num, reason]{GetNumArgumentsWithReason(
+ beginSpec, llvm::omp::Clause::OMPC_permutation, version)};
+ return {num, true, std::move(reason)};
+ }
+ // PERMUTATION not specified, assume PERMUTATION(2, 1).
+ std::string name{parser::omp::GetUpperName(
+ llvm::omp::Clause::OMPC_permutation, version)};
+ Reason reason;
+ reason.Say(beginSpec.source,
+ "%s clause was not specified, %s(2, 1) was assumed"_because_en_US,
+ name.c_str(), name.c_str());
+ return {2, true, std::move(reason)};
+ }
+ case llvm::omp::Directive::OMPD_stripe:
+ case llvm::omp::Directive::OMPD_tile: {
+ // Get the length of the argument list to SIZES.
+ auto [num, reason]{GetNumArgumentsWithReason(
+ beginSpec, llvm::omp::Clause::OMPC_sizes, version)};
+ return {num, true, std::move(reason)};
+ }
+ case llvm::omp::Directive::OMPD_fuse: {
+ // Get the value from the argument to DEPTH.
+ if (parser::omp::FindClause(beginSpec, llvm::omp::Clause::OMPC_depth)) {
+ auto [count, reason]{GetArgumentValueWithReason(
+ beginSpec, llvm::omp::Clause::OMPC_depth, version)};
+ return {count, true, std::move(reason)};
+ }
+ std::string name{
+ parser::omp::GetUpperName(llvm::omp::Clause::OMPC_depth, version)};
+ Reason reason;
+ reason.Say(beginSpec.source,
+ "%s clause was not specified, a value of 1 was assumed"_because_en_US,
+ name.c_str());
+ return {1, true, std::move(reason)};
+ }
+ case llvm::omp::Directive::OMPD_reverse:
+ case llvm::omp::Directive::OMPD_unroll:
+ return {1, false, Reason()};
+ // TODO: case llvm::omp::Directive::OMPD_flatten:
+ // TODO: case llvm::omp::Directive::OMPD_split:
+ default:
+ break;
+ }
+ }
+
+ return {std::nullopt, false, Reason()};
+}
+
+// Return the range of the affected nests in the sequence:
+// {first, count, reason}.
+std::tuple<std::optional<int64_t>, std::optional<int64_t>, Reason>
+GetAffectedLoopRangeWithReason(
+ const parser::OpenMPLoopConstruct &x, unsigned version) {
+ const parser::OmpDirectiveSpecification &beginSpec{x.BeginDir()};
+ llvm::omp::Directive dir{beginSpec.DirId()};
+
+ if (dir == llvm::omp::Directive::OMPD_fuse) {
+ std::string name{GetUpperName(llvm::omp::Clause::OMPC_looprange, version)};
+ if (auto *clause{parser::omp::FindClause(
+ beginSpec, llvm::omp::Clause::OMPC_looprange)}) {
+ auto &range{DEREF(parser::Unwrap<parser::OmpLooprangeClause>(clause->u))};
+ std::optional<int64_t> first{GetIntValue(std::get<0>(range.t))};
+ std::optional<int64_t> count{GetIntValue(std::get<1>(range.t))};
+ if (!first || !count || *first <= 0 || *count <= 0) {
+ return {std::nullopt, std::nullopt, Reason()};
+ }
+ std::string name{parser::omp::GetUpperName(
+ llvm::omp::Clause::OMPC_looprange, version)};
+ Reason reason;
+ reason.Say(clause->source,
+ "%s clause was specified with a count of %" PRId64
+ " starting at loop %" PRId64 ""_because_en_US,
+ name.c_str(), *count, *first);
+ return {*first, *count, std::move(reason)};
+ }
+ // If LOOPRANGE was not found, return {1, -1}, where -1 means "the whole
+ // associated sequence".
+ Reason reason;
+ reason.Say(x.source,
+ "%s clause was not specified, a value of 1 was assumed"_because_en_US,
+ name.c_str());
+ return {1, -1, std::move(reason)};
+ }
+
+ assert(llvm::omp::getDirectiveAssociation(dir) ==
+ llvm::omp::Association::LoopNest &&
+ "Expecting loop-nest-associated construct");
+ // For loop-nest constructs, a single loop-nest is affected.
+ return {1, 1, Reason()};
+}
+
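As a rough illustration of the LOOPRANGE handling above (a sketch only, not part of the patch; the subroutine name is made up and the FUSE/LOOPRANGE spelling is the OpenMP 6.0 form this code targets):

  ! Illustrative only: LOOPRANGE(2, 2) selects two loops starting at the
  ! second one, so loops 2 and 3 of the sequence are fused.
  subroutine fuse_middle(a, n)
    integer :: n, i
    real :: a(n)
    !$omp fuse looprange(2, 2)
    do i = 1, n
      a(i) = a(i) + 1.0
    end do
    do i = 1, n
      a(i) = a(i) * 2.0
    end do
    do i = 1, n
      a(i) = a(i) - 3.0
    end do
    !$omp end fuse
  end subroutine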
+std::optional<int64_t> GetRequiredCount(
+ std::optional<int64_t> first, std::optional<int64_t> count) {
+ if (first && count && *first > 0) {
+ if (*count > 0) {
+ return *first + *count - 1;
+ } else if (*count == -1) {
+ return -1;
+ }
+ }
+ return std::nullopt;
+}
+
+LoopSequence::LoopSequence(const parser::ExecutionPartConstruct &root,
+ unsigned version, bool allowAllLoops)
+ : version_(version), allowAllLoops_(allowAllLoops) {
entry_ = createConstructEntry(root);
assert(entry_ && "Expecting loop like code");
createChildrenFromRange(entry_->location);
- length_ = calculateLength();
+ precalculate();
}
-LoopSequence::LoopSequence(std::unique_ptr<Construct> entry, bool allowAllLoops)
- : allowAllLoops_(allowAllLoops), entry_(std::move(entry)) {
+LoopSequence::LoopSequence(
+ std::unique_ptr<Construct> entry, unsigned version, bool allowAllLoops)
+ : version_(version), allowAllLoops_(allowAllLoops),
+ entry_(std::move(entry)) {
createChildrenFromRange(entry_->location);
- length_ = calculateLength();
+ precalculate();
}
std::unique_ptr<LoopSequence::Construct> LoopSequence::createConstructEntry(
@@ -759,16 +960,33 @@ std::unique_ptr<LoopSequence::Construct> LoopSequence::createConstructEntry(
void LoopSequence::createChildrenFromRange(
ExecutionPartIterator::IteratorType begin,
ExecutionPartIterator::IteratorType end) {
+ // Create children. If there are zero or one, this LoopSequence could be
+ // a nest. If there are more, it could be a proper sequence. In the latter
+ // case, any code between consecutive children must be "transparent".
for (auto &code : BlockRange(begin, end, BlockRange::Step::Over)) {
if (auto entry{createConstructEntry(code)}) {
- children_.push_back(LoopSequence(std::move(entry), allowAllLoops_));
+ children_.push_back(
+ LoopSequence(std::move(entry), version_, allowAllLoops_));
+ if (!IsTransformableLoop(code)) {
+ hasInvalidIC_ = true;
+ hasOpaqueIC_ = true;
+ }
+ } else {
+ hasInvalidIC_ = hasInvalidIC_ || !IsValidInterveningCode(code);
+ hasOpaqueIC_ = hasOpaqueIC_ || !IsTransparentInterveningCode(code);
}
}
}
+void LoopSequence::precalculate() {
+ // Calculate length before depths.
+ length_ = calculateLength();
+ depth_ = calculateDepths();
+}
+
std::optional<int64_t> LoopSequence::calculateLength() const {
if (!entry_->owner) {
- return sumOfChildrenLengths();
+ return getNestedLength();
}
if (parser::Unwrap<parser::DoConstruct>(entry_->owner)) {
return 1;
@@ -786,7 +1004,7 @@ std::optional<int64_t> LoopSequence::calculateLength() const {
return std::nullopt;
}
- auto nestedCount{sumOfChildrenLengths()};
+ auto nestedLength{getNestedLength()};
if (dir == llvm::omp::Directive::OMPD_fuse) {
// If there are no loops nested inside of FUSE, then the construct is
@@ -799,7 +1017,7 @@ std::optional<int64_t> LoopSequence::calculateLength() const {
// !$omp do ! error: this should contain a loop (superfluous)
// !$omp fuse ! error: this should contain a loop
// !$omp end fuse
- if (!nestedCount || *nestedCount == 0) {
+ if (!nestedLength || *nestedLength == 0) {
return std::nullopt;
}
auto *clause{
@@ -813,21 +1031,21 @@ std::optional<int64_t> LoopSequence::calculateLength() const {
if (!count || *count <= 0) {
return std::nullopt;
}
- if (*count <= *nestedCount) {
- return 1 + *nestedCount - *count;
+ if (*count <= *nestedLength) {
+ return 1 + *nestedLength - *count;
}
return std::nullopt;
}
if (dir == llvm::omp::Directive::OMPD_nothing) {
- return nestedCount;
+ return nestedLength;
}
// For every other loop construct return 1.
return 1;
}
-std::optional<int64_t> LoopSequence::sumOfChildrenLengths() const {
+std::optional<int64_t> LoopSequence::getNestedLength() const {
int64_t sum{0};
for (auto &seq : children_) {
if (auto len{seq.length()}) {
@@ -838,4 +1056,125 @@ std::optional<int64_t> LoopSequence::sumOfChildrenLengths() const {
}
return sum;
}
+
+LoopSequence::Depth LoopSequence::calculateDepths() const {
+ auto plus{[](std::optional<int64_t> a,
+ std::optional<int64_t> b) -> std::optional<int64_t> {
+ if (a && b) {
+ return *a + *b;
+ }
+ return std::nullopt;
+ }};
+
+ // The sequence length is calculated first, so we already know if this
+ // sequence is a nest or not.
+ if (!isNest()) {
+ return Depth{0, 0};
+ }
+
+ // Get the length of the nested sequence. The hasInvalidIC_ and hasOpaqueIC_
+ // flags do not account for canonical loop nests, but for the depth to make
+ // sense there can be only one such nest.
+ std::optional<int64_t> length{getNestedLength()};
+ // Get the depths of the code nested in this sequence (i.e. contained in
+ // entry_), and use them as the basis for the depths of entry_->owner.
+ auto [semaDepth, perfDepth]{getNestedDepths()};
+ if (hasInvalidIC_ || length.value_or(0) != 1) {
+ semaDepth = perfDepth = 0;
+ } else if (hasOpaqueIC_ || length.value_or(0) != 1) {
+ perfDepth = 0;
+ }
+
+ if (!entry_->owner) {
+ return Depth{semaDepth, perfDepth};
+ }
+ if (parser::Unwrap<parser::DoConstruct>(entry_->owner)) {
+ return Depth{plus(1, semaDepth), plus(1, perfDepth)};
+ }
+
+ auto &omp{DEREF(parser::Unwrap<parser::OpenMPLoopConstruct>(*entry_->owner))};
+ const parser::OmpDirectiveSpecification &beginSpec{omp.BeginDir()};
+ llvm::omp::Directive dir{beginSpec.DirId()};
+ if (!IsTransformableLoop(omp)) {
+ return Depth{0, 0};
+ }
+
+ switch (dir) {
+ // TODO: case llvm::omp::Directive::OMPD_split:
+ // TODO: case llvm::omp::Directive::OMPD_flatten:
+ case llvm::omp::Directive::OMPD_fuse:
+ if (auto *clause{parser::omp::FindClause(
+ beginSpec, llvm::omp::Clause::OMPC_depth)}) {
+ auto &expr{parser::UnwrapRef<parser::Expr>(clause->u)};
+ auto value{GetIntValue(expr)};
+ auto nestedLength{getNestedLength()};
+ // The result is a perfect nest only if all loops in the sequence
+ // are fused.
+ if (value && nestedLength) {
+ auto [first, count, _]{GetAffectedLoopRangeWithReason(omp, version_)};
+ if (auto required{GetRequiredCount(first, count)}) {
+ if (*required == -1 || *required == *nestedLength) {
+ return Depth{value, value};
+ }
+ return Depth{1, 1};
+ }
+ }
+ return Depth{std::nullopt, std::nullopt};
+ }
+ return Depth{1, 1};
+ case llvm::omp::Directive::OMPD_interchange:
+ case llvm::omp::Directive::OMPD_nothing:
+ case llvm::omp::Directive::OMPD_reverse:
+ return {semaDepth, perfDepth};
+ case llvm::omp::Directive::OMPD_stripe:
+ case llvm::omp::Directive::OMPD_tile:
+ // Look for SIZES clause.
+ if (auto *clause{parser::omp::FindClause(
+ beginSpec, llvm::omp::Clause::OMPC_sizes)}) {
+ // Return the number of arguments in the SIZES clause
+ size_t num{
+ parser::UnwrapRef<parser::OmpClause::Sizes>(clause->u).v.size()};
+ return Depth{plus(num, semaDepth), plus(num, perfDepth)};
+ }
+ // The SIZES clause is mandatory; if it is missing, the result is unknown.
+ return {std::nullopt, std::nullopt};
+ case llvm::omp::Directive::OMPD_unroll:
+ if (IsFullUnroll(omp)) {
+ return Depth{0, 0};
+ }
+ // If this is not a full unroll then look for a PARTIAL clause.
+ if (auto *clause{parser::omp::FindClause(
+ beginSpec, llvm::omp::Clause::OMPC_partial)}) {
+ std::optional<int64_t> factor;
+ if (auto *expr{parser::Unwrap<parser::Expr>(clause->u)}) {
+ factor = GetIntValue(*expr);
+ }
+ // If it's a partial unroll, and the unroll count is 1, then this
+ // construct is a no-op.
+ if (factor && *factor == 1) {
+ return Depth{semaDepth, perfDepth};
+ }
+ // If it's a proper partial unroll, then the resulting loop cannot
+ // have either depth greater than 1: if it had a loop nested in it,
+ // then after unrolling it would contain at least two copies of that
+ // loop, making it a final loop.
+ return {1, 1};
+ }
+ return Depth{std::nullopt, std::nullopt};
+ default:
+ llvm_unreachable("Expecting loop-transforming construct");
+ }
+}
+
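For the UNROLL branch above, a minimal Fortran sketch of why a proper partial unroll caps the usable depth at one (illustrative only, not part of the patch; the subroutine name is made up):

  ! Illustrative only: UNROLL PARTIAL(4) leaves exactly one generated loop,
  ! so the enclosing worksharing-loop construct can associate with at most
  ! one loop level here.
  subroutine partial_unroll_depth(a, n)
    integer :: n, i
    real :: a(n)
    !$omp do
    !$omp unroll partial(4)
    do i = 1, n
      a(i) = a(i) + 1.0
    end do
  end subroutine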
+LoopSequence::Depth LoopSequence::getNestedDepths() const {
+ if (length() != 1) {
+ return Depth{0, 0};
+ } else if (children_.empty()) {
+ assert(entry_->owner &&
+ parser::Unwrap<parser::DoConstruct>(entry_->owner) &&
+ "Expecting DO construct");
+ return Depth{0, 0};
+ }
+ return children_.front().depth_;
+}
} // namespace Fortran::semantics::omp
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 0e74c21b73768..96f66246ac676 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -1123,7 +1123,6 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor<llvm::omp::Directive> {
void CheckDataCopyingClause(
const parser::Name &, const Symbol &, Symbol::Flag);
- void CheckAssocLoopLevel(std::int64_t level, const parser::OmpClause *clause);
void CheckObjectIsPrivatizable(
const parser::Name &, const Symbol &, Symbol::Flag);
void CheckSourceLabel(const parser::Label &);
@@ -2137,6 +2136,10 @@ static bool isSizesClause(const parser::OmpClause *clause) {
return std::holds_alternative<parser::OmpClause::Sizes>(clause->u);
}
+static bool isCollapseClause(const parser::OmpClause *clause) {
+ return std::holds_alternative<parser::OmpClause::Collapse>(clause->u);
+}
+
std::int64_t OmpAttributeVisitor::SetAssociatedMaxClause(
llvm::SmallVector<std::int64_t> &levels,
llvm::SmallVector<const parser::OmpClause *> &clauses) {
@@ -2145,18 +2148,14 @@ std::int64_t OmpAttributeVisitor::SetAssociatedMaxClause(
// does not exeed the number of tiled loops.
std::int64_t tileLevel = 0;
for (auto [level, clause] : llvm::zip_equal(levels, clauses))
- if (isSizesClause(clause))
+ if (clause && isSizesClause(clause))
tileLevel = level;
std::int64_t maxLevel = 1;
const parser::OmpClause *maxClause = nullptr;
for (auto [level, clause] : llvm::zip_equal(levels, clauses)) {
- if (tileLevel > 0 && tileLevel < level) {
- context_.Say(clause->source,
- "The value of the parameter in the COLLAPSE clause must"
- " not be larger than the number of the number of tiled loops"
- " because collapse currently is limited to independent loop"
- " iterations."_err_en_US);
+ if (clause && isCollapseClause(clause) && tileLevel > 0 &&
+ tileLevel < level) {
return 1;
}
@@ -2187,6 +2186,14 @@ void OmpAttributeVisitor::CollectNumAffectedLoopsFromLoopConstruct(
CollectNumAffectedLoopsFromClauses(clauseList, levels, clauses);
CollectNumAffectedLoopsFromInnerLoopContruct(x, levels, clauses);
+
+ // OMPD_interchange with no PERMUTATION clause needs a loop nest of depth 2.
+ if (x.BeginDir().DirId() == llvm::omp::Directive::OMPD_interchange &&
+ !parser::omp::FindClause(
+ x.BeginDir(), llvm::omp::Clause::OMPC_permutation)) {
+ levels.push_back(2);
+ clauses.push_back(nullptr);
+ }
}
void OmpAttributeVisitor::CollectNumAffectedLoopsFromInnerLoopContruct(
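A short Fortran sketch of the case handled just above (illustrative only, not part of the patch; the subroutine name is made up): INTERCHANGE with no PERMUTATION clause defaults to PERMUTATION(2, 1) and therefore requires two perfectly nested loops.

  subroutine swap_loops(a, n, m)
    integer :: n, m, i, j
    real :: a(n, m)
    ! Without PERMUTATION, PERMUTATION(2, 1) is assumed, so the i and j
    ! loops below are both required and are interchanged.
    !$omp interchange
    do i = 1, n
      do j = 1, m
        a(i, j) = real(i + j)
      end do
    end do
  end subroutine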
@@ -2229,6 +2236,11 @@ void OmpAttributeVisitor::CollectNumAffectedLoopsFromClauses(
levels.push_back(tclause->v.size());
clauses.push_back(&clause);
}
+ if (const auto iclause{
+ std::get_if<parser::OmpClause::Permutation>(&clause.u)}) {
+ levels.push_back(iclause->v.size());
+ clauses.push_back(&clause);
+ }
}
}
@@ -2292,41 +2304,14 @@ void OmpAttributeVisitor::CheckPerfectNestAndRectangularLoop(
// Recurse into nested loop
const auto &block{std::get<parser::Block>(loop->t)};
if (block.empty()) {
- // Insufficient number of nested loops already reported by
- // CheckAssocLoopLevel()
break;
}
loop = GetDoConstructIf(block.front());
if (!loop) {
- // Insufficient number of nested loops already reported by
- // CheckAssocLoopLevel()
break;
}
- auto checkPerfectNest = [&, this]() {
- if (block.empty())
- return;
- auto last = block.end();
- --last;
-
- // A trailing CONTINUE is not considered part of the loop body
- if (parser::Unwrap<parser::ContinueStmt>(*last))
- --last;
-
- // In a perfectly nested loop, the nested loop must be the only
- // statement
- if (last == block.begin())
- return;
-
- // Non-perfectly nested loop
- // TODO: Point to non-DO statement, directiveSource as a note
- context_.Say(dirContext.directiveSource,
- "Canonical loop nest must be perfectly nested."_err_en_US);
- };
-
- checkPerfectNest();
-
++curLevel;
}
}
@@ -2400,25 +2385,6 @@ void OmpAttributeVisitor::PrivatizeAssociatedLoopIndexAndCheckLoopLevel(
loop = it != block.end() ? GetDoConstructIf(*it) : nullptr;
}
}
- CheckAssocLoopLevel(level, GetAssociatedClause());
- }
- }
-}
-
-void OmpAttributeVisitor::CheckAssocLoopLevel(
- std::int64_t level, const parser::OmpClause *clause) {
- if (clause && level != 0) {
- switch (clause->Id()) {
- case llvm::omp::OMPC_sizes:
- context_.Say(clause->source,
- "The SIZES clause has more entries than there are nested canonical loops."_err_en_US);
- break;
- default:
- context_.Say(clause->source,
- "The value of the parameter in the COLLAPSE or ORDERED clause must"
- " not be larger than the number of nested loops"
- " following the construct."_err_en_US);
- break;
}
}
}
diff --git a/flang/test/Fir/OpenACC/use-device-canonicalizer.mlir b/flang/test/Fir/OpenACC/use-device-canonicalizer.mlir
index 6ec583dd4fe22..7baff8bd15384 100644
--- a/flang/test/Fir/OpenACC/use-device-canonicalizer.mlir
+++ b/flang/test/Fir/OpenACC/use-device-canonicalizer.mlir
@@ -94,3 +94,36 @@ func.func @test_host_data_hoisting_ref_to_box() {
return
}
+// -----
+
+// Test single use_device used by multiple host_data: one use_device is created
+// per host_data, each inserted right before its host_data.
+func.func @multiple_host_data_(%arg0: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "arr"}) {
+ %0 = fir.dummy_scope : !fir.dscope
+ %1 = fir.declare %arg0 dummy_scope %0 arg 1 {uniq_name = "_QFmultiple_host_dataEarr"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+ %2 = fir.rebox %1 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>>
+ %3 = acc.use_device var(%2 : !fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>> {name = "arr"}
+ acc.host_data dataOperands(%3 : !fir.box<!fir.array<?xf32>>) {
+ %4 = fir.dummy_scope : !fir.dscope
+ %5 = fir.declare %3 dummy_scope %4 arg 1 {uniq_name = "_QFmultiple_host_dataEarr"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+ acc.terminator
+ }
+ acc.host_data dataOperands(%3 : !fir.box<!fir.array<?xf32>>) {
+ %4 = fir.dummy_scope : !fir.dscope
+ %5 = fir.declare %3 dummy_scope %4 arg 1 {uniq_name = "_QFmultiple_host_dataEarr"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+ acc.terminator
+ }
+ return
+}
+// CHECK-LABEL: func.func @multiple_host_data_
+// CHECK-SAME: %arg0: !fir.box<!fir.array<?xf32>>
+// CHECK: fir.box_addr %2
+// CHECK: fir.box_addr %2
+// CHECK: %[[USE_DEVICE_1:.*]] = acc.use_device varPtr({{.*}} : !fir.ref<!fir.array<?xf32>>) varType(!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> {name = "arr"}
+// CHECK: acc.host_data dataOperands(%[[USE_DEVICE_1]] : !fir.ref<!fir.array<?xf32>>) {
+// CHECK: acc.terminator
+// CHECK: }
+// CHECK: %[[USE_DEVICE_2:.*]] = acc.use_device varPtr({{.*}} : !fir.ref<!fir.array<?xf32>>) varType(!fir.box<!fir.array<?xf32>>) -> !fir.ref<!fir.array<?xf32>> {name = "arr"}
+// CHECK: acc.host_data dataOperands(%[[USE_DEVICE_2]] : !fir.ref<!fir.array<?xf32>>) {
+// CHECK: acc.terminator
+// CHECK: }
diff --git a/flang/test/Lower/Intrinsics/iall.f90 b/flang/test/Lower/Intrinsics/iall.f90
index a1d320ea8b1a2..200a7f7eac8e5 100644
--- a/flang/test/Lower/Intrinsics/iall.f90
+++ b/flang/test/Lower/Intrinsics/iall.f90
@@ -1,156 +1,215 @@
-! RUN: bbc --use-desc-for-alloc=false -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s
! CHECK-LABEL: func @_QPiall_test_1(
-! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?xi8>>{{.*}}) -> i8 {
+! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi8>>{{.*}}) -> i8 {
integer(1) function iall_test_1(a)
integer(1) :: a(:)
-! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a3:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?xi8>>) -> !fir.box<none>
-! CHECK-DAG: %[[a5:.*]] = fir.convert %[[c0]] : (index) -> i32
-! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
+! CHECK-DAG: %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK-DAG: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[SCOPE]] arg 1 {{.*}} : (!fir.box<!fir.array<?xi8>>, !fir.dscope) -> (!fir.box<!fir.array<?xi8>>, !fir.box<!fir.array<?xi8>>)
+! CHECK-DAG: %[[RES_ALLOCA:.*]] = fir.alloca i8
+! CHECK-DAG: %[[RES_DECL:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {{.*}} : (!fir.ref<i8>) -> (!fir.ref<i8>, !fir.ref<i8>)
+! CHECK-DAG: %[[ABSENT:.*]] = fir.absent !fir.box<i1>
+! CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+! CHECK-DAG: %[[CONV_ARG:.*]] = fir.convert %[[ARG_DECL]]#1 : (!fir.box<!fir.array<?xi8>>) -> !fir.box<none>
+! CHECK-DAG: %[[CONV_C0:.*]] = fir.convert %[[C0]] : (index) -> i32
+! CHECK-DAG: %[[CONV_ABSENT:.*]] = fir.convert %[[ABSENT]] : (!fir.box<i1>) -> !fir.box<none>
iall_test_1 = iall(a)
-! CHECK: %{{.*}} = fir.call @_FortranAIAll1(%[[a3]], %{{.*}}, %{{.*}}, %[[a5]], %[[a6]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i8
+! CHECK: %[[RESULT:.*]] = fir.call @_FortranAIAll1(%[[CONV_ARG]], %{{.*}}, %{{.*}}, %[[CONV_C0]], %[[CONV_ABSENT]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i8
+! CHECK: hlfir.assign %[[RESULT]] to %[[RES_DECL]]#0 : i8, !fir.ref<i8>
end function
! CHECK-LABEL: func @_QPiall_test_2(
-! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?xi16>>{{.*}}) -> i16 {
+! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi16>>{{.*}}) -> i16 {
integer(2) function iall_test_2(a)
integer(2) :: a(:)
-! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a3:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?xi16>>) -> !fir.box<none>
-! CHECK-DAG: %[[a5:.*]] = fir.convert %[[c0]] : (index) -> i32
-! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
+! CHECK-DAG: %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK-DAG: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[SCOPE]] arg 1 {{.*}} : (!fir.box<!fir.array<?xi16>>, !fir.dscope) -> (!fir.box<!fir.array<?xi16>>, !fir.box<!fir.array<?xi16>>)
+! CHECK-DAG: %[[RES_ALLOCA:.*]] = fir.alloca i16
+! CHECK-DAG: %[[RES_DECL:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {{.*}} : (!fir.ref<i16>) -> (!fir.ref<i16>, !fir.ref<i16>)
+! CHECK-DAG: %[[ABSENT:.*]] = fir.absent !fir.box<i1>
+! CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+! CHECK-DAG: %[[CONV_ARG:.*]] = fir.convert %[[ARG_DECL]]#1 : (!fir.box<!fir.array<?xi16>>) -> !fir.box<none>
+! CHECK-DAG: %[[CONV_C0:.*]] = fir.convert %[[C0]] : (index) -> i32
+! CHECK-DAG: %[[CONV_ABSENT:.*]] = fir.convert %[[ABSENT]] : (!fir.box<i1>) -> !fir.box<none>
iall_test_2 = iall(a)
-! CHECK: %{{.*}} = fir.call @_FortranAIAll2(%[[a3]], %{{.*}}, %{{.*}}, %[[a5]], %[[a6]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i16
+! CHECK: %[[RESULT:.*]] = fir.call @_FortranAIAll2(%[[CONV_ARG]], %{{.*}}, %{{.*}}, %[[CONV_C0]], %[[CONV_ABSENT]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i16
+! CHECK: hlfir.assign %[[RESULT]] to %[[RES_DECL]]#0 : i16, !fir.ref<i16>
end function
! CHECK-LABEL: func @_QPiall_test_4(
-! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?xi32>>{{.*}}) -> i32 {
+! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi32>>{{.*}}) -> i32 {
integer function iall_test_4(a)
integer :: a(:)
-! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a3:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
-! CHECK-DAG: %[[a5:.*]] = fir.convert %[[c0]] : (index) -> i32
-! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
+! CHECK-DAG: %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK-DAG: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[SCOPE]] arg 1 {{.*}} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK-DAG: %[[RES_ALLOCA:.*]] = fir.alloca i32
+! CHECK-DAG: %[[RES_DECL:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {{.*}} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK-DAG: %[[ABSENT:.*]] = fir.absent !fir.box<i1>
+! CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+! CHECK-DAG: %[[CONV_ARG:.*]] = fir.convert %[[ARG_DECL]]#1 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
+! CHECK-DAG: %[[CONV_C0:.*]] = fir.convert %[[C0]] : (index) -> i32
+! CHECK-DAG: %[[CONV_ABSENT:.*]] = fir.convert %[[ABSENT]] : (!fir.box<i1>) -> !fir.box<none>
iall_test_4 = iall(a)
-! CHECK: %{{.*}} = fir.call @_FortranAIAll4(%[[a3]], %{{.*}}, %{{.*}}, %[[a5]], %[[a6]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: %[[RESULT:.*]] = fir.call @_FortranAIAll4(%[[CONV_ARG]], %{{.*}}, %{{.*}}, %[[CONV_C0]], %[[CONV_ABSENT]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: hlfir.assign %[[RESULT]] to %[[RES_DECL]]#0 : i32, !fir.ref<i32>
end function
! CHECK-LABEL: func @_QPiall_test_8(
-! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?xi64>>{{.*}}) -> i64 {
+! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi64>>{{.*}}) -> i64 {
integer(8) function iall_test_8(a)
integer(8) :: a(:)
-! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a3:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?xi64>>) -> !fir.box<none>
-! CHECK-DAG: %[[a5:.*]] = fir.convert %[[c0]] : (index) -> i32
-! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
+! CHECK-DAG: %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK-DAG: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[SCOPE]] arg 1 {{.*}} : (!fir.box<!fir.array<?xi64>>, !fir.dscope) -> (!fir.box<!fir.array<?xi64>>, !fir.box<!fir.array<?xi64>>)
+! CHECK-DAG: %[[RES_ALLOCA:.*]] = fir.alloca i64
+! CHECK-DAG: %[[RES_DECL:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {{.*}} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK-DAG: %[[ABSENT:.*]] = fir.absent !fir.box<i1>
+! CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+! CHECK-DAG: %[[CONV_ARG:.*]] = fir.convert %[[ARG_DECL]]#1 : (!fir.box<!fir.array<?xi64>>) -> !fir.box<none>
+! CHECK-DAG: %[[CONV_C0:.*]] = fir.convert %[[C0]] : (index) -> i32
+! CHECK-DAG: %[[CONV_ABSENT:.*]] = fir.convert %[[ABSENT]] : (!fir.box<i1>) -> !fir.box<none>
iall_test_8 = iall(a)
-! CHECK: %{{.*}} = fir.call @_FortranAIAll8(%[[a3]], %{{.*}}, %{{.*}}, %[[a5]], %[[a6]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i64
+! CHECK: %[[RESULT:.*]] = fir.call @_FortranAIAll8(%[[CONV_ARG]], %{{.*}}, %{{.*}}, %[[CONV_C0]], %[[CONV_ABSENT]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i64
+! CHECK: hlfir.assign %[[RESULT]] to %[[RES_DECL]]#0 : i64, !fir.ref<i64>
end function
! CHECK-LABEL: func @_QPiall_test_16(
-! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?xi128>>{{.*}}) -> i128 {
+! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?xi128>>{{.*}}) -> i128 {
integer(16) function iall_test_16(a)
integer(16) :: a(:)
-! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a3:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?xi128>>) -> !fir.box<none>
-! CHECK-DAG: %[[a5:.*]] = fir.convert %[[c0]] : (index) -> i32
-! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
+! CHECK-DAG: %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK-DAG: %[[ARG_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[SCOPE]] arg 1 {{.*}} : (!fir.box<!fir.array<?xi128>>, !fir.dscope) -> (!fir.box<!fir.array<?xi128>>, !fir.box<!fir.array<?xi128>>)
+! CHECK-DAG: %[[RES_ALLOCA:.*]] = fir.alloca i128
+! CHECK-DAG: %[[RES_DECL:.*]]:2 = hlfir.declare %[[RES_ALLOCA]] {{.*}} : (!fir.ref<i128>) -> (!fir.ref<i128>, !fir.ref<i128>)
+! CHECK-DAG: %[[ABSENT:.*]] = fir.absent !fir.box<i1>
+! CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
+! CHECK-DAG: %[[CONV_ARG:.*]] = fir.convert %[[ARG_DECL]]#1 : (!fir.box<!fir.array<?xi128>>) -> !fir.box<none>
+! CHECK-DAG: %[[CONV_C0:.*]] = fir.convert %[[C0]] : (index) -> i32
+! CHECK-DAG: %[[CONV_ABSENT:.*]] = fir.convert %[[ABSENT]] : (!fir.box<i1>) -> !fir.box<none>
iall_test_16 = iall(a)
-! CHECK: %{{.*}} = fir.call @_FortranAIAll16(%[[a3]], %{{.*}}, %{{.*}}, %[[a5]], %[[a6]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i128
+! CHECK: %[[RESULT:.*]] = fir.call @_FortranAIAll16(%[[CONV_ARG]], %{{.*}}, %{{.*}}, %[[CONV_C0]], %[[CONV_ABSENT]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i128
+! CHECK: hlfir.assign %[[RESULT]] to %[[RES_DECL]]#0 : i128, !fir.ref<i128>
end function
! CHECK-LABEL: func @_QPiall_test2(
-! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?x?xi32>>{{.*}}, %[[arg1:.*]]: !fir.box<!fir.array<?xi32>>{{.*}}) {
+! CHECK-SAME: %[[ARG0:.*]]: !fir.box<!fir.array<?x?xi32>>{{.*}}, %[[ARG1:.*]]: !fir.box<!fir.array<?xi32>>{{.*}}) {
subroutine iall_test2(a,r)
integer :: a(:,:)
integer :: r(:)
-! CHECK-DAG: %[[c2_i32:.*]] = arith.constant 2 : i32
-! CHECK-DAG: %[[a0:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>>
-! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a0]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK-DAG: %[[a7:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<none>
-! CHECK-DAG: %[[a9:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
+! CHECK-DAG: %[[BOX_ALLOCA:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>>
+! CHECK-DAG: %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK-DAG: %[[ARG0_DECL:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %[[SCOPE]] arg 1 {{.*}} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?x?xi32>>, !fir.box<!fir.array<?x?xi32>>)
+! CHECK-DAG: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %[[SCOPE]] arg 2 {{.*}} : (!fir.box<!fir.array<?xi32>>, !fir.dscope) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK-DAG: %[[C2:.*]] = arith.constant 2 : i32
+! CHECK-DAG: %[[ABSENT:.*]] = fir.absent !fir.box<i1>
+! CHECK-DAG: %[[CONV_BOX:.*]] = fir.convert %[[BOX_ALLOCA]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
+! CHECK-DAG: %[[CONV_ARG0:.*]] = fir.convert %[[ARG0_DECL]]#1 : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<none>
+! CHECK-DAG: %[[CONV_ABSENT:.*]] = fir.convert %[[ABSENT]] : (!fir.box<i1>) -> !fir.box<none>
r = iall(a,dim=2)
-! CHECK: fir.call @_FortranAIAllDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, !fir.ref<i8>, i32, !fir.box<none>) -> ()
-! CHECK-DAG: %[[a11:.*]] = fir.load %[[a0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-! CHECK-DAG: %[[a13:.*]] = fir.box_addr %[[a11]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
-! CHECK-DAG: fir.freemem %[[a13]]
+! CHECK: fir.call @_FortranAIAllDim(%[[CONV_BOX]], %[[CONV_ARG0]], %[[C2]], %{{.*}}, %{{.*}}, %[[CONV_ABSENT]]) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, !fir.ref<i8>, i32, !fir.box<none>) -> ()
+! CHECK: %[[BOX_LOAD:.*]] = fir.load %[[BOX_ALLOCA]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
+! CHECK: %[[ADDR:.*]] = fir.box_addr %[[BOX_LOAD]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
+! CHECK: %[[TMP_DECL:.*]]:2 = hlfir.declare %[[ADDR]]({{.*}}) {uniq_name = ".tmp.intrinsic_result"} : (!fir.heap<!fir.array<?xi32>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.heap<!fir.array<?xi32>>)
+! CHECK: %[[EXPR:.*]] = hlfir.as_expr %[[TMP_DECL]]#0 move %{{.*}} : (!fir.box<!fir.array<?xi32>>, i1) -> !hlfir.expr<?xi32>
+! CHECK: hlfir.assign %[[EXPR]] to %[[ARG1_DECL]]#0 : !hlfir.expr<?xi32>, !fir.box<!fir.array<?xi32>>
+! CHECK: hlfir.destroy %[[EXPR]] : !hlfir.expr<?xi32>
end subroutine
! CHECK-LABEL: func @_QPiall_test_optional(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>>
+! CHECK-SAME: %[[MASK:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>>{{.*}}, %[[X:.*]]: !fir.box<!fir.array<?xi32>>{{.*}}) -> i32 {
integer function iall_test_optional(mask, x)
integer :: x(:)
logical, optional :: mask(:)
+! CHECK: %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i32
+! CHECK: %[[RES_DECL:.*]]:2 = hlfir.declare %[[RES_ALLOCA]]
+! CHECK: %[[MASK_DECL:.*]]:2 = hlfir.declare %[[MASK]] dummy_scope %[[SCOPE]] arg 1 {fortran_attrs = #fir.var_attrs<optional>, {{.*}}}
+! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %[[SCOPE]] arg 2 {{.*}}
iall_test_optional = iall(x, mask=mask)
-! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_0]] : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.box<none>
-! CHECK: fir.call @_FortranAIAll4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_9]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: %[[CONV_X:.*]] = fir.convert %[[X_DECL]]#1 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
+! CHECK: %[[CONV_MASK:.*]] = fir.convert %[[MASK_DECL]]#1 : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.box<none>
+! CHECK: %[[RESULT:.*]] = fir.call @_FortranAIAll4(%[[CONV_X]], %{{.*}}, %{{.*}}, %{{.*}}, %[[CONV_MASK]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: hlfir.assign %[[RESULT]] to %[[RES_DECL]]#0 : i32, !fir.ref<i32>
end function
! CHECK-LABEL: func @_QPiall_test_optional_2(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>
+! CHECK-SAME: %[[MASK:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>{{.*}}, %[[X:.*]]: !fir.box<!fir.array<?xi32>>{{.*}}) -> i32 {
integer function iall_test_optional_2(mask, x)
integer :: x(:)
logical, pointer :: mask(:)
+! CHECK: %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i32
+! CHECK: %[[RES_DECL:.*]]:2 = hlfir.declare %[[RES_ALLOCA]]
+! CHECK: %[[MASK_DECL:.*]]:2 = hlfir.declare %[[MASK]] dummy_scope %[[SCOPE]] arg 1 {fortran_attrs = #fir.var_attrs<pointer>, {{.*}}}
+! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %[[SCOPE]] arg 2 {{.*}}
iall_test_optional_2 = iall(x, mask=mask)
-! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>
-! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>) -> !fir.ptr<!fir.array<?x!fir.logical<4>>>
-! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.ptr<!fir.array<?x!fir.logical<4>>>) -> i64
-! CHECK: %[[VAL_7:.*]] = arith.constant 0 : i64
-! CHECK: %[[VAL_8:.*]] = arith.cmpi ne, %[[VAL_6]], %[[VAL_7]] : i64
-! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>
-! CHECK: %[[VAL_10:.*]] = fir.absent !fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>
-! CHECK: %[[VAL_11:.*]] = arith.select %[[VAL_8]], %[[VAL_9]], %[[VAL_10]] : !fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>
-! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>) -> !fir.box<none>
-! CHECK: fir.call @_FortranAIAll4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_18]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: %[[MASK_LOAD1:.*]] = fir.load %[[MASK_DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>
+! CHECK: %[[MASK_ADDR:.*]] = fir.box_addr %[[MASK_LOAD1]] : (!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>) -> !fir.ptr<!fir.array<?x!fir.logical<4>>>
+! CHECK: %[[MASK_PTR_CONV:.*]] = fir.convert %[[MASK_ADDR]] : (!fir.ptr<!fir.array<?x!fir.logical<4>>>) -> i64
+! CHECK: %[[C0_I64:.*]] = arith.constant 0 : i64
+! CHECK: %[[IS_PRESENT:.*]] = arith.cmpi ne, %[[MASK_PTR_CONV]], %[[C0_I64]] : i64
+! CHECK: %[[MASK_LOAD2:.*]] = fir.load %[[MASK_DECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>
+! CHECK: %[[ABSENT:.*]] = fir.absent !fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>
+! CHECK: %[[SELECTED:.*]] = arith.select %[[IS_PRESENT]], %[[MASK_LOAD2]], %[[ABSENT]] : !fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>
+! CHECK: %[[CONV_X:.*]] = fir.convert %[[X_DECL]]#1 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
+! CHECK: %[[CONV_MASK:.*]] = fir.convert %[[SELECTED]] : (!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>) -> !fir.box<none>
+! CHECK: %[[RESULT:.*]] = fir.call @_FortranAIAll4(%[[CONV_X]], %{{.*}}, %{{.*}}, %{{.*}}, %[[CONV_MASK]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: hlfir.assign %[[RESULT]] to %[[RES_DECL]]#0 : i32, !fir.ref<i32>
end function
! CHECK-LABEL: func @_QPiall_test_optional_3(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<10x!fir.logical<4>>>
+! CHECK-SAME: %[[MASK:.*]]: !fir.ref<!fir.array<10x!fir.logical<4>>>{{.*}}, %[[X:.*]]: !fir.box<!fir.array<?xi32>>{{.*}}) -> i32 {
integer function iall_test_optional_3(mask, x)
integer :: x(:)
logical, optional :: mask(10)
+! CHECK: %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i32
+! CHECK: %[[RES_DECL:.*]]:2 = hlfir.declare %[[RES_ALLOCA]]
+! CHECK: %[[MASK_DECL:.*]]:2 = hlfir.declare %[[MASK]]({{.*}}) dummy_scope %[[SCOPE]] arg 1 {fortran_attrs = #fir.var_attrs<optional>, {{.*}}}
+! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %[[SCOPE]] arg 2 {{.*}}
iall_test_optional_3 = iall(x, mask=mask)
-! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index
-! CHECK: %[[VAL_5:.*]] = fir.is_present %[[VAL_0]] : (!fir.ref<!fir.array<10x!fir.logical<4>>>) -> i1
-! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_0]](%[[VAL_6]]) : (!fir.ref<!fir.array<10x!fir.logical<4>>>, !fir.shape<1>) -> !fir.box<!fir.array<10x!fir.logical<4>>>
-! CHECK: %[[VAL_8:.*]] = fir.absent !fir.box<!fir.array<10x!fir.logical<4>>>
-! CHECK: %[[VAL_9:.*]] = arith.select %[[VAL_5]], %[[VAL_7]], %[[VAL_8]] : !fir.box<!fir.array<10x!fir.logical<4>>>
-! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_9]] : (!fir.box<!fir.array<10x!fir.logical<4>>>) -> !fir.box<none>
-! CHECK: fir.call @_FortranAIAll4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_18]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: %[[IS_PRESENT:.*]] = fir.is_present %[[MASK_DECL]]#0 : (!fir.ref<!fir.array<10x!fir.logical<4>>>) -> i1
+! CHECK: %[[EMBOX:.*]] = fir.embox %[[MASK_DECL]]#0(%{{.*}}) : (!fir.ref<!fir.array<10x!fir.logical<4>>>, !fir.shape<1>) -> !fir.box<!fir.array<10x!fir.logical<4>>>
+! CHECK: %[[ABSENT:.*]] = fir.absent !fir.box<!fir.array<10x!fir.logical<4>>>
+! CHECK: %[[SELECTED:.*]] = arith.select %[[IS_PRESENT]], %[[EMBOX]], %[[ABSENT]] : !fir.box<!fir.array<10x!fir.logical<4>>>
+! CHECK: %[[CONV_X:.*]] = fir.convert %[[X_DECL]]#1 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
+! CHECK: %[[CONV_MASK:.*]] = fir.convert %[[SELECTED]] : (!fir.box<!fir.array<10x!fir.logical<4>>>) -> !fir.box<none>
+! CHECK: %[[RESULT:.*]] = fir.call @_FortranAIAll4(%[[CONV_X]], %{{.*}}, %{{.*}}, %{{.*}}, %[[CONV_MASK]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: hlfir.assign %[[RESULT]] to %[[RES_DECL]]#0 : i32, !fir.ref<i32>
end function
! CHECK-LABEL: func @_QPiall_test_optional_4(
+! CHECK-SAME: %[[X:.*]]: !fir.box<!fir.array<?xi32>>{{.*}}, %[[USE_MASK:.*]]: !fir.ref<!fir.logical<4>>{{.*}}) -> i32 {
integer function iall_test_optional_4(x, use_mask)
! Test that local allocatable tracked in local variables
! are dealt as optional argument correctly.
integer :: x(:)
logical :: use_mask
logical, allocatable :: mask(:)
+! CHECK: %[[SCOPE:.*]] = fir.dummy_scope : !fir.dscope
+! CHECK: %[[RES_ALLOCA:.*]] = fir.alloca i32
+! CHECK: %[[RES_DECL:.*]]:2 = hlfir.declare %[[RES_ALLOCA]]
+! CHECK: %[[MASK_ALLOCA:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>
+! CHECK: %[[MASK_DECL:.*]]:2 = hlfir.declare %[[MASK_ALLOCA]] {fortran_attrs = #fir.var_attrs<allocatable>, {{.*}}}
+! CHECK: %[[USE_MASK_DECL:.*]]:2 = hlfir.declare %[[USE_MASK]] dummy_scope %[[SCOPE]] arg 2 {{.*}}
+! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] dummy_scope %[[SCOPE]] arg 1 {{.*}}
if (use_mask) then
allocate(mask(size(x, 1)))
call set_mask(mask)
- ! CHECK: fir.call @_QPset_mask
+ ! CHECK: fir.call @_QPset_mask(%{{.*}}) fastmath<contract> : (!fir.ref<!fir.array<?x!fir.logical<4>>>) -> ()
end if
iall_test_optional_4 = iall(x, mask=mask)
-! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_3:.*]] : !fir.ref<!fir.heap<!fir.array<?x!fir.logical<4>>>>
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (!fir.heap<!fir.array<?x!fir.logical<4>>>) -> i64
-! CHECK: %[[VAL_22:.*]] = arith.constant 0 : i64
-! CHECK: %[[VAL_23:.*]] = arith.cmpi ne, %[[VAL_21]], %[[VAL_22]] : i64
-! CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_4:.*]] : !fir.ref<index>
-! CHECK: %[[VAL_25:.*]] = fir.load %[[VAL_5:.*]] : !fir.ref<index>
-! CHECK: %[[VAL_26:.*]] = fir.load %[[VAL_3]] : !fir.ref<!fir.heap<!fir.array<?x!fir.logical<4>>>>
-! CHECK: %[[VAL_27:.*]] = fir.shape_shift %[[VAL_24]], %[[VAL_25]] : (index, index) -> !fir.shapeshift<1>
-! CHECK: %[[VAL_28:.*]] = fir.embox %[[VAL_26]](%[[VAL_27]]) : (!fir.heap<!fir.array<?x!fir.logical<4>>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<?x!fir.logical<4>>>
-! CHECK: %[[VAL_29:.*]] = fir.absent !fir.box<!fir.array<?x!fir.logical<4>>>
-! CHECK: %[[VAL_30:.*]] = arith.select %[[VAL_23]], %[[VAL_28]], %[[VAL_29]] : !fir.box<!fir.array<?x!fir.logical<4>>>
-! CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_30]] : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.box<none>
-! CHECK: fir.call @_FortranAIAll4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_37]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: %[[MASK_LOAD1:.*]] = fir.load %[[MASK_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>
+! CHECK: %[[MASK_ADDR1:.*]] = fir.box_addr %[[MASK_LOAD1]] : (!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>) -> !fir.heap<!fir.array<?x!fir.logical<4>>>
+! CHECK: %[[MASK_PTR_CONV:.*]] = fir.convert %[[MASK_ADDR1]] : (!fir.heap<!fir.array<?x!fir.logical<4>>>) -> i64
+! CHECK: %[[C0_I64:.*]] = arith.constant 0 : i64
+! CHECK: %[[IS_ALLOCATED:.*]] = arith.cmpi ne, %[[MASK_PTR_CONV]], %[[C0_I64]] : i64
+! CHECK: %[[MASK_LOAD2:.*]] = fir.load %[[MASK_DECL]]#0 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>
+! CHECK: %[[MASK_ADDR2:.*]] = fir.box_addr %[[MASK_LOAD2]] : (!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>) -> !fir.heap<!fir.array<?x!fir.logical<4>>>
+! CHECK: %[[EMBOX:.*]] = fir.embox %[[MASK_ADDR2]](%{{.*}}) : (!fir.heap<!fir.array<?x!fir.logical<4>>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<?x!fir.logical<4>>>
+! CHECK: %[[ABSENT:.*]] = fir.absent !fir.box<!fir.array<?x!fir.logical<4>>>
+! CHECK: %[[SELECTED:.*]] = arith.select %[[IS_ALLOCATED]], %[[EMBOX]], %[[ABSENT]] : !fir.box<!fir.array<?x!fir.logical<4>>>
+! CHECK: %[[CONV_X:.*]] = fir.convert %[[X_DECL]]#1 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
+! CHECK: %[[CONV_MASK:.*]] = fir.convert %[[SELECTED]] : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.box<none>
+! CHECK: %[[RESULT:.*]] = fir.call @_FortranAIAll4(%[[CONV_X]], %{{.*}}, %{{.*}}, %{{.*}}, %[[CONV_MASK]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: hlfir.assign %[[RESULT]] to %[[RES_DECL]]#0 : i32, !fir.ref<i32>
end function
diff --git a/flang/test/Lower/Intrinsics/iand.f90 b/flang/test/Lower/Intrinsics/iand.f90
index 0954948a62af4..1014183172dc7 100644
--- a/flang/test/Lower/Intrinsics/iand.f90
+++ b/flang/test/Lower/Intrinsics/iand.f90
@@ -1,77 +1,97 @@
-! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s
-! CHECK-LABEL: iand_test
-! CHECK-SAME: %[[A:.*]]: !fir.ref<i32>{{.*}}, %[[B:.*]]: !fir.ref<i32>{{.*}}, %[[C:.*]]: !fir.ref<i32>{{.*}}
+! CHECK-LABEL: func.func @_QPiand_test(
+! CHECK-SAME: %[[A_ARG:.*]]: !fir.ref<i32>{{.*}}, %[[B_ARG:.*]]: !fir.ref<i32>{{.*}}, %[[C_ARG:.*]]: !fir.ref<i32>{{.*}})
subroutine iand_test(a, b, c)
integer :: a, b, c
-! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref<i32>
-! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref<i32>
+! CHECK-DAG: %[[A:.*]]:2 = hlfir.declare %[[A_ARG]]
+! CHECK-DAG: %[[B:.*]]:2 = hlfir.declare %[[B_ARG]]
+! CHECK-DAG: %[[C:.*]]:2 = hlfir.declare %[[C_ARG]]
+! CHECK-DAG: %[[A_VAL:.*]] = fir.load %[[A]]#0 : !fir.ref<i32>
+! CHECK-DAG: %[[B_VAL:.*]] = fir.load %[[B]]#0 : !fir.ref<i32>
c = iand(a, b)
! CHECK: %[[C_VAL:.*]] = arith.andi %[[A_VAL]], %[[B_VAL]] : i32
-! CHECK: fir.store %[[C_VAL]] to %[[C]] : !fir.ref<i32>
+! CHECK: hlfir.assign %[[C_VAL]] to %[[C]]#0 : i32, !fir.ref<i32>
end subroutine iand_test
-! CHECK-LABEL: iand_test1
-! CHECK-SAME: %[[A:.*]]: !fir.ref<i8>{{.*}}, %[[B:.*]]: !fir.ref<i8>{{.*}}, %[[C:.*]]: !fir.ref<i8>{{.*}}
+! CHECK-LABEL: func.func @_QPiand_test1(
+! CHECK-SAME: %[[A_ARG:.*]]: !fir.ref<i8>{{.*}}, %[[B_ARG:.*]]: !fir.ref<i8>{{.*}}, %[[C_ARG:.*]]: !fir.ref<i8>{{.*}})
subroutine iand_test1(a, b, c)
integer(kind=1) :: a, b, c
-! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref<i8>
-! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref<i8>
+! CHECK-DAG: %[[A:.*]]:2 = hlfir.declare %[[A_ARG]]
+! CHECK-DAG: %[[B:.*]]:2 = hlfir.declare %[[B_ARG]]
+! CHECK-DAG: %[[C:.*]]:2 = hlfir.declare %[[C_ARG]]
+! CHECK-DAG: %[[A_VAL:.*]] = fir.load %[[A]]#0 : !fir.ref<i8>
+! CHECK-DAG: %[[B_VAL:.*]] = fir.load %[[B]]#0 : !fir.ref<i8>
c = iand(a, b)
! CHECK: %[[C_VAL:.*]] = arith.andi %[[A_VAL]], %[[B_VAL]] : i8
-! CHECK: fir.store %[[C_VAL]] to %[[C]] : !fir.ref<i8>
+! CHECK: hlfir.assign %[[C_VAL]] to %[[C]]#0 : i8, !fir.ref<i8>
end subroutine iand_test1
-! CHECK-LABEL: iand_test2
-! CHECK-SAME: %[[A:.*]]: !fir.ref<i16>{{.*}}, %[[B:.*]]: !fir.ref<i16>{{.*}}, %[[C:.*]]: !fir.ref<i16>{{.*}}
+! CHECK-LABEL: func.func @_QPiand_test2(
+! CHECK-SAME: %[[A_ARG:.*]]: !fir.ref<i16>{{.*}}, %[[B_ARG:.*]]: !fir.ref<i16>{{.*}}, %[[C_ARG:.*]]: !fir.ref<i16>{{.*}})
subroutine iand_test2(a, b, c)
integer(kind=2) :: a, b, c
-! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref<i16>
-! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref<i16>
+! CHECK-DAG: %[[A:.*]]:2 = hlfir.declare %[[A_ARG]]
+! CHECK-DAG: %[[B:.*]]:2 = hlfir.declare %[[B_ARG]]
+! CHECK-DAG: %[[C:.*]]:2 = hlfir.declare %[[C_ARG]]
+! CHECK-DAG: %[[A_VAL:.*]] = fir.load %[[A]]#0 : !fir.ref<i16>
+! CHECK-DAG: %[[B_VAL:.*]] = fir.load %[[B]]#0 : !fir.ref<i16>
c = iand(a, b)
! CHECK: %[[C_VAL:.*]] = arith.andi %[[A_VAL]], %[[B_VAL]] : i16
-! CHECK: fir.store %[[C_VAL]] to %[[C]] : !fir.ref<i16>
+! CHECK: hlfir.assign %[[C_VAL]] to %[[C]]#0 : i16, !fir.ref<i16>
end subroutine iand_test2
-! CHECK-LABEL: iand_test3
-! CHECK-SAME: %[[A:.*]]: !fir.ref<i32>{{.*}}, %[[B:.*]]: !fir.ref<i32>{{.*}}, %[[C:.*]]: !fir.ref<i32>{{.*}}
+! CHECK-LABEL: func.func @_QPiand_test3(
+! CHECK-SAME: %[[A_ARG:.*]]: !fir.ref<i32>{{.*}}, %[[B_ARG:.*]]: !fir.ref<i32>{{.*}}, %[[C_ARG:.*]]: !fir.ref<i32>{{.*}})
subroutine iand_test3(a, b, c)
integer(kind=4) :: a, b, c
-! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref<i32>
-! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref<i32>
+! CHECK-DAG: %[[A:.*]]:2 = hlfir.declare %[[A_ARG]]
+! CHECK-DAG: %[[B:.*]]:2 = hlfir.declare %[[B_ARG]]
+! CHECK-DAG: %[[C:.*]]:2 = hlfir.declare %[[C_ARG]]
+! CHECK-DAG: %[[A_VAL:.*]] = fir.load %[[A]]#0 : !fir.ref<i32>
+! CHECK-DAG: %[[B_VAL:.*]] = fir.load %[[B]]#0 : !fir.ref<i32>
c = iand(a, b)
! CHECK: %[[C_VAL:.*]] = arith.andi %[[A_VAL]], %[[B_VAL]] : i32
-! CHECK: fir.store %[[C_VAL]] to %[[C]] : !fir.ref<i32>
+! CHECK: hlfir.assign %[[C_VAL]] to %[[C]]#0 : i32, !fir.ref<i32>
end subroutine iand_test3
-! CHECK-LABEL: iand_test4
-! CHECK-SAME: %[[A:.*]]: !fir.ref<i64>{{.*}}, %[[B:.*]]: !fir.ref<i64>{{.*}}, %[[C:.*]]: !fir.ref<i64>{{.*}}
+! CHECK-LABEL: func.func @_QPiand_test4(
+! CHECK-SAME: %[[A_ARG:.*]]: !fir.ref<i64>{{.*}}, %[[B_ARG:.*]]: !fir.ref<i64>{{.*}}, %[[C_ARG:.*]]: !fir.ref<i64>{{.*}})
subroutine iand_test4(a, b, c)
integer(kind=8) :: a, b, c
-! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref<i64>
-! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref<i64>
+! CHECK-DAG: %[[A:.*]]:2 = hlfir.declare %[[A_ARG]]
+! CHECK-DAG: %[[B:.*]]:2 = hlfir.declare %[[B_ARG]]
+! CHECK-DAG: %[[C:.*]]:2 = hlfir.declare %[[C_ARG]]
+! CHECK-DAG: %[[A_VAL:.*]] = fir.load %[[A]]#0 : !fir.ref<i64>
+! CHECK-DAG: %[[B_VAL:.*]] = fir.load %[[B]]#0 : !fir.ref<i64>
c = iand(a, b)
! CHECK: %[[C_VAL:.*]] = arith.andi %[[A_VAL]], %[[B_VAL]] : i64
-! CHECK: fir.store %[[C_VAL]] to %[[C]] : !fir.ref<i64>
+! CHECK: hlfir.assign %[[C_VAL]] to %[[C]]#0 : i64, !fir.ref<i64>
end subroutine iand_test4
-! CHECK-LABEL: iand_test5
-! CHECK-SAME: %[[A:.*]]: !fir.ref<i128>{{.*}}, %[[B:.*]]: !fir.ref<i128>{{.*}}, %[[C:.*]]: !fir.ref<i128>{{.*}}
+! CHECK-LABEL: func.func @_QPiand_test5(
+! CHECK-SAME: %[[A_ARG:.*]]: !fir.ref<i128>{{.*}}, %[[B_ARG:.*]]: !fir.ref<i128>{{.*}}, %[[C_ARG:.*]]: !fir.ref<i128>{{.*}})
subroutine iand_test5(a, b, c)
integer(kind=16) :: a, b, c
-! CHECK: %[[A_VAL:.*]] = fir.load %[[A]] : !fir.ref<i128>
-! CHECK: %[[B_VAL:.*]] = fir.load %[[B]] : !fir.ref<i128>
+! CHECK-DAG: %[[A:.*]]:2 = hlfir.declare %[[A_ARG]]
+! CHECK-DAG: %[[B:.*]]:2 = hlfir.declare %[[B_ARG]]
+! CHECK-DAG: %[[C:.*]]:2 = hlfir.declare %[[C_ARG]]
+! CHECK-DAG: %[[A_VAL:.*]] = fir.load %[[A]]#0 : !fir.ref<i128>
+! CHECK-DAG: %[[B_VAL:.*]] = fir.load %[[B]]#0 : !fir.ref<i128>
c = iand(a, b)
! CHECK: %[[C_VAL:.*]] = arith.andi %[[A_VAL]], %[[B_VAL]] : i128
-! CHECK: fir.store %[[C_VAL]] to %[[C]] : !fir.ref<i128>
+! CHECK: hlfir.assign %[[C_VAL]] to %[[C]]#0 : i128, !fir.ref<i128>
end subroutine iand_test5
-! CHECK-LABEL: iand_test6
-! CHECK-SAME: %[[S1:.*]]: !fir.ref<i32>{{.*}}, %[[S2:.*]]: !fir.ref<i32>{{.*}}
+! CHECK-LABEL: func.func @_QPiand_test6(
+! CHECK-SAME: %[[S1_ARG:.*]]: !fir.ref<i32>{{.*}}, %[[S2_ARG:.*]]: !fir.ref<i32>{{.*}})
subroutine iand_test6(s1, s2)
integer :: s1, s2
-! CHECK-DAG: %[[S1_VAL:.*]] = fir.load %[[S1]] : !fir.ref<i32>
-! CHECK-DAG: %[[S2_VAL:.*]] = fir.load %[[S2]] : !fir.ref<i32>
+! CHECK-DAG: %[[S1:.*]]:2 = hlfir.declare %[[S1_ARG]]
+! CHECK-DAG: %[[S2:.*]]:2 = hlfir.declare %[[S2_ARG]]
+! CHECK-DAG: %[[S1_VAL:.*]] = fir.load %[[S1]]#0 : !fir.ref<i32>
+! CHECK-DAG: %[[S2_VAL:.*]] = fir.load %[[S2]]#0 : !fir.ref<i32>
stop iand(s1,s2)
! CHECK-DAG: %[[ANDI:.*]] = arith.andi %[[S1_VAL]], %[[S2_VAL]] : i32
! CHECK: fir.call @_FortranAStopStatement(%[[ANDI]], {{.*}}, {{.*}}) {{.*}}: (i32, i1, i1) -> ()
diff --git a/flang/test/Lower/Intrinsics/iany.f90 b/flang/test/Lower/Intrinsics/iany.f90
index 3b9036bb670fe..c3c57b55b15a5 100644
--- a/flang/test/Lower/Intrinsics/iany.f90
+++ b/flang/test/Lower/Intrinsics/iany.f90
@@ -1,156 +1,127 @@
-! RUN: bbc --use-desc-for-alloc=false -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s
-! CHECK-LABEL: func @_QPiany_test_1(
+! CHECK-LABEL: func.func @_QPiany_test_1(
! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?xi8>>{{.*}}) -> i8 {
integer(1) function iany_test_1(a)
integer(1) :: a(:)
-! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a3:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?xi8>>) -> !fir.box<none>
-! CHECK-DAG: %[[a5:.*]] = fir.convert %[[c0]] : (index) -> i32
-! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
+! CHECK: %[[A:.*]]:2 = hlfir.declare %[[arg0]]
iany_test_1 = iany(a)
-! CHECK: %{{.*}} = fir.call @_FortranAIAny1(%[[a3]], %{{.*}}, %{{.*}}, %[[a5]], %[[a6]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i8
+! CHECK: %[[A_NONE:.*]] = fir.convert %[[A]]#1 : (!fir.box<!fir.array<?xi8>>) -> !fir.box<none>
+! CHECK: %{{.*}} = fir.call @_FortranAIAny1(%[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i8
end function
-! CHECK-LABEL: func @_QPiany_test_2(
+! CHECK-LABEL: func.func @_QPiany_test_2(
! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?xi16>>{{.*}}) -> i16 {
integer(2) function iany_test_2(a)
integer(2) :: a(:)
-! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a3:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?xi16>>) -> !fir.box<none>
-! CHECK-DAG: %[[a5:.*]] = fir.convert %[[c0]] : (index) -> i32
-! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
+! CHECK: %[[A:.*]]:2 = hlfir.declare %[[arg0]]
iany_test_2 = iany(a)
-! CHECK: %{{.*}} = fir.call @_FortranAIAny2(%[[a3]], %{{.*}}, %{{.*}}, %[[a5]], %[[a6]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i16
+! CHECK: %[[A_NONE:.*]] = fir.convert %[[A]]#1 : (!fir.box<!fir.array<?xi16>>) -> !fir.box<none>
+! CHECK: %{{.*}} = fir.call @_FortranAIAny2(%[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i16
end function
-! CHECK-LABEL: func @_QPiany_test_4(
+! CHECK-LABEL: func.func @_QPiany_test_4(
! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?xi32>>{{.*}}) -> i32 {
integer function iany_test_4(a)
integer :: a(:)
-! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a3:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
-! CHECK-DAG: %[[a5:.*]] = fir.convert %[[c0]] : (index) -> i32
-! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
+! CHECK: %[[A:.*]]:2 = hlfir.declare %[[arg0]]
iany_test_4 = iany(a)
-! CHECK: %{{.*}} = fir.call @_FortranAIAny4(%[[a3]], %{{.*}}, %{{.*}}, %[[a5]], %[[a6]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: %[[A_NONE:.*]] = fir.convert %[[A]]#1 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
+! CHECK: %{{.*}} = fir.call @_FortranAIAny4(%[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
end function
-! CHECK-LABEL: func @_QPiany_test_8(
+! CHECK-LABEL: func.func @_QPiany_test_8(
! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?xi64>>{{.*}}) -> i64 {
integer(8) function iany_test_8(a)
integer(8) :: a(:)
-! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a3:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?xi64>>) -> !fir.box<none>
-! CHECK-DAG: %[[a5:.*]] = fir.convert %[[c0]] : (index) -> i32
-! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
+! CHECK: %[[A:.*]]:2 = hlfir.declare %[[arg0]]
iany_test_8 = iany(a)
-! CHECK: %{{.*}} = fir.call @_FortranAIAny8(%[[a3]], %{{.*}}, %{{.*}}, %[[a5]], %[[a6]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i64
+! CHECK: %[[A_NONE:.*]] = fir.convert %[[A]]#1 : (!fir.box<!fir.array<?xi64>>) -> !fir.box<none>
+! CHECK: %{{.*}} = fir.call @_FortranAIAny8(%[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i64
end function
-! CHECK-LABEL: func @_QPiany_test_16(
+! CHECK-LABEL: func.func @_QPiany_test_16(
! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?xi128>>{{.*}}) -> i128 {
integer(16) function iany_test_16(a)
integer(16) :: a(:)
-! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
-! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a3:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?xi128>>) -> !fir.box<none>
-! CHECK-DAG: %[[a5:.*]] = fir.convert %[[c0]] : (index) -> i32
-! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
+! CHECK: %[[A:.*]]:2 = hlfir.declare %[[arg0]]
iany_test_16 = iany(a)
-! CHECK: %{{.*}} = fir.call @_FortranAIAny16(%[[a3]], %{{.*}}, %{{.*}}, %[[a5]], %[[a6]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i128
+! CHECK: %[[A_NONE:.*]] = fir.convert %[[A]]#1 : (!fir.box<!fir.array<?xi128>>) -> !fir.box<none>
+! CHECK: %{{.*}} = fir.call @_FortranAIAny16(%[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i128
end function
-! CHECK-LABEL: func @_QPiany_test2(
+! CHECK-LABEL: func.func @_QPiany_test2(
! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?x?xi32>>{{.*}}, %[[arg1:.*]]: !fir.box<!fir.array<?xi32>>{{.*}}) {
subroutine iany_test2(a,r)
integer :: a(:,:)
integer :: r(:)
-! CHECK-DAG: %[[c2_i32:.*]] = arith.constant 2 : i32
-! CHECK-DAG: %[[a0:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>>
-! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a0]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK-DAG: %[[a7:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<none>
-! CHECK-DAG: %[[a9:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
+! CHECK: %[[A:.*]]:2 = hlfir.declare %[[arg0]]
+! CHECK: %[[R:.*]]:2 = hlfir.declare %[[arg1]]
+! CHECK: %[[A_NONE:.*]] = fir.convert %[[A]]#1 : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<none>
r = iany(a,dim=2)
-! CHECK: fir.call @_FortranAIAnyDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, !fir.ref<i8>, i32, !fir.box<none>) -> ()
-! CHECK-DAG: %[[a11:.*]] = fir.load %[[a0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-! CHECK-DAG: %[[a13:.*]] = fir.box_addr %[[a11]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
-! CHECK-DAG: fir.freemem %[[a13]]
+! CHECK: fir.call @_FortranAIAnyDim(%{{.*}}, %[[A_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) fastmath<contract> : (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, !fir.ref<i8>, i32, !fir.box<none>) -> ()
end subroutine
-! CHECK-LABEL: func @_QPiany_test_optional(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>>
+! CHECK-LABEL: func.func @_QPiany_test_optional(
+! CHECK-SAME: %[[MASK_ARG:.*]]: !fir.box<!fir.array<?x!fir.logical<4>>>{{.*}}, %[[X_ARG:.*]]: !fir.box<!fir.array<?xi32>>{{.*}})
integer function iany_test_optional(mask, x)
integer :: x(:)
logical, optional :: mask(:)
+! CHECK: %[[MASK:.*]]:2 = hlfir.declare %[[MASK_ARG]]
+! CHECK: %[[X:.*]]:2 = hlfir.declare %[[X_ARG]]
iany_test_optional = iany(x, mask=mask)
-! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_0]] : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.box<none>
-! CHECK: fir.call @_FortranAIAny4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_9]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: %[[X_NONE:.*]] = fir.convert %[[X]]#1 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
+! CHECK: %[[MASK_NONE:.*]] = fir.convert %[[MASK]]#1 : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.box<none>
+! CHECK: fir.call @_FortranAIAny4(%[[X_NONE]], %{{.*}}, %{{.*}}, %{{.*}}, %[[MASK_NONE]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
end function
-! CHECK-LABEL: func @_QPiany_test_optional_2(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>
+! CHECK-LABEL: func.func @_QPiany_test_optional_2(
+! CHECK-SAME: %[[MASK_ARG:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>{{.*}})
integer function iany_test_optional_2(mask, x)
integer :: x(:)
logical, pointer :: mask(:)
+! CHECK: %[[MASK:.*]]:2 = hlfir.declare %[[MASK_ARG]]
iany_test_optional_2 = iany(x, mask=mask)
-! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>
-! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>) -> !fir.ptr<!fir.array<?x!fir.logical<4>>>
-! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.ptr<!fir.array<?x!fir.logical<4>>>) -> i64
-! CHECK: %[[VAL_7:.*]] = arith.constant 0 : i64
-! CHECK: %[[VAL_8:.*]] = arith.cmpi ne, %[[VAL_6]], %[[VAL_7]] : i64
-! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>
-! CHECK: %[[VAL_10:.*]] = fir.absent !fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>
-! CHECK: %[[VAL_11:.*]] = arith.select %[[VAL_8]], %[[VAL_9]], %[[VAL_10]] : !fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>
-! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>) -> !fir.box<none>
-! CHECK: fir.call @_FortranAIAny4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_18]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: %[[LOAD_MASK:.*]] = fir.load %[[MASK]]#0
+! CHECK: %[[MASK_ADDR:.*]] = fir.box_addr %[[LOAD_MASK]]
+! CHECK: %{{.*}} = arith.cmpi ne, %{{.*}}, %{{.*}} : i64
+! CHECK: %[[SEL_MASK:.*]] = arith.select %{{.*}}, %{{.*}}, %{{.*}} : !fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>
+! CHECK: %[[MASK_NONE:.*]] = fir.convert %[[SEL_MASK]] : (!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>) -> !fir.box<none>
+! CHECK: fir.call @_FortranAIAny4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[MASK_NONE]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
end function
-! CHECK-LABEL: func @_QPiany_test_optional_3(
-! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.array<10x!fir.logical<4>>>
+! CHECK-LABEL: func.func @_QPiany_test_optional_3(
+! CHECK-SAME: %[[MASK_ARG:.*]]: !fir.ref<!fir.array<10x!fir.logical<4>>>
integer function iany_test_optional_3(mask, x)
integer :: x(:)
logical, optional :: mask(10)
+! CHECK: %[[MASK:.*]]:2 = hlfir.declare %[[MASK_ARG]]
iany_test_optional_3 = iany(x, mask=mask)
-! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index
-! CHECK: %[[VAL_5:.*]] = fir.is_present %[[VAL_0]] : (!fir.ref<!fir.array<10x!fir.logical<4>>>) -> i1
-! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_0]](%[[VAL_6]]) : (!fir.ref<!fir.array<10x!fir.logical<4>>>, !fir.shape<1>) -> !fir.box<!fir.array<10x!fir.logical<4>>>
-! CHECK: %[[VAL_8:.*]] = fir.absent !fir.box<!fir.array<10x!fir.logical<4>>>
-! CHECK: %[[VAL_9:.*]] = arith.select %[[VAL_5]], %[[VAL_7]], %[[VAL_8]] : !fir.box<!fir.array<10x!fir.logical<4>>>
-! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_9]] : (!fir.box<!fir.array<10x!fir.logical<4>>>) -> !fir.box<none>
-! CHECK: fir.call @_FortranAIAny4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_18]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: %[[PRESENT:.*]] = fir.is_present %[[MASK]]#0
+! CHECK: %[[EMBOX:.*]] = fir.embox %[[MASK]]#0
+! CHECK: %[[SEL_MASK:.*]] = arith.select %[[PRESENT]], %[[EMBOX]], %{{.*}} : !fir.box<!fir.array<10x!fir.logical<4>>>
+! CHECK: %[[MASK_NONE:.*]] = fir.convert %[[SEL_MASK]] : (!fir.box<!fir.array<10x!fir.logical<4>>>) -> !fir.box<none>
+! CHECK: fir.call @_FortranAIAny4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[MASK_NONE]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
end function
-! CHECK-LABEL: func @_QPiany_test_optional_4(
+! CHECK-LABEL: func.func @_QPiany_test_optional_4(
integer function iany_test_optional_4(x, use_mask)
! Test that local allocatables tracked in local variables
! are handled correctly as optional arguments.
integer :: x(:)
logical :: use_mask
logical, allocatable :: mask(:)
+! CHECK: %[[MASK:.*]]:2 = hlfir.declare %{{.*}}mask"}
if (use_mask) then
allocate(mask(size(x, 1)))
call set_mask(mask)
! CHECK: fir.call @_QPset_mask
end if
iany_test_optional_4 = iany(x, mask=mask)
-! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_3:.*]] : !fir.ref<!fir.heap<!fir.array<?x!fir.logical<4>>>>
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (!fir.heap<!fir.array<?x!fir.logical<4>>>) -> i64
-! CHECK: %[[VAL_22:.*]] = arith.constant 0 : i64
-! CHECK: %[[VAL_23:.*]] = arith.cmpi ne, %[[VAL_21]], %[[VAL_22]] : i64
-! CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_4:.*]] : !fir.ref<index>
-! CHECK: %[[VAL_25:.*]] = fir.load %[[VAL_5:.*]] : !fir.ref<index>
-! CHECK: %[[VAL_26:.*]] = fir.load %[[VAL_3]] : !fir.ref<!fir.heap<!fir.array<?x!fir.logical<4>>>>
-! CHECK: %[[VAL_27:.*]] = fir.shape_shift %[[VAL_24]], %[[VAL_25]] : (index, index) -> !fir.shapeshift<1>
-! CHECK: %[[VAL_28:.*]] = fir.embox %[[VAL_26]](%[[VAL_27]]) : (!fir.heap<!fir.array<?x!fir.logical<4>>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<?x!fir.logical<4>>>
-! CHECK: %[[VAL_29:.*]] = fir.absent !fir.box<!fir.array<?x!fir.logical<4>>>
-! CHECK: %[[VAL_30:.*]] = arith.select %[[VAL_23]], %[[VAL_28]], %[[VAL_29]] : !fir.box<!fir.array<?x!fir.logical<4>>>
-! CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_30]] : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.box<none>
-! CHECK: fir.call @_FortranAIAny4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_37]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: %[[LOAD_MASK:.*]] = fir.load %[[MASK]]#0
+! CHECK: %[[MASK_ADDR:.*]] = fir.box_addr %[[LOAD_MASK]]
+! CHECK: %[[CMPI:.*]] = arith.cmpi ne, %{{.*}}, %{{.*}} : i64
+! CHECK: %[[SEL_MASK:.*]] = arith.select %[[CMPI]], %{{.*}}, %{{.*}} : !fir.box<!fir.array<?x!fir.logical<4>>>
+! CHECK: %[[MASK_NONE:.*]] = fir.convert %[[SEL_MASK]] : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.box<none>
+! CHECK: fir.call @_FortranAIAny4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[MASK_NONE]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
end function
diff --git a/flang/test/Lower/Intrinsics/ibclr.f90 b/flang/test/Lower/Intrinsics/ibclr.f90
index a11e677bf847f..ac4ed4533a7bd 100644
--- a/flang/test/Lower/Intrinsics/ibclr.f90
+++ b/flang/test/Lower/Intrinsics/ibclr.f90
@@ -1,19 +1,21 @@
-! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s
-! CHECK-LABEL: ibclr_test
+! CHECK-LABEL: func.func @_QPibclr_test(
+! CHECK-SAME: %[[I_ARG:.*]]: !fir.ref<i32>{{.*}}, %[[J_ARG:.*]]: !fir.ref<i32>{{.*}}) -> i32 {
function ibclr_test(i, j)
- ! CHECK-DAG: %[[result:.*]] = fir.alloca i32 {bindc_name = "ibclr_test"
- ! CHECK-DAG: %[[i:.*]] = fir.load %arg0 : !fir.ref<i32>
- ! CHECK-DAG: %[[j:.*]] = fir.load %arg1 : !fir.ref<i32>
- ! CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : i32
- ! CHECK-DAG: %[[VAL_6:.*]] = arith.constant -1 : i32
- ! CHECK: %[[VAL_7:.*]] = arith.shli %[[VAL_5]], %[[j]] : i32
- ! CHECK: %[[VAL_8:.*]] = arith.xori %[[VAL_6]], %[[VAL_7]] : i32
- ! CHECK: %[[VAL_9:.*]] = arith.andi %[[i]], %[[VAL_8]] : i32
- ! CHECK: fir.store %[[VAL_9]] to %[[result]] : !fir.ref<i32>
- ! CHECK: %[[VAL_10:.*]] = fir.load %[[result]] : !fir.ref<i32>
- ! CHECK: return %[[VAL_10]] : i32
+! CHECK-DAG: %[[I:.*]]:2 = hlfir.declare %[[I_ARG]]
+! CHECK-DAG: %[[J:.*]]:2 = hlfir.declare %[[J_ARG]]
+! CHECK-DAG: %[[RESULT_VAR:.*]] = fir.alloca i32
+! CHECK-DAG: %[[RESULT:.*]]:2 = hlfir.declare %[[RESULT_VAR]]
+! CHECK-DAG: %[[I_VAL:.*]] = fir.load %[[I]]#0 : !fir.ref<i32>
+! CHECK-DAG: %[[J_VAL:.*]] = fir.load %[[J]]#0 : !fir.ref<i32>
+! CHECK-DAG: %[[C1:.*]] = arith.constant 1 : i32
+! CHECK-DAG: %[[CN1:.*]] = arith.constant -1 : i32
+! CHECK: %[[SHL:.*]] = arith.shli %[[C1]], %[[J_VAL]] : i32
+! CHECK: %[[XOR:.*]] = arith.xori %[[CN1]], %[[SHL]] : i32
+! CHECK: %[[AND:.*]] = arith.andi %[[I_VAL]], %[[XOR]] : i32
+! CHECK: hlfir.assign %[[AND]] to %[[RESULT]]#0 : i32, !fir.ref<i32>
+! CHECK: %[[RET:.*]] = fir.load %[[RESULT]]#0 : !fir.ref<i32>
+! CHECK: return %[[RET]] : i32
ibclr_test = ibclr(i, j)
end
-
diff --git a/flang/test/Lower/Intrinsics/ibits.f90 b/flang/test/Lower/Intrinsics/ibits.f90
index 008801710a66a..7563dd605d148 100644
--- a/flang/test/Lower/Intrinsics/ibits.f90
+++ b/flang/test/Lower/Intrinsics/ibits.f90
@@ -1,23 +1,27 @@
-! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s
-! CHECK-LABEL: ibits_test
+! CHECK-LABEL: func.func @_QPibits_test(
+! CHECK-SAME: %[[I_ARG:.*]]: !fir.ref<i32>{{.*}}, %[[J_ARG:.*]]: !fir.ref<i32>{{.*}}, %[[K_ARG:.*]]: !fir.ref<i32>{{.*}}) -> i32 {
function ibits_test(i, j, k)
- ! CHECK-DAG: %[[result:.*]] = fir.alloca i32 {bindc_name = "ibits_test"
- ! CHECK-DAG: %[[i:.*]] = fir.load %arg0 : !fir.ref<i32>
- ! CHECK-DAG: %[[j:.*]] = fir.load %arg1 : !fir.ref<i32>
- ! CHECK-DAG: %[[k:.*]] = fir.load %arg2 : !fir.ref<i32>
- ! CHECK-DAG: %[[VAL_7:.*]] = arith.constant 32 : i32
- ! CHECK-DAG: %[[VAL_8:.*]] = arith.subi %[[VAL_7]], %[[k]] : i32
- ! CHECK-DAG: %[[VAL_9:.*]] = arith.constant 0 : i32
- ! CHECK-DAG: %[[VAL_10:.*]] = arith.constant -1 : i32
- ! CHECK: %[[VAL_11:.*]] = arith.shrui %[[VAL_10]], %[[VAL_8]] : i32
- ! CHECK: %[[VAL_12:.*]] = arith.shrsi %[[i]], %[[j]] : i32
- ! CHECK: %[[VAL_13:.*]] = arith.andi %[[VAL_12]], %[[VAL_11]] : i32
- ! CHECK: %[[VAL_14:.*]] = arith.cmpi eq, %[[k]], %[[VAL_9]] : i32
- ! CHECK: %[[VAL_15:.*]] = arith.select %[[VAL_14]], %[[VAL_9]], %[[VAL_13]] : i32
- ! CHECK: fir.store %[[VAL_15]] to %[[result]] : !fir.ref<i32>
- ! CHECK: %[[VAL_16:.*]] = fir.load %[[result]] : !fir.ref<i32>
- ! CHECK: return %[[VAL_16]] : i32
+! CHECK-DAG: %[[I:.*]]:2 = hlfir.declare %[[I_ARG]]
+! CHECK-DAG: %[[J:.*]]:2 = hlfir.declare %[[J_ARG]]
+! CHECK-DAG: %[[K:.*]]:2 = hlfir.declare %[[K_ARG]]
+! CHECK-DAG: %[[RESULT_VAR:.*]] = fir.alloca i32
+! CHECK-DAG: %[[RESULT:.*]]:2 = hlfir.declare %[[RESULT_VAR]]
+! CHECK-DAG: %[[I_VAL:.*]] = fir.load %[[I]]#0 : !fir.ref<i32>
+! CHECK-DAG: %[[J_VAL:.*]] = fir.load %[[J]]#0 : !fir.ref<i32>
+! CHECK-DAG: %[[K_VAL:.*]] = fir.load %[[K]]#0 : !fir.ref<i32>
+! CHECK-DAG: %[[C32:.*]] = arith.constant 32 : i32
+! CHECK-DAG: %[[SUB:.*]] = arith.subi %[[C32]], %[[K_VAL]] : i32
+! CHECK-DAG: %[[C0:.*]] = arith.constant 0 : i32
+! CHECK-DAG: %[[CN1:.*]] = arith.constant -1 : i32
+! CHECK: %[[SHRUI:.*]] = arith.shrui %[[CN1]], %[[SUB]] : i32
+! CHECK: %[[SHRSI:.*]] = arith.shrsi %[[I_VAL]], %[[J_VAL]] : i32
+! CHECK: %[[AND:.*]] = arith.andi %[[SHRSI]], %[[SHRUI]] : i32
+! CHECK: %[[CMPI:.*]] = arith.cmpi eq, %[[K_VAL]], %[[C0]] : i32
+! CHECK: %[[SELECT:.*]] = arith.select %[[CMPI]], %[[C0]], %[[AND]] : i32
+! CHECK: hlfir.assign %[[SELECT]] to %[[RESULT]]#0 : i32, !fir.ref<i32>
+! CHECK: %[[RET:.*]] = fir.load %[[RESULT]]#0 : !fir.ref<i32>
+! CHECK: return %[[RET]] : i32
ibits_test = ibits(i, j, k)
end
diff --git a/flang/test/Lower/Intrinsics/ibset.f90 b/flang/test/Lower/Intrinsics/ibset.f90
index 254adf75216a1..24c0ce34a2cfa 100644
--- a/flang/test/Lower/Intrinsics/ibset.f90
+++ b/flang/test/Lower/Intrinsics/ibset.f90
@@ -1,17 +1,19 @@
-! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s
-! CHECK-LABEL: ibset_test
+! CHECK-LABEL: func.func @_QPibset_test(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<i32> {{.*}}, %[[ARG1:.*]]: !fir.ref<i32> {{.*}})
function ibset_test(i, j)
- ! CHECK-DAG: %[[result:.*]] = fir.alloca i32 {bindc_name = "ibset_test"
- ! CHECK-DAG: %[[i:.*]] = fir.load %arg0 : !fir.ref<i32>
- ! CHECK-DAG: %[[j:.*]] = fir.load %arg1 : !fir.ref<i32>
- ! CHECK-DAG: %[[VAL_5:.*]] = arith.constant 1 : i32
- ! CHECK: %[[VAL_6:.*]] = arith.shli %[[VAL_5]], %[[j]] : i32
- ! CHECK: %[[VAL_7:.*]] = arith.ori %[[i]], %[[VAL_6]] : i32
- ! CHECK: fir.store %[[VAL_7]] to %[[result]] : !fir.ref<i32>
- ! CHECK: %[[VAL_8:.*]] = fir.load %[[result]] : !fir.ref<i32>
- ! CHECK: return %[[VAL_8]] : i32
+ ! CHECK-DAG: %[[I:.*]]:2 = hlfir.declare %[[ARG0]] dummy_scope %{{.*}} arg 1 {uniq_name = "_QFibset_testEi"}
+ ! CHECK-DAG: %[[J:.*]]:2 = hlfir.declare %[[ARG1]] dummy_scope %{{.*}} arg 2 {uniq_name = "_QFibset_testEj"}
+ ! CHECK-DAG: %[[RESULT_ALLOC:.*]] = fir.alloca i32 {bindc_name = "ibset_test", uniq_name = "_QFibset_testEibset_test"}
+ ! CHECK-DAG: %[[RESULT:.*]]:2 = hlfir.declare %[[RESULT_ALLOC]] {uniq_name = "_QFibset_testEibset_test"}
+ ! CHECK-DAG: %[[I_VAL:.*]] = fir.load %[[I]]#0 : !fir.ref<i32>
+ ! CHECK-DAG: %[[J_VAL:.*]] = fir.load %[[J]]#0 : !fir.ref<i32>
+ ! CHECK-DAG: %[[C1:.*]] = arith.constant 1 : i32
+ ! CHECK: %[[VAL_6:.*]] = arith.shli %[[C1]], %[[J_VAL]] : i32
+ ! CHECK: %[[VAL_7:.*]] = arith.ori %[[I_VAL]], %[[VAL_6]] : i32
+ ! CHECK: hlfir.assign %[[VAL_7]] to %[[RESULT]]#0 : i32, !fir.ref<i32>
+ ! CHECK: %[[RET_VAL:.*]] = fir.load %[[RESULT]]#0 : !fir.ref<i32>
+ ! CHECK: return %[[RET_VAL]] : i32
ibset_test = ibset(i, j)
end
-
diff --git a/flang/test/Lower/Intrinsics/ichar.f90 b/flang/test/Lower/Intrinsics/ichar.f90
index eb7e03873e6b7..e633a8c2b8d0b 100644
--- a/flang/test/Lower/Intrinsics/ichar.f90
+++ b/flang/test/Lower/Intrinsics/ichar.f90
@@ -1,43 +1,50 @@
-! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s
-! CHECK-LABEL: ichar_test
+! CHECK-LABEL: func.func @_QPichar_test(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.boxchar<1> {fir.bindc_name = "c"}) {
subroutine ichar_test(c)
character(1) :: c
character :: str(10)
- ! CHECK-DAG: %[[unbox:.*]]:2 = fir.unboxchar
- ! CHECK-DAG: %[[BOX:.*]] = fir.convert %[[unbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1>>
- ! CHECK-DAG: %[[J:.*]] = fir.alloca i32 {{{.*}}uniq_name = "{{.*}}Ej"}
- ! CHECK-DAG: %[[STR:.*]] = fir.alloca !fir.array{{.*}} {{{.*}}uniq_name = "{{.*}}Estr"}
- ! CHECK: %[[PTR:.*]] = fir.load %[[BOX]] : !fir.ref<!fir.char<1>>
- ! CHECK: %[[CHAR:.*]] = fir.extract_value %[[PTR]], [0 : index] :
+ ! CHECK-DAG: %[[UNBOX:.*]]:2 = fir.unboxchar %[[ARG0]]
+ ! CHECK-DAG: %[[CONV:.*]] = fir.convert %[[UNBOX]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1>>
+ ! CHECK-DAG: %[[C_DECL:.*]]:2 = hlfir.declare %[[CONV]] typeparams {{.*}} dummy_scope %{{.*}} arg 1 {uniq_name = "_QFichar_testEc"}
+ ! CHECK-DAG: %[[J_ALLOC:.*]] = fir.alloca i32 {bindc_name = "j", uniq_name = "_QFichar_testEj"}
+ ! CHECK-DAG: %[[J_DECL:.*]]:2 = hlfir.declare %[[J_ALLOC]] {uniq_name = "_QFichar_testEj"}
+ ! CHECK-DAG: %[[STR_ALLOC:.*]] = fir.alloca !fir.array<10x!fir.char<1>> {bindc_name = "str", uniq_name = "_QFichar_testEstr"}
+ ! CHECK-DAG: %[[STR_DECL:.*]]:2 = hlfir.declare %[[STR_ALLOC]]({{.*}}) typeparams {{.*}} {uniq_name = "_QFichar_testEstr"}
+
+ ! CHECK: %[[C_VAL:.*]] = fir.load %[[C_DECL]]#0 : !fir.ref<!fir.char<1>>
+ ! CHECK: %[[CHAR:.*]] = fir.extract_value %[[C_VAL]], [0 : index] : (!fir.char<1>) -> i8
! CHECK: %[[ARG:.*]] = arith.extui %[[CHAR]] : i8 to i32
- ! CHECK: fir.call @{{.*}}OutputInteger32{{.*}}%[[ARG]]
- ! CHECK: fir.call @{{.*}}EndIoStatement
+ ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[ARG]])
+ ! CHECK: fir.call @_FortranAioEndIoStatement
print *, ichar(c)
- ! CHECK-DAG: %{{.*}} = fir.load %[[J]] : !fir.ref<i32>
- ! CHECK: %[[PTR1:.*]] = fir.coordinate_of %[[STR]], %
- ! CHECK: %[[PTR2:.*]] = fir.load %[[PTR1]] : !fir.ref<!fir.char<1>>
- ! CHECK: %[[CHAR:.*]] = fir.extract_value %[[PTR2]], [0 : index] :
- ! CHECK: %[[ARG:.*]] = arith.extui %[[CHAR]] : i8 to i32
- ! CHECK: fir.call @{{.*}}OutputInteger32{{.*}}%[[ARG]]
- ! CHECK: fir.call @{{.*}}EndIoStatement
+ ! CHECK: %[[J_VAL:.*]] = fir.load %[[J_DECL]]#0 : !fir.ref<i32>
+ ! CHECK: %[[J_IDX:.*]] = fir.convert %[[J_VAL]] : (i32) -> i64
+ ! CHECK: %[[STR_EL:.*]] = hlfir.designate %[[STR_DECL]]#0 (%[[J_IDX]]) typeparams {{.*}} : (!fir.ref<!fir.array<10x!fir.char<1>>>, i64, index) -> !fir.ref<!fir.char<1>>
+ ! CHECK: %[[STR_VAL:.*]] = fir.load %[[STR_EL]] : !fir.ref<!fir.char<1>>
+ ! CHECK: %[[CHAR2:.*]] = fir.extract_value %[[STR_VAL]], [0 : index] : (!fir.char<1>) -> i8
+ ! CHECK: %[[ARG2:.*]] = arith.extui %[[CHAR2]] : i8 to i32
+ ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[ARG2]])
+ ! CHECK: fir.call @_FortranAioEndIoStatement
print *, ichar(str(J))
! "Magic" 88 below is the ASCII code for `X` and the value returned by IACHAR (’X’)
! CHECK: %[[c88:.*]] = arith.constant 88 : i32
- ! CHECK-NEXT: fir.call @{{.*}}OutputInteger32({{.*}}, %[[c88]])
- ! CHECK-NEXT: fir.call @{{.*}}EndIoStatement
+ ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[c88]])
+ ! CHECK: fir.call @_FortranAioEndIoStatement
print *, iachar('X')
end subroutine
! Check that the 'arith.extui' op is not generated if the types match.
- ! CHECK-LABEL: no_extui
+! CHECK-LABEL: func.func @_QPno_extui(
subroutine no_extui(ch)
integer, parameter :: kind = selected_char_kind('ISO_10646')
character(*, kind), intent(in) :: ch(:)
integer :: i, j
! CHECK-NOT: arith.extui
+ ! CHECK: %[[CHAR4:.*]] = fir.extract_value {{.*}}, [0 : index] : (!fir.char<4>) -> i32
+ ! CHECK: hlfir.assign %[[CHAR4]] to {{.*}} : i32, !fir.ref<i32>
j = ichar(ch(i)(i:i))
end subroutine
diff --git a/flang/test/Lower/Intrinsics/ieee_class.f90 b/flang/test/Lower/Intrinsics/ieee_class.f90
index acef959656539..ab177e40e0d0d 100644
--- a/flang/test/Lower/Intrinsics/ieee_class.f90
+++ b/flang/test/Lower/Intrinsics/ieee_class.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s
#ifndef RK
#define RK 8
@@ -8,7 +8,7 @@ module m
integer, parameter :: k = RK
character(20) :: tag(11)
contains
- ! CHECK-LABEL: func @_QMmPinit
+ ! CHECK-LABEL: func.func @_QMmPinit()
subroutine init
tag( 1) = 'signaling_nan'; tag( 2) = 'quiet_nan'
tag( 3) = 'negative_inf'; tag( 4) = 'negative_normal'
@@ -17,7 +17,8 @@ subroutine init
tag( 9) = 'positive_normal'; tag(10) = 'positive_inf'
tag(11) = 'other_value'
end
- ! CHECK-LABEL: func @_QMmPout
+ ! CHECK-LABEL: func.func @_QMmPout(
+ ! CHECK-SAME: %[[X_ARG:.*]]: !fir.ref<f64> {{.*}}, %[[V_ARG:.*]]: !fir.ref<i32> {{.*}})
subroutine out(x,v)
use ieee_arithmetic
real(k) :: x
@@ -36,53 +37,53 @@ subroutine out(x,v)
end
end module m
-! CHECK-LABEL: func @_QPclassify
+! CHECK-LABEL: func.func @_QPclassify(
+! CHECK-SAME: %[[X_ARG:.*]]: !fir.ref<f64> {{.*}})
subroutine classify(x)
use m; use ieee_arithmetic
real(k) :: x
- ! CHECK-DAG: %[[V_0:[0-9]+]] = fir.alloca i32 {adapt.valuebyref}
- ! CHECK-DAG: %[[V_1:[0-9]+]] = fir.alloca !fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>
- ! CHECK-DAG: %[[V_2:[0-9]+]] = fir.alloca !fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}> {bindc_name = "r", uniq_name = "_QFclassifyEr"}
+ ! CHECK-DAG: %[[R_ALLOC:.*]] = fir.alloca !fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}> {bindc_name = "r", uniq_name = "_QFclassifyEr"}
+ ! CHECK-DAG: %[[R_DECL:.*]]:2 = hlfir.declare %[[R_ALLOC]] {uniq_name = "_QFclassifyEr"}
+ ! CHECK-DAG: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_ARG]] {{.*}} {uniq_name = "_QFclassifyEx"}
type(ieee_class_type) :: r
- ! CHECK: %[[V_8:[0-9]+]] = fir.load %arg0 : !fir.ref<f64>
- ! CHECK: %[[V_9:[0-9]+]] = arith.bitcast %[[V_8]] : f64 to i64
- ! CHECK: %[[V_10:[0-9]+]] = arith.shrui %[[V_9]], %c59{{.*}} : i64
- ! CHECK: %[[V_11:[0-9]+]] = arith.andi %[[V_10]], %c16{{.*}} : i64
- ! CHECK: %[[V_12:[0-9]+]] = arith.andi %[[V_9]], %c9218868437227405312{{.*}} : i64
- ! CHECK: %[[V_13:[0-9]+]] = arith.cmpi ne, %[[V_12]], %c0{{.*}} : i64
- ! CHECK: %[[V_14:[0-9]+]] = arith.select %[[V_13]], %c8{{.*}}, %c0{{.*}} : i64
- ! CHECK: %[[V_15:[0-9]+]] = arith.ori %[[V_11]], %[[V_14]] : i64
- ! CHECK: %[[V_16:[0-9]+]] = arith.cmpi eq, %[[V_12]], %c9218868437227405312{{.*}} : i64
- ! CHECK: %[[V_17:[0-9]+]] = arith.select %[[V_16]], %c4{{.*}}, %c0{{.*}} : i64
- ! CHECK: %[[V_18:[0-9]+]] = arith.ori %[[V_15]], %[[V_17]] : i64
- ! CHECK: %[[V_19:[0-9]+]] = arith.andi %[[V_9]], %c2251799813685247{{.*}} : i64
- ! CHECK: %[[V_20:[0-9]+]] = arith.cmpi ne, %[[V_19]], %c0{{.*}} : i64
- ! CHECK: %[[V_21:[0-9]+]] = arith.select %[[V_20]], %c2{{.*}}, %c0{{.*}} : i64
- ! CHECK: %[[V_22:[0-9]+]] = arith.ori %[[V_18]], %[[V_21]] : i64
- ! CHECK: %[[V_23:[0-9]+]] = arith.shrui %[[V_9]], %c51{{.*}} : i64
- ! CHECK: %[[V_24:[0-9]+]] = arith.andi %[[V_23]], %c1{{.*}} : i64
- ! CHECK: %[[V_25:[0-9]+]] = arith.ori %[[V_22]], %[[V_24]] : i64
- ! CHECK: %[[V_26:[0-9]+]] = fir.address_of(@_FortranAIeeeClassTable) : !fir.ref<!fir.array<32xi8>>
- ! CHECK: %[[V_27:[0-9]+]] = fir.coordinate_of %[[V_26]], %[[V_25]] : (!fir.ref<!fir.array<32xi8>>, i64) -> !fir.ref<!fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>>
- ! CHECK: %[[V_29:[0-9]+]] = fir.coordinate_of %[[V_27]], _QMieee_arithmeticTieee_class_type.which : (!fir.ref<!fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>>) -> !fir.ref<i8>
- ! CHECK: %[[V_31:[0-9]+]] = fir.coordinate_of %[[V_2]], _QMieee_arithmeticTieee_class_type.which : (!fir.ref<!fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>>) -> !fir.ref<i8>
- ! CHECK: %[[V_32:[0-9]+]] = fir.load %[[V_29]] : !fir.ref<i8>
- ! CHECK: fir.store %[[V_32]] to %[[V_31]] : !fir.ref<i8>
+ ! CHECK: %[[X_VAL:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<f64>
+ ! CHECK: %[[BITCAST:.*]] = arith.bitcast %[[X_VAL]] : f64 to i64
+ ! CHECK: %{{.*}} = arith.shrui %[[BITCAST]], %c59{{.*}} : i64
+ ! CHECK: %[[V_11:.*]] = arith.andi %{{.*}}, %c16{{.*}} : i64
+ ! CHECK: %[[EXP:.*]] = arith.andi %[[BITCAST]], %c{{-?[0-9]+}}{{.*}} : i64
+ ! CHECK: %[[EXP_NZ:.*]] = arith.cmpi ne, %[[EXP]], %c0{{.*}} : i64
+ ! CHECK: %[[V_14:.*]] = arith.select %[[EXP_NZ]], %c8{{.*}}, %c0{{.*}} : i64
+ ! CHECK: %[[V_15:.*]] = arith.ori %[[V_11]], %[[V_14]] : i64
+ ! CHECK: %[[EXP_INF:.*]] = arith.cmpi eq, %[[EXP]], %c{{-?[0-9]+}}{{.*}} : i64
+ ! CHECK: %[[V_17:.*]] = arith.select %[[EXP_INF]], %c4{{.*}}, %c0{{.*}} : i64
+ ! CHECK: %[[V_18:.*]] = arith.ori %[[V_15]], %[[V_17]] : i64
+ ! CHECK: %[[FRAC:.*]] = arith.andi %[[BITCAST]], %c{{[0-9]+}}{{.*}} : i64
+ ! CHECK: %[[FRAC_NZ:.*]] = arith.cmpi ne, %[[FRAC]], %c0{{.*}} : i64
+ ! CHECK: %[[V_21:.*]] = arith.select %[[FRAC_NZ]], %c2{{.*}}, %c0{{.*}} : i64
+ ! CHECK: %[[V_22:.*]] = arith.ori %[[V_18]], %[[V_21]] : i64
+ ! CHECK: %[[V_23:.*]] = arith.shrui %[[BITCAST]], %c51{{.*}} : i64
+ ! CHECK: %[[V_24:.*]] = arith.andi %[[V_23]], %c1{{.*}} : i64
+ ! CHECK: %[[V_25:.*]] = arith.ori %[[V_22]], %[[V_24]] : i64
+ ! CHECK: %[[TABLE:.*]] = fir.address_of(@_FortranAIeeeClassTable) : !fir.ref<!fir.array<32xi8>>
+ ! CHECK: %[[COORD:.*]] = fir.coordinate_of %[[TABLE]], %[[V_25]] : (!fir.ref<!fir.array<32xi8>>, i64) -> !fir.ref<!fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>>
+ ! CHECK: %[[TMP:.*]]:2 = hlfir.declare %[[COORD]] {uniq_name = ".tmp.intrinsic_result"}
+ ! CHECK: %[[EXPR:.*]] = hlfir.as_expr %[[TMP]]#0 move {{.*}} : (!fir.ref<!fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>>, i1) -> !hlfir.expr<!fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>>
+ ! CHECK: hlfir.assign %[[EXPR]] to %[[R_DECL]]#0 : !hlfir.expr<!fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>>, !fir.ref<!fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>>
+ ! CHECK: hlfir.destroy %[[EXPR]] : !hlfir.expr<!fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>>
r = ieee_class(x)
! if (r==ieee_signaling_nan) call out(x, 1)
! if (r==ieee_quiet_nan) call out(x, 2)
- ! CHECK: %[[V_39:[0-9]+]] = fir.coordinate_of %[[V_1]], _QMieee_arithmeticTieee_class_type.which : (!fir.ref<!fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>>) -> !fir.ref<i8>
- ! CHECK: fir.store %c3{{.*}} to %[[V_39]] : !fir.ref<i8>
- ! CHECK: %[[V_41:[0-9]+]] = fir.coordinate_of %[[V_2]], _QMieee_arithmeticTieee_class_type.which : (!fir.ref<!fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>>) -> !fir.ref<i8>
- ! CHECK: %[[V_43:[0-9]+]] = fir.coordinate_of %[[V_1]], _QMieee_arithmeticTieee_class_type.which : (!fir.ref<!fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>>) -> !fir.ref<i8>
- ! CHECK: %[[V_44:[0-9]+]] = fir.load %[[V_41]] : !fir.ref<i8>
- ! CHECK: %[[V_45:[0-9]+]] = fir.load %[[V_43]] : !fir.ref<i8>
- ! CHECK: %[[V_46:[0-9]+]] = arith.cmpi eq, %[[V_44]], %[[V_45]] : i8
- ! CHECK: fir.if %[[V_46]] {
- ! CHECK: fir.store %c3{{.*}} to %[[V_0]] : !fir.ref<i32>
- ! CHECK: fir.call @_QMmPout(%arg0, %[[V_0]]) {{.*}} : (!fir.ref<f64>, !fir.ref<i32>) -> ()
+ ! CHECK: %[[R_WHICH:.*]] = fir.coordinate_of %[[R_DECL]]#0, {{.*}}which : (!fir.ref<!fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>>) -> !fir.ref<i8>
+ ! CHECK: %{{.*}} = fir.coordinate_of %{{.*}}, {{.*}}which : (!fir.ref<!fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>>) -> !fir.ref<i8>
+ ! CHECK: %[[R_VAL:.*]] = fir.load %[[R_WHICH]] : !fir.ref<i8>
+ ! CHECK: %[[CLASS_VAL:.*]] = fir.load %{{.*}} : !fir.ref<i8>
+ ! CHECK: %[[EQ:.*]] = arith.cmpi eq, %[[R_VAL]], %[[CLASS_VAL]] : i8
+ ! CHECK: fir.if %[[EQ]] {
+ ! CHECK: %[[V_ASSOC:.*]]:3 = hlfir.associate %c3{{.*}} {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
+ ! CHECK: fir.call @_QMmPout(%[[X_DECL]]#0, %[[V_ASSOC]]#0) {{.*}} : (!fir.ref<f64>, !fir.ref<i32>) -> ()
+ ! CHECK: hlfir.end_associate %[[V_ASSOC]]#1, %[[V_ASSOC]]#2 : !fir.ref<i32>, i1
! CHECK: }
if (r==ieee_negative_inf) call out(x, 3)
! if (r==ieee_negative_normal) call out(x, 4)
@@ -95,7 +96,7 @@ subroutine classify(x)
! if (r==ieee_other_value) call out(x,11)
end
-! CHECK-LABEL: func @_QQmain
+! CHECK-LABEL: func.func @_QQmain()
program p
use m; use ieee_arithmetic
real(k) :: x(10)
@@ -104,19 +105,14 @@ program p
! x(1) = ieee_value(x(1), ieee_signaling_nan)
! x(2) = ieee_value(x(1), ieee_quiet_nan)
- ! CHECK: %[[V_0:[0-9]+]] = fir.alloca !fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>
- ! CHECK: %[[V_2:[0-9]+]] = fir.address_of(@_QFEx) : !fir.ref<!fir.array<10xf64>>
- ! CHECK: %[[V_9:[0-9]+]] = fir.coordinate_of %[[V_0]], _QMieee_arithmeticTieee_class_type.which : (!fir.ref<!fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>>) -> !fir.ref<i8>
- ! CHECK: fir.store %c3{{.*}} to %[[V_9]] : !fir.ref<i8>
- ! CHECK: %[[V_11:[0-9]+]] = fir.coordinate_of %[[V_0]], _QMieee_arithmeticTieee_class_type.which : (!fir.ref<!fir.type<_QMieee_arithmeticTieee_class_type{_QMieee_arithmeticTieee_class_type.which:i8}>>) -> !fir.ref<i8>
- ! CHECK: %[[V_12:[0-9]+]] = fir.load %[[V_11]] : !fir.ref<i8>
- ! CHECK: %[[V_13:[0-9]+]] = fir.address_of(@_FortranAIeeeValueTable_8) : !fir.ref<!fir.array<12xi64>>
- ! CHECK: %[[V_14:[0-9]+]] = fir.coordinate_of %[[V_13]], %[[V_12]] : (!fir.ref<!fir.array<12xi64>>, i8) -> !fir.ref<i64>
- ! CHECK: %[[V_15:[0-9]+]] = fir.load %[[V_14]] : !fir.ref<i64>
- ! CHECK: %[[V_16:[0-9]+]] = arith.bitcast %[[V_15]] : i64 to f64
- ! CHECK: %[[V_17:[0-9]+]] = arith.subi %c3{{.*}}, %c1{{.*}} : i64
- ! CHECK: %[[V_18:[0-9]+]] = fir.coordinate_of %[[V_2]], %[[V_17]] : (!fir.ref<!fir.array<10xf64>>, i64) -> !fir.ref<f64>
- ! CHECK: fir.store %[[V_16]] to %[[V_18]] : !fir.ref<f64>
+ ! CHECK: %[[X_G_ALLOC:.*]] = fir.address_of(@_QFEx) : !fir.ref<!fir.array<10xf64>>
+ ! CHECK: %[[X_G_DECL:.*]]:2 = hlfir.declare %[[X_G_ALLOC]]({{.*}}) {uniq_name = "_QFEx"}
+ ! CHECK: %[[VAL_TABLE:.*]] = fir.address_of(@_FortranAIeeeValueTable_8) : !fir.ref<!fir.array<12xi64>>
+ ! CHECK: %[[VAL_COORD:.*]] = fir.coordinate_of %[[VAL_TABLE]], %{{.*}} : (!fir.ref<!fir.array<12xi64>>, i8) -> !fir.ref<i64>
+ ! CHECK: %[[VAL_I64:.*]] = fir.load %[[VAL_COORD]] : !fir.ref<i64>
+ ! CHECK: %[[VAL_F64:.*]] = arith.bitcast %[[VAL_I64]] : i64 to f64
+ ! CHECK: %[[X3_ADDR:.*]] = hlfir.designate %[[X_G_DECL]]#0 (%c3{{.*}}) : (!fir.ref<!fir.array<10xf64>>, index) -> !fir.ref<f64>
+ ! CHECK: hlfir.assign %[[VAL_F64]] to %[[X3_ADDR]] : f64, !fir.ref<f64>
x(3) = ieee_value(x(1), ieee_negative_inf)
! x(4) = ieee_value(x(1), ieee_negative_normal)
! x(5) = ieee_value(x(1), ieee_negative_subnormal)
diff --git a/flang/test/Lower/Intrinsics/ieee_copy_sign.f90 b/flang/test/Lower/Intrinsics/ieee_copy_sign.f90
index 13e80bc5060b9..d354bc266adad 100644
--- a/flang/test/Lower/Intrinsics/ieee_copy_sign.f90
+++ b/flang/test/Lower/Intrinsics/ieee_copy_sign.f90
@@ -1,32 +1,35 @@
-! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s
-! CHECK-LABEL: func @_QQmain
+! CHECK-LABEL: func.func @_QQmain()
use ieee_arithmetic
real(4) :: x = -2.0, y = huge(y)
real(8) :: z = 2.0
-! CHECK-DAG: %[[V_0:[0-9]+]] = fir.address_of(@_QFEx) : !fir.ref<f32>
-! CHECK-DAG: %[[V_1:[0-9]+]] = fir.address_of(@_QFEy) : !fir.ref<f32>
-! CHECK-DAG: %[[V_2:[0-9]+]] = fir.address_of(@_QFEz) : !fir.ref<f64>
+! CHECK-DAG: %[[X_ADDR:.*]] = fir.address_of(@_QFEx) : !fir.ref<f32>
+! CHECK-DAG: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_ADDR]] {uniq_name = "_QFEx"}
+! CHECK-DAG: %[[Y_ADDR:.*]] = fir.address_of(@_QFEy) : !fir.ref<f32>
+! CHECK-DAG: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_ADDR]] {uniq_name = "_QFEy"}
+! CHECK-DAG: %[[Z_ADDR:.*]] = fir.address_of(@_QFEz) : !fir.ref<f64>
+! CHECK-DAG: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z_ADDR]] {uniq_name = "_QFEz"}
-! CHECK-DAG: %[[V_6:[0-9]+]] = fir.load %[[V_0]] : !fir.ref<f32>
-! CHECK-DAG: %[[V_7:[0-9]+]] = fir.load %[[V_1]] : !fir.ref<f32>
-! CHECK: %[[V_8:[0-9]+]] = math.copysign %[[V_6]], %[[V_7]] fastmath<contract> : f32
-! CHECK: %[[V_9:[0-9]+]] = fir.call @_FortranAioOutputReal32(%{{.*}}, %[[V_8]]) fastmath<contract> : (!fir.ref<i8>, f32) -> i1
+! CHECK-DAG: %[[X_VAL:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<f32>
+! CHECK-DAG: %[[Y_VAL:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<f32>
+! CHECK: %[[CS1:.*]] = math.copysign %[[X_VAL]], %[[Y_VAL]] fastmath<contract> : f32
+! CHECK: fir.call @_FortranAioOutputReal32({{.*}}, %[[CS1]])
-! CHECK-DAG: %[[V_10:[0-9]+]] = fir.load %[[V_2]] : !fir.ref<f64>
-! CHECK-DAG: %[[V_11:[0-9]+]] = fir.load %[[V_1]] : !fir.ref<f32>
-! CHECK: %[[V_12:[0-9]+]] = arith.bitcast %[[V_10]] : f64 to i64
-! CHECK: %[[V_13:[0-9]+]] = arith.bitcast %[[V_11]] : f32 to i32
-! CHECK: %[[V_14:[0-9]+]] = arith.shrui %[[V_13]], %c31{{.*}} : i32
-! CHECK: %[[V_15:[0-9]+]] = arith.shli %[[V_12]], %c1{{.*}} : i64
-! CHECK: %[[V_16:[0-9]+]] = arith.shrui %[[V_15]], %c1{{.*}} : i64
-! CHECK-DAG: %[[V_17:[0-9]+]] = arith.shli %c1{{.*}}, %c63{{.*}} : i64
-! CHECK-DAG: %[[V_18:[0-9]+]] = arith.cmpi eq, %[[V_14]], %c0{{.*}} : i32
-! CHECK: %[[V_19:[0-9]+]] = arith.select %[[V_18]], %c0{{.*}}, %[[V_17]] : i64
-! CHECK: %[[V_20:[0-9]+]] = arith.ori %[[V_16]], %[[V_19]] : i64
-! CHECK: %[[V_21:[0-9]+]] = arith.bitcast %[[V_20]] : i64 to f64
-! CHECK: %[[V_22:[0-9]+]] = fir.call @_FortranAioOutputReal64(%{{.*}}, %[[V_21]]) fastmath<contract> : (!fir.ref<i8>, f64) -> i1
+! CHECK-DAG: %[[Z_VAL:.*]] = fir.load %[[Z_DECL]]#0 : !fir.ref<f64>
+! CHECK-DAG: %[[Y_VAL2:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<f32>
+! CHECK: %[[BIT_Z:.*]] = arith.bitcast %[[Z_VAL]] : f64 to i64
+! CHECK: %[[BIT_Y:.*]] = arith.bitcast %[[Y_VAL2]] : f32 to i32
+! CHECK: %[[S_Y:.*]] = arith.shrui %[[BIT_Y]], %c31{{.*}} : i32
+! CHECK: %[[SL_Z:.*]] = arith.shli %[[BIT_Z]], %c1{{.*}} : i64
+! CHECK: %[[SR_Z:.*]] = arith.shrui %[[SL_Z]], %c1{{.*}} : i64
+! CHECK-DAG: %[[S_BIT:.*]] = arith.shli %c1{{.*}}, %c63{{.*}} : i64
+! CHECK-DAG: %[[IS_P:.*]] = arith.cmpi eq, %[[S_Y]], %c0{{.*}} : i32
+! CHECK: %[[S_VAL:.*]] = arith.select %[[IS_P]], %c0{{.*}}, %[[S_BIT]] : i64
+! CHECK: %[[RES_BIT:.*]] = arith.ori %[[SR_Z]], %[[S_VAL]] : i64
+! CHECK: %[[RES:.*]] = arith.bitcast %[[RES_BIT]] : i64 to f64
+! CHECK: fir.call @_FortranAioOutputReal64({{.*}}, %[[RES]])
print*, ieee_copy_sign(x,y), ieee_copy_sign(z,y)
end
diff --git a/flang/test/Lower/Intrinsics/ieee_is_finite.f90 b/flang/test/Lower/Intrinsics/ieee_is_finite.f90
index ee7d8dab7b992..4f7bd19b0fca8 100644
--- a/flang/test/Lower/Intrinsics/ieee_is_finite.f90
+++ b/flang/test/Lower/Intrinsics/ieee_is_finite.f90
@@ -1,53 +1,57 @@
-! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s
-! CHECK-LABEL: c.func @_QPis_finite_test
+! CHECK-LABEL: func.func @_QPis_finite_test(
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {{.*}}, %[[ARG1:.*]]: !fir.ref<f64> {{.*}})
subroutine is_finite_test(x, y)
use ieee_arithmetic, only: ieee_is_finite
real(4) x
real(8) y
- ! CHECK: %[[V_3:[0-9]+]] = fir.load %arg0 : !fir.ref<f32>
- ! CHECK: %[[V_4:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_3]]) <{bit = 504 : i32}> : (f32) -> i1
- ! CHECK: %[[V_5:[0-9]+]] = fir.convert %[[V_4]] : (i1) -> !fir.logical<4>
- ! CHECK: %[[V_6:[0-9]+]] = fir.convert %[[V_5]] : (!fir.logical<4>) -> i1
- ! CHECK: %[[V_7:[0-9]+]] = fir.call @_FortranAioOutputLogical(%{{.*}}, %[[V_6]]) {{.*}} : (!fir.ref<i8>, i1) -> i1
+ ! CHECK-DAG: %[[X_DECL:.*]]:2 = hlfir.declare %[[ARG0]] {{.*}} {uniq_name = "_QFis_finite_testEx"}
+ ! CHECK-DAG: %[[Y_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {{.*}} {uniq_name = "_QFis_finite_testEy"}
+
+ ! CHECK: %[[X_VAL:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<f32>
+ ! CHECK: %[[IS_FINITE_X:.*]] = "llvm.intr.is.fpclass"(%[[X_VAL]]) <{bit = 504 : i32}> : (f32) -> i1
+ ! CHECK: %[[L4_X:.*]] = fir.convert %[[IS_FINITE_X]] : (i1) -> !fir.logical<4>
+ ! CHECK: %[[I1_X:.*]] = fir.convert %[[L4_X]] : (!fir.logical<4>) -> i1
+ ! CHECK: fir.call @_FortranAioOutputLogical({{.*}}, %[[I1_X]])
print*, ieee_is_finite(x)
- ! CHECK: %[[V_12:[0-9]+]] = fir.load %arg0 : !fir.ref<f32>
- ! CHECK: %[[V_13:[0-9]+]] = fir.load %arg0 : !fir.ref<f32>
- ! CHECK: %[[V_14:[0-9]+]] = arith.addf %[[V_12]], %[[V_13]] {{.*}} : f32
- ! CHECK: %[[V_15:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_14]]) <{bit = 504 : i32}> : (f32) -> i1
- ! CHECK: %[[V_16:[0-9]+]] = fir.convert %[[V_15]] : (i1) -> !fir.logical<4>
- ! CHECK: %[[V_17:[0-9]+]] = fir.convert %[[V_16]] : (!fir.logical<4>) -> i1
- ! CHECK: %[[V_18:[0-9]+]] = fir.call @_FortranAioOutputLogical(%{{.*}}, %[[V_17]]) {{.*}} : (!fir.ref<i8>, i1) -> i1
+ ! CHECK: %[[X_VAL1:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<f32>
+ ! CHECK: %[[X_VAL2:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<f32>
+ ! CHECK: %[[X_ADD:.*]] = arith.addf %[[X_VAL1]], %[[X_VAL2]] {{.*}} : f32
+ ! CHECK: %[[IS_FINITE_XADD:.*]] = "llvm.intr.is.fpclass"(%[[X_ADD]]) <{bit = 504 : i32}> : (f32) -> i1
+ ! CHECK: %[[L4_XADD:.*]] = fir.convert %[[IS_FINITE_XADD]] : (i1) -> !fir.logical<4>
+ ! CHECK: %[[I1_XADD:.*]] = fir.convert %[[L4_XADD]] : (!fir.logical<4>) -> i1
+ ! CHECK: fir.call @_FortranAioOutputLogical({{.*}}, %[[I1_XADD]])
print*, ieee_is_finite(x+x)
- ! CHECK: %[[V_23:[0-9]+]] = fir.load %arg1 : !fir.ref<f64>
- ! CHECK: %[[V_24:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_23]]) <{bit = 504 : i32}> : (f64) -> i1
- ! CHECK: %[[V_25:[0-9]+]] = fir.convert %[[V_24]] : (i1) -> !fir.logical<4>
- ! CHECK: %[[V_26:[0-9]+]] = fir.convert %[[V_25]] : (!fir.logical<4>) -> i1
- ! CHECK: %[[V_27:[0-9]+]] = fir.call @_FortranAioOutputLogical(%{{.*}}, %[[V_26]]) {{.*}} : (!fir.ref<i8>, i1) -> i1
+ ! CHECK: %[[Y_VAL:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<f64>
+ ! CHECK: %[[IS_FINITE_Y:.*]] = "llvm.intr.is.fpclass"(%[[Y_VAL]]) <{bit = 504 : i32}> : (f64) -> i1
+ ! CHECK: %[[L4_Y:.*]] = fir.convert %[[IS_FINITE_Y]] : (i1) -> !fir.logical<4>
+ ! CHECK: %[[I1_Y:.*]] = fir.convert %[[L4_Y]] : (!fir.logical<4>) -> i1
+ ! CHECK: fir.call @_FortranAioOutputLogical({{.*}}, %[[I1_Y]])
print*, ieee_is_finite(y)
- ! CHECK: %[[V_32:[0-9]+]] = fir.load %arg1 : !fir.ref<f64>
- ! CHECK: %[[V_33:[0-9]+]] = fir.load %arg1 : !fir.ref<f64>
- ! CHECK: %[[V_34:[0-9]+]] = arith.addf %[[V_32]], %[[V_33]] {{.*}} : f64
- ! CHECK: %[[V_35:[0-9]+]] = "llvm.intr.is.fpclass"(%[[V_34]]) <{bit = 504 : i32}> : (f64) -> i1
- ! CHECK: %[[V_36:[0-9]+]] = fir.convert %[[V_35]] : (i1) -> !fir.logical<4>
- ! CHECK: %[[V_37:[0-9]+]] = fir.convert %[[V_36]] : (!fir.logical<4>) -> i1
- ! CHECK: %[[V_38:[0-9]+]] = fir.call @_FortranAioOutputLogical(%{{.*}}, %[[V_37]]) {{.*}} : (!fir.ref<i8>, i1) -> i1
+ ! CHECK: %[[Y_VAL1:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<f64>
+ ! CHECK: %[[Y_VAL2:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<f64>
+ ! CHECK: %[[Y_ADD:.*]] = arith.addf %[[Y_VAL1]], %[[Y_VAL2]] {{.*}} : f64
+ ! CHECK: %[[IS_FINITE_YADD:.*]] = "llvm.intr.is.fpclass"(%[[Y_ADD]]) <{bit = 504 : i32}> : (f64) -> i1
+ ! CHECK: %[[L4_YADD:.*]] = fir.convert %[[IS_FINITE_YADD]] : (i1) -> !fir.logical<4>
+ ! CHECK: %[[I1_YADD:.*]] = fir.convert %[[L4_YADD]] : (!fir.logical<4>) -> i1
+ ! CHECK: fir.call @_FortranAioOutputLogical({{.*}}, %[[I1_YADD]])
print*, ieee_is_finite(y+y)
end subroutine is_finite_test
-! CHECK-LABEL: c.func @_QQmain
+! CHECK-LABEL: func.func @_QQmain()
real(4) x
real(8) y
- ! CHECK: %[[V_0:[0-9]+]] = fir.alloca f64 {adapt.valuebyref}
- ! CHECK: %[[V_1:[0-9]+]] = fir.alloca f32 {adapt.valuebyref}
- ! CHECK: %cst = arith.constant 3.40282347E+38 : f32
- ! CHECK: fir.store %cst to %[[V_1]] : !fir.ref<f32>
- ! CHECK: %cst_0 = arith.constant 1.7976931348623157E+308 : f64
- ! CHECK: fir.store %cst_0 to %[[V_0]] : !fir.ref<f64>
- ! CHECK: fir.call @_QPis_finite_test(%[[V_1]], %[[V_0]]) {{.*}} : (!fir.ref<f32>, !fir.ref<f64>) -> ()
+ ! CHECK: %[[X_HUGE:.*]] = arith.constant 3.40282347E+38 : f32
+ ! CHECK: %[[Y_HUGE:.*]] = arith.constant 1.7976931348623157E+308 : f64
+ ! CHECK: %[[X_HUGE_ASSOC:.*]]:3 = hlfir.associate %[[X_HUGE]] {adapt.valuebyref} : (f32) -> (!fir.ref<f32>, !fir.ref<f32>, i1)
+ ! CHECK: %[[Y_HUGE_ASSOC:.*]]:3 = hlfir.associate %[[Y_HUGE]] {adapt.valuebyref} : (f64) -> (!fir.ref<f64>, !fir.ref<f64>, i1)
+ ! CHECK: fir.call @_QPis_finite_test(%[[X_HUGE_ASSOC]]#0, %[[Y_HUGE_ASSOC]]#0)
+ ! CHECK: hlfir.end_associate %[[X_HUGE_ASSOC]]#1, %[[X_HUGE_ASSOC]]#2
+ ! CHECK: hlfir.end_associate %[[Y_HUGE_ASSOC]]#1, %[[Y_HUGE_ASSOC]]#2
call is_finite_test(huge(x), huge(y))
end
diff --git a/flang/test/Lower/Intrinsics/ieee_operator_eq.f90 b/flang/test/Lower/Intrinsics/ieee_operator_eq.f90
index 8f77460a010fd..09ccca2841f35 100644
--- a/flang/test/Lower/Intrinsics/ieee_operator_eq.f90
+++ b/flang/test/Lower/Intrinsics/ieee_operator_eq.f90
@@ -1,21 +1,23 @@
-! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
-! CHECK-LABEL: c.func @_QPs
+! CHECK-LABEL: func.func @_QPs
subroutine s(r1,r2)
use ieee_arithmetic, only: ieee_round_type, operator(==)
type(ieee_round_type) :: r1, r2
- ! CHECK: %[[V_4:[0-9]+]] = fir.coordinate_of %arg0, _QM__fortran_builtinsT__builtin_ieee_round_type.mode : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}>>) -> !fir.ref<i8>
- ! CHECK: %[[V_6:[0-9]+]] = fir.coordinate_of %arg1, _QM__fortran_builtinsT__builtin_ieee_round_type.mode : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}>>) -> !fir.ref<i8>
- ! CHECK: %[[V_7:[0-9]+]] = fir.load %[[V_4]] : !fir.ref<i8>
+ ! CHECK: %[[R1DECL:.*]]:2 = hlfir.declare %arg0 {{.*}} {uniq_name = "_QFsEr1"}
+ ! CHECK: %[[R2DECL:.*]]:2 = hlfir.declare %arg1 {{.*}} {uniq_name = "_QFsEr2"}
+ ! CHECK: %[[V_6:[0-9]+]] = fir.coordinate_of %[[R1DECL]]#0, _QM__fortran_builtinsT__builtin_ieee_round_type.mode
+ ! CHECK: %[[V_7:[0-9]+]] = fir.coordinate_of %[[R2DECL]]#0, _QM__fortran_builtinsT__builtin_ieee_round_type.mode
! CHECK: %[[V_8:[0-9]+]] = fir.load %[[V_6]] : !fir.ref<i8>
- ! CHECK: %[[V_9:[0-9]+]] = arith.cmpi eq, %[[V_7]], %[[V_8]] : i8
- ! CHECK: %[[V_10:[0-9]+]] = fir.call @_FortranAioOutputLogical(%{{.*}} %[[V_9]]) {{.*}} : (!fir.ref<i8>, i1) -> i1
+ ! CHECK: %[[V_9:[0-9]+]] = fir.load %[[V_7]] : !fir.ref<i8>
+ ! CHECK: %[[V_10:[0-9]+]] = arith.cmpi eq, %[[V_8]], %[[V_9]] : i8
+ ! CHECK: fir.call @_FortranAioOutputLogical(%{{.*}}, %[[V_10]]) {{.*}} : (!fir.ref<i8>, i1) -> i1
! CHECK: return
! CHECK: }
print*, r1 == r2
end
-! CHECK-LABEL: c.func @_QQmain
+! CHECK-LABEL: func.func @_QQmain
use ieee_arithmetic, only: ieee_round_type, ieee_nearest, ieee_to_zero
interface
subroutine s(r1,r2)
@@ -24,22 +26,17 @@ subroutine s(r1,r2)
end
end interface
- ! CHECK: %[[V_0:[0-9]+]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}>
- ! CHECK: %[[V_1:[0-9]+]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}>
- ! CHECK: %[[V_2:[0-9]+]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}>
- ! CHECK: %[[V_3:[0-9]+]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}>
- ! CHECK: %[[V_10:[0-9]+]] = fir.coordinate_of %[[V_3]], _QM__fortran_builtinsT__builtin_ieee_round_type.mode : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}>>) -> !fir.ref<i8>
- ! CHECK: fir.store %c0{{.*}} to %[[V_10]] : !fir.ref<i8>
- ! CHECK: %[[V_17:[0-9]+]] = fir.coordinate_of %[[V_2]], _QM__fortran_builtinsT__builtin_ieee_round_type.mode : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}>>) -> !fir.ref<i8>
- ! CHECK: fir.store %c1{{.*}} to %[[V_17]] : !fir.ref<i8>
- ! CHECK: fir.call @_QPs(%[[V_3]], %[[V_2]]) {{.*}} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}>>) -> ()
+ ! CHECK: hlfir.as_expr
+ ! CHECK: %[[ASSOC1:.*]]:3 = hlfir.associate {{.*}} {adapt.valuebyref}
+ ! CHECK: hlfir.as_expr
+ ! CHECK: %[[ASSOC2:.*]]:3 = hlfir.associate {{.*}} {adapt.valuebyref}
+ ! CHECK: fir.call @_QPs(%[[ASSOC1]]#0, %[[ASSOC2]]#0) {{.*}} : (!fir.ref<!fir.type<{{.*}}>>, !fir.ref<!fir.type<{{.*}}>>) -> ()
call s(ieee_to_zero, ieee_nearest)
- ! CHECK: %[[V_24:[0-9]+]] = fir.coordinate_of %[[V_1]], _QM__fortran_builtinsT__builtin_ieee_round_type.mode : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}>>) -> !fir.ref<i8>
- ! CHECK: fir.store %c1{{.*}} to %[[V_24]] : !fir.ref<i8>
- ! CHECK: %[[V_31:[0-9]+]] = fir.coordinate_of %[[V_0]], _QM__fortran_builtinsT__builtin_ieee_round_type.mode : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}>>) -> !fir.ref<i8>
- ! CHECK: fir.store %c1{{.*}} to %[[V_31]] : !fir.ref<i8>
- ! CHECK: fir.call @_QPs(%[[V_1]], %[[V_0]]) {{.*}} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_ieee_round_type{_QM__fortran_builtinsT__builtin_ieee_round_type.mode:i8}>>) -> ()
+ ! CHECK: hlfir.as_expr
+ ! CHECK: %[[ASSOC3:.*]]:3 = hlfir.associate {{.*}} {adapt.valuebyref}
+ ! CHECK: hlfir.as_expr
+ ! CHECK: %[[ASSOC4:.*]]:3 = hlfir.associate {{.*}} {adapt.valuebyref}
+ ! CHECK: fir.call @_QPs(%[[ASSOC3]]#0, %[[ASSOC4]]#0) {{.*}} : (!fir.ref<!fir.type<{{.*}}>>, !fir.ref<!fir.type<{{.*}}>>) -> ()
call s(ieee_nearest, ieee_nearest)
end
-
diff --git a/flang/test/Lower/Intrinsics/ieee_signbit.f90 b/flang/test/Lower/Intrinsics/ieee_signbit.f90
index b0bf52c4ae2b8..e8453683e4b09 100644
--- a/flang/test/Lower/Intrinsics/ieee_signbit.f90
+++ b/flang/test/Lower/Intrinsics/ieee_signbit.f90
@@ -1,24 +1,25 @@
-! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
-! CHECK-LABEL: c.func @_QQmain
+! CHECK-LABEL: func.func @_QQmain
use ieee_arithmetic
-! CHECK: %[[V_0:[0-9]+]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFEx"}
+! CHECK: %[[X:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFEx"}
+! CHECK: %[[XDECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFEx"}
! CHECK: %cst = arith.constant -2.000000e+00 : f32
-! CHECK: fir.store %cst to %[[V_0]] : !fir.ref<f32>
+! CHECK: hlfir.assign %cst to %[[XDECL]]#0 : f32, !fir.ref<f32>
x = -2.0
-! CHECK: %[[V_4:[0-9]+]] = fir.load %[[V_0]] : !fir.ref<f32>
-! CHECK: %[[V_5:[0-9]+]] = arith.bitcast %[[V_4]] : f32 to i32
-! CHECK: %[[V_6:[0-9]+]] = arith.shrui %[[V_5]], %c31{{.*}} : i32
-! CHECK: %[[V_7:[0-9]+]] = fir.convert %[[V_6]] : (i32) -> !fir.logical<4>
-! CHECK: %[[V_8:[0-9]+]] = fir.convert %[[V_7]] : (!fir.logical<4>) -> i1
-! CHECK: %[[V_9:[0-9]+]] = fir.call @_FortranAioOutputLogical(%{{.*}}, %[[V_8]]) {{.*}} : (!fir.ref<i8>, i1) -> i1
+! CHECK: %[[V:.*]] = fir.load %[[XDECL]]#0 : !fir.ref<f32>
+! CHECK: %[[BITS:.*]] = arith.bitcast %[[V]] : f32 to i32
+! CHECK: %[[SHIFTED:.*]] = arith.shrui %[[BITS]], %{{.*}} : i32
+! CHECK: %[[LOG:.*]] = fir.convert %[[SHIFTED]] : (i32) -> !fir.logical<4>
+! CHECK: %[[BIT:.*]] = fir.convert %[[LOG]] : (!fir.logical<4>) -> i1
+! CHECK: fir.call @_FortranAioOutputLogical(%{{.*}}, %[[BIT]]) {{.*}} : (!fir.ref<i8>, i1) -> i1
-! CHECK: %cst_0 = arith.constant 1.700000e+01 : f32
-! CHECK: %[[V_10:[0-9]+]] = arith.bitcast %cst_0 : f32 to i32
-! CHECK: %[[V_11:[0-9]+]] = arith.shrui %[[V_10]], %c31{{.*}} : i32
-! CHECK: %[[V_12:[0-9]+]] = fir.convert %[[V_11]] : (i32) -> !fir.logical<4>
-! CHECK: %[[V_13:[0-9]+]] = fir.convert %[[V_12]] : (!fir.logical<4>) -> i1
-! CHECK: %[[V_14:[0-9]+]] = fir.call @_FortranAioOutputLogical(%{{.*}}, %[[V_13]]) {{.*}} : (!fir.ref<i8>, i1) -> i1
+! CHECK: %cst{{.*}} = arith.constant 1.700000e+01 : f32
+! CHECK: %[[BITS2:.*]] = arith.bitcast %cst{{.*}} : f32 to i32
+! CHECK: %[[SHIFTED2:.*]] = arith.shrui %[[BITS2]], %{{.*}} : i32
+! CHECK: %[[LOG2:.*]] = fir.convert %[[SHIFTED2]] : (i32) -> !fir.logical<4>
+! CHECK: %[[BIT2:.*]] = fir.convert %[[LOG2]] : (!fir.logical<4>) -> i1
+! CHECK: fir.call @_FortranAioOutputLogical(%{{.*}}, %[[BIT2]]) {{.*}} : (!fir.ref<i8>, i1) -> i1
print*, ieee_signbit(x), ieee_signbit(17.0)
end
diff --git a/flang/test/Lower/Intrinsics/index.f90 b/flang/test/Lower/Intrinsics/index.f90
index 0ec8cfad83adf..0e031e83dfa45 100644
--- a/flang/test/Lower/Intrinsics/index.f90
+++ b/flang/test/Lower/Intrinsics/index.f90
@@ -1,17 +1,13 @@
-! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
-! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s
! CHECK-LABEL: func @_QPindex_test(
! CHECK-SAME: %[[s:[^:]+]]: !fir.boxchar<1>{{.*}}, %[[ss:[^:]+]]: !fir.boxchar<1>{{.*}}) -> i32
integer function index_test(s1, s2)
character(*) :: s1, s2
- ! CHECK: %[[st:[^:]*]]:2 = fir.unboxchar %[[s]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- ! CHECK: %[[sst:[^:]*]]:2 = fir.unboxchar %[[ss]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- ! CHECK: %[[a1:.*]] = fir.convert %[[st]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
- ! CHECK: %[[a2:.*]] = fir.convert %[[st]]#1 : (index) -> i64
- ! CHECK: %[[a3:.*]] = fir.convert %[[sst]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<i8>
- ! CHECK: %[[a4:.*]] = fir.convert %[[sst]]#1 : (index) -> i64
- ! CHECK: = fir.call @_FortranAIndex1(%[[a1]], %[[a2]], %[[a3]], %[[a4]], %{{.*}}) {{.*}}: (!fir.ref<i8>, i64, !fir.ref<i8>, i64, i1) -> i64
+ ! CHECK: %[[st:[^:]*]]:2 = hlfir.declare {{.*}} {uniq_name = "_QFindex_testEs1"}
+ ! CHECK: %[[sst:[^:]*]]:2 = hlfir.declare {{.*}} {uniq_name = "_QFindex_testEs2"}
+ ! CHECK: %[[res:.*]] = hlfir.index %[[sst]]#0 in %[[st]]#0 : (!fir.boxchar<1>, !fir.boxchar<1>) -> i32
+ ! CHECK: hlfir.assign %[[res]] to {{.*}} : i32, !fir.ref<i32>
index_test = index(s1, s2)
end function index_test
@@ -19,34 +15,20 @@ end function index_test
! CHECK-SAME: %[[s:[^:]+]]: !fir.boxchar<1>{{.*}}, %[[ss:[^:]+]]: !fir.boxchar<1>{{.*}}) -> i32
integer function index_test2(s1, s2)
character(*) :: s1, s2
- ! CHECK: %[[mut:.*]] = fir.alloca !fir.box<!fir.heap<i32>>
- ! CHECK: %[[st:[^:]*]]:2 = fir.unboxchar %[[s]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- ! CHECK: %[[sst:[^:]*]]:2 = fir.unboxchar %[[ss]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index)
- ! CHECK: %[[sb:.*]] = fir.embox %[[st]]#0 typeparams %[[st]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
- ! CHECK: %[[ssb:.*]] = fir.embox %[[sst]]#0 typeparams %[[sst]]#1 : (!fir.ref<!fir.char<1,?>>, index) -> !fir.box<!fir.char<1,?>>
- ! CHECK: %[[back:.*]] = fir.embox %{{.*}} : (!fir.ref<!fir.logical<4>>) -> !fir.box<!fir.logical<4>>
- ! CHECK: %[[hb:.*]] = fir.embox %{{.*}} : (!fir.heap<i32>) -> !fir.box<!fir.heap<i32>>
- ! CHECK: %[[a0:.*]] = fir.convert %[[mut]] : (!fir.ref<!fir.box<!fir.heap<i32>>>) -> !fir.ref<!fir.box<none>>
- ! CHECK: %[[a1:.*]] = fir.convert %[[sb]] : (!fir.box<!fir.char<1,?>>) -> !fir.box<none>
- ! CHECK: %[[a2:.*]] = fir.convert %[[ssb]] : (!fir.box<!fir.char<1,?>>) -> !fir.box<none>
- ! CHECK: %[[a3:.*]] = fir.convert %[[back]] : (!fir.box<!fir.logical<4>>) -> !fir.box<none>
- ! CHECK: %[[a5:.*]] = fir.convert %{{.*}} : (!fir.ref<!fir.char<1,{{.*}}>>) -> !fir.ref<i8>
- ! CHECK: fir.call @_FortranAIndex(%[[a0]], %[[a1]], %[[a2]], %[[a3]], %{{.*}}, %[[a5]], %{{.*}}) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.box<none>, !fir.box<none>, !fir.box<none>, i32, !fir.ref<i8>, i32) -> ()
+ ! CHECK: %[[st:[^:]*]]:2 = hlfir.declare {{.*}} {uniq_name = "_QFindex_test2Es1"}
+ ! CHECK: %[[sst:[^:]*]]:2 = hlfir.declare {{.*}} {uniq_name = "_QFindex_test2Es2"}
+ ! CHECK: %true = arith.constant true
+ ! CHECK: %[[res:.*]] = hlfir.index %[[sst]]#0 in %[[st]]#0 back %true : (!fir.boxchar<1>, !fir.boxchar<1>, i1) -> i32
+ ! CHECK: hlfir.assign %[[res]] to {{.*}} : i32, !fir.ref<i32>
index_test2 = index(s1, s2, .true., 4)
- ! CHECK: %[[ld1:.*]] = fir.load %[[mut]] : !fir.ref<!fir.box<!fir.heap<i32>>>
- ! CHECK: %[[ad1:.*]] = fir.box_addr %[[ld1]] : (!fir.box<!fir.heap<i32>>) -> !fir.heap<i32>
- ! CHECK: %[[ld2:.*]] = fir.load %[[ad1]] : !fir.heap<i32>
- ! CHECK: fir.freemem %[[ad1]]
end function index_test2
! CHECK-LABEL: func @_QPindex_test3
integer function index_test3(s, i)
character(*) :: s
integer :: i
- ! CHECK: %[[tmpChar:.*]] = fir.alloca !fir.char<1>
- ! CHECK: fir.store %{{.*}} to %[[tmpChar]] : !fir.ref<!fir.char<1>>
- ! CHECK: %[[tmpCast:.*]] = fir.convert %[[tmpChar]] : (!fir.ref<!fir.char<1>>) -> !fir.ref<i8>
- ! CHECK: fir.call @_FortranAIndex1(%{{.*}}, %{{.*}}, %[[tmpCast]], %{{.*}}, %{{.*}})
+ ! CHECK: %[[st:[^:]*]]:2 = hlfir.declare {{.*}} {uniq_name = "_QFindex_test3Es"}
+ ! CHECK: hlfir.index {{.*}} in %[[st]]#0 : (!hlfir.expr<!fir.char<1>>, !fir.boxchar<1>) -> i32
index_test3 = index(s, char(i))
end function
@@ -58,24 +40,19 @@ subroutine test_optional(string, substring, back)
character (*) :: string(:), substring
logical, optional :: back(:)
print *, index(string, substring, back)
-! CHECK: %[[VAL_11:.*]] = fir.is_present %[[VAL_2]] : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> i1
-! CHECK: %[[VAL_12:.*]] = fir.zero_bits !fir.ref<!fir.array<?x!fir.logical<4>>>
-! CHECK: %[[VAL_13:.*]] = arith.constant 0 : index
-! CHECK: %[[VAL_14:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_15:.*]] = fir.embox %[[VAL_12]](%[[VAL_14]]) : (!fir.ref<!fir.array<?x!fir.logical<4>>>, !fir.shape<1>) -> !fir.box<!fir.array<?x!fir.logical<4>>>
-! CHECK: %[[VAL_16:.*]] = arith.select %[[VAL_11]], %[[VAL_2]], %[[VAL_15]] : !fir.box<!fir.array<?x!fir.logical<4>>>
-! CHECK: %[[VAL_17:.*]] = fir.array_load %[[VAL_16]] {fir.optional} : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.array<?x!fir.logical<4>>
-! CHECK: %[[VAL_24:.*]] = fir.do_loop %[[VAL_25:.*]] = %{{.*}} to %{{.*}} step %{{.*}} unordered iter_args(%[[VAL_26:.*]] = %{{.*}}) -> (!fir.array<?xi32>) {
- ! CHECK: %[[VAL_31:.*]] = fir.if %[[VAL_11]] -> (!fir.logical<4>) {
- ! CHECK: %[[VAL_32:.*]] = fir.array_fetch %[[VAL_17]], %[[VAL_25]] : (!fir.array<?x!fir.logical<4>>, index) -> !fir.logical<4>
- ! CHECK: fir.result %[[VAL_32]] : !fir.logical<4>
- ! CHECK: } else {
- ! CHECK: %[[VAL_33:.*]] = arith.constant false
- ! CHECK: %[[VAL_34:.*]] = fir.convert %[[VAL_33]] : (i1) -> !fir.logical<4>
- ! CHECK: fir.result %[[VAL_34]] : !fir.logical<4>
- ! CHECK: }
- ! CHECK: %[[VAL_39:.*]] = fir.convert %[[VAL_31]] : (!fir.logical<4>) -> i1
- ! CHECK: fir.call @_FortranAIndex1(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_39]]) {{.*}}: (!fir.ref<i8>, i64, !fir.ref<i8>, i64, i1) -> i64
+! CHECK-DAG: %[[BACKDECL:.*]]:2 = hlfir.declare %[[VAL_2]] {{.*}}uniq_name = "_QFtest_optionalEback"
+! CHECK-DAG: %[[STRDECL:.*]]:2 = hlfir.declare %[[VAL_0]] {{.*}} {uniq_name = "_QFtest_optionalEstring"}
+! CHECK-DAG: %[[SUBDECL:.*]]:2 = hlfir.declare {{.*}} {uniq_name = "_QFtest_optionalEsubstring"}
+! CHECK: %[[ISPRES:.*]] = fir.is_present %[[BACKDECL]]#0
+! CHECK: hlfir.elemental {{.*}} {
+! CHECK: %[[ELEM:.*]] = hlfir.designate %[[STRDECL]]#0
+! CHECK: fir.if %[[ISPRES]] -> (!fir.logical<4>) {
+! CHECK: hlfir.designate %[[BACKDECL]]#0
+! CHECK: fir.load
+! CHECK: } else {
+! CHECK: arith.constant false
+! CHECK: fir.convert {{.*}} : (i1) -> !fir.logical<4>
+! CHECK: }
+! CHECK: hlfir.index %[[SUBDECL]]#0 in %[[ELEM]] back {{.*}} : (!fir.boxchar<1>, !fir.boxchar<1>, !fir.logical<4>) -> i32
! CHECK: }
-! CHECK: fir.array_merge_store
end subroutine
diff --git a/flang/test/Lower/Intrinsics/iparity.f90 b/flang/test/Lower/Intrinsics/iparity.f90
index fab2b07e859c0..61d4ca76f38de 100644
--- a/flang/test/Lower/Intrinsics/iparity.f90
+++ b/flang/test/Lower/Intrinsics/iparity.f90
@@ -1,4 +1,4 @@
-! RUN: bbc --use-desc-for-alloc=false -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s
! CHECK-LABEL: func @_QPiparity_test_1(
! CHECK-SAME: %[[arg0:.*]]: !fir.box<!fir.array<?xi8>>{{.*}}) -> i8 {
@@ -6,7 +6,8 @@ integer(1) function iparity_test_1(a)
integer(1) :: a(:)
! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a3:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?xi8>>) -> !fir.box<none>
+! CHECK-DAG: %[[adecl:.*]]:2 = hlfir.declare %[[arg0]] {{.*}} {uniq_name = "_QFiparity_test_1Ea"}
+! CHECK-DAG: %[[a3:.*]] = fir.convert %[[adecl]]#1 : (!fir.box<!fir.array<?xi8>>) -> !fir.box<none>
! CHECK-DAG: %[[a5:.*]] = fir.convert %[[c0]] : (index) -> i32
! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
iparity_test_1 = iparity(a)
@@ -19,7 +20,8 @@ integer(2) function iparity_test_2(a)
integer(2) :: a(:)
! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a3:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?xi16>>) -> !fir.box<none>
+! CHECK-DAG: %[[adecl:.*]]:2 = hlfir.declare %[[arg0]] {{.*}} {uniq_name = "_QFiparity_test_2Ea"}
+! CHECK-DAG: %[[a3:.*]] = fir.convert %[[adecl]]#1 : (!fir.box<!fir.array<?xi16>>) -> !fir.box<none>
! CHECK-DAG: %[[a5:.*]] = fir.convert %[[c0]] : (index) -> i32
! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
iparity_test_2 = iparity(a)
@@ -32,7 +34,8 @@ integer function iparity_test_4(a)
integer :: a(:)
! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a3:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
+! CHECK-DAG: %[[adecl:.*]]:2 = hlfir.declare %[[arg0]] {{.*}} {uniq_name = "_QFiparity_test_4Ea"}
+! CHECK-DAG: %[[a3:.*]] = fir.convert %[[adecl]]#1 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
! CHECK-DAG: %[[a5:.*]] = fir.convert %[[c0]] : (index) -> i32
! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
iparity_test_4 = iparity(a)
@@ -45,7 +48,8 @@ integer(8) function iparity_test_8(a)
integer(8) :: a(:)
! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a3:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?xi64>>) -> !fir.box<none>
+! CHECK-DAG: %[[adecl:.*]]:2 = hlfir.declare %[[arg0]] {{.*}} {uniq_name = "_QFiparity_test_8Ea"}
+! CHECK-DAG: %[[a3:.*]] = fir.convert %[[adecl]]#1 : (!fir.box<!fir.array<?xi64>>) -> !fir.box<none>
! CHECK-DAG: %[[a5:.*]] = fir.convert %[[c0]] : (index) -> i32
! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
iparity_test_8 = iparity(a)
@@ -58,7 +62,8 @@ integer(16) function iparity_test_16(a)
integer(16) :: a(:)
! CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
-! CHECK-DAG: %[[a3:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?xi128>>) -> !fir.box<none>
+! CHECK-DAG: %[[adecl:.*]]:2 = hlfir.declare %[[arg0]] {{.*}} {uniq_name = "_QFiparity_test_16Ea"}
+! CHECK-DAG: %[[a3:.*]] = fir.convert %[[adecl]]#1 : (!fir.box<!fir.array<?xi128>>) -> !fir.box<none>
! CHECK-DAG: %[[a5:.*]] = fir.convert %[[c0]] : (index) -> i32
! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
iparity_test_16 = iparity(a)
@@ -73,14 +78,12 @@ subroutine iparity_test2(a,r)
! CHECK-DAG: %[[c2_i32:.*]] = arith.constant 2 : i32
! CHECK-DAG: %[[a0:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xi32>>>
! CHECK-DAG: %[[a1:.*]] = fir.absent !fir.box<i1>
+! CHECK-DAG: %[[adecl:.*]]:2 = hlfir.declare %[[arg0]] {{.*}} {uniq_name = "_QFiparity_test2Ea"}
! CHECK-DAG: %[[a6:.*]] = fir.convert %[[a0]] : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>) -> !fir.ref<!fir.box<none>>
-! CHECK-DAG: %[[a7:.*]] = fir.convert %[[arg0]] : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<none>
+! CHECK-DAG: %[[a7:.*]] = fir.convert %[[adecl]]#1 : (!fir.box<!fir.array<?x?xi32>>) -> !fir.box<none>
! CHECK-DAG: %[[a9:.*]] = fir.convert %[[a1]] : (!fir.box<i1>) -> !fir.box<none>
r = iparity(a,dim=2)
! CHECK: fir.call @_FortranAIParityDim(%[[a6]], %[[a7]], %[[c2_i32]], %{{.*}}, %{{.*}}, %[[a9]]) {{.*}}: (!fir.ref<!fir.box<none>>, !fir.box<none>, i32, !fir.ref<i8>, i32, !fir.box<none>) -> ()
-! CHECK-DAG: %[[a11:.*]] = fir.load %[[a0]] : !fir.ref<!fir.box<!fir.heap<!fir.array<?xi32>>>>
-! CHECK-DAG: %[[a13:.*]] = fir.box_addr %[[a11]] : (!fir.box<!fir.heap<!fir.array<?xi32>>>) -> !fir.heap<!fir.array<?xi32>>
-! CHECK-DAG: fir.freemem %[[a13]]
end subroutine
! CHECK-LABEL: func @_QPiparity_test_optional(
@@ -89,8 +92,9 @@ integer function iparity_test_optional(mask, x)
integer :: x(:)
logical, optional :: mask(:)
iparity_test_optional = iparity(x, mask=mask)
-! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_0]] : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.box<none>
-! CHECK: fir.call @_FortranAIParity4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_9]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: %[[maskdecl:.*]]:2 = hlfir.declare %[[VAL_0]] {{.*}}uniq_name = "_QFiparity_test_optionalEmask"
+! CHECK: %[[masknone:.*]] = fir.convert %[[maskdecl]]#1 : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.box<none>
+! CHECK: fir.call @_FortranAIParity4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[masknone]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
end function
! CHECK-LABEL: func @_QPiparity_test_optional_2(
@@ -99,12 +103,13 @@ integer function iparity_test_optional_2(mask, x)
integer :: x(:)
logical, pointer :: mask(:)
iparity_test_optional_2 = iparity(x, mask=mask)
-! CHECK: %[[VAL_4:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>
+! CHECK: %[[maskdecl:.*]]:2 = hlfir.declare %[[VAL_0]] {{.*}}uniq_name = "_QFiparity_test_optional_2Emask"
+! CHECK: %[[VAL_4:.*]] = fir.load %[[maskdecl]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>
! CHECK: %[[VAL_5:.*]] = fir.box_addr %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>) -> !fir.ptr<!fir.array<?x!fir.logical<4>>>
! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.ptr<!fir.array<?x!fir.logical<4>>>) -> i64
! CHECK: %[[VAL_7:.*]] = arith.constant 0 : i64
! CHECK: %[[VAL_8:.*]] = arith.cmpi ne, %[[VAL_6]], %[[VAL_7]] : i64
-! CHECK: %[[VAL_9:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>
+! CHECK: %[[VAL_9:.*]] = fir.load %[[maskdecl]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>>
! CHECK: %[[VAL_10:.*]] = fir.absent !fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>
! CHECK: %[[VAL_11:.*]] = arith.select %[[VAL_8]], %[[VAL_9]], %[[VAL_10]] : !fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>
! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.box<!fir.ptr<!fir.array<?x!fir.logical<4>>>>) -> !fir.box<none>
@@ -118,9 +123,10 @@ integer function iparity_test_optional_3(mask, x)
logical, optional :: mask(10)
iparity_test_optional_3 = iparity(x, mask=mask)
! CHECK: %[[VAL_2:.*]] = arith.constant 10 : index
-! CHECK: %[[VAL_5:.*]] = fir.is_present %[[VAL_0]] : (!fir.ref<!fir.array<10x!fir.logical<4>>>) -> i1
+! CHECK: %[[maskdecl:.*]]:2 = hlfir.declare %[[VAL_0]]{{.*}} {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFiparity_test_optional_3Emask"}
+! CHECK: %[[VAL_5:.*]] = fir.is_present %[[maskdecl]]#0 : (!fir.ref<!fir.array<10x!fir.logical<4>>>) -> i1
! CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1>
-! CHECK: %[[VAL_7:.*]] = fir.embox %[[VAL_0]](%[[VAL_6]]) : (!fir.ref<!fir.array<10x!fir.logical<4>>>, !fir.shape<1>) -> !fir.box<!fir.array<10x!fir.logical<4>>>
+! CHECK: %[[VAL_7:.*]] = fir.embox %[[maskdecl]]#0(%[[VAL_6]]) : (!fir.ref<!fir.array<10x!fir.logical<4>>>, !fir.shape<1>) -> !fir.box<!fir.array<10x!fir.logical<4>>>
! CHECK: %[[VAL_8:.*]] = fir.absent !fir.box<!fir.array<10x!fir.logical<4>>>
! CHECK: %[[VAL_9:.*]] = arith.select %[[VAL_5]], %[[VAL_7]], %[[VAL_8]] : !fir.box<!fir.array<10x!fir.logical<4>>>
! CHECK: %[[VAL_18:.*]] = fir.convert %[[VAL_9]] : (!fir.box<!fir.array<10x!fir.logical<4>>>) -> !fir.box<none>
@@ -140,17 +146,11 @@ integer function iparity_test_optional_4(x, use_mask)
! CHECK: fir.call @_QPset_mask
end if
iparity_test_optional_4 = iparity(x, mask=mask)
-! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_3:.*]] : !fir.ref<!fir.heap<!fir.array<?x!fir.logical<4>>>>
-! CHECK: %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (!fir.heap<!fir.array<?x!fir.logical<4>>>) -> i64
-! CHECK: %[[VAL_22:.*]] = arith.constant 0 : i64
-! CHECK: %[[VAL_23:.*]] = arith.cmpi ne, %[[VAL_21]], %[[VAL_22]] : i64
-! CHECK: %[[VAL_24:.*]] = fir.load %[[VAL_4:.*]] : !fir.ref<index>
-! CHECK: %[[VAL_25:.*]] = fir.load %[[VAL_5:.*]] : !fir.ref<index>
-! CHECK: %[[VAL_26:.*]] = fir.load %[[VAL_3]] : !fir.ref<!fir.heap<!fir.array<?x!fir.logical<4>>>>
-! CHECK: %[[VAL_27:.*]] = fir.shape_shift %[[VAL_24]], %[[VAL_25]] : (index, index) -> !fir.shapeshift<1>
-! CHECK: %[[VAL_28:.*]] = fir.embox %[[VAL_26]](%[[VAL_27]]) : (!fir.heap<!fir.array<?x!fir.logical<4>>>, !fir.shapeshift<1>) -> !fir.box<!fir.array<?x!fir.logical<4>>>
-! CHECK: %[[VAL_29:.*]] = fir.absent !fir.box<!fir.array<?x!fir.logical<4>>>
-! CHECK: %[[VAL_30:.*]] = arith.select %[[VAL_23]], %[[VAL_28]], %[[VAL_29]] : !fir.box<!fir.array<?x!fir.logical<4>>>
-! CHECK: %[[VAL_37:.*]] = fir.convert %[[VAL_30]] : (!fir.box<!fir.array<?x!fir.logical<4>>>) -> !fir.box<none>
-! CHECK: fir.call @_FortranAIParity4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_37]]) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
+! CHECK: %[[maskbox:.*]] = fir.load %{{.*}} : !fir.ref<!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>>
+! CHECK: %[[maskaddr:.*]] = fir.box_addr %[[maskbox]] : (!fir.box<!fir.heap<!fir.array<?x!fir.logical<4>>>>) -> !fir.heap<!fir.array<?x!fir.logical<4>>>
+! CHECK: %[[maskint:.*]] = fir.convert %[[maskaddr]] : (!fir.heap<!fir.array<?x!fir.logical<4>>>) -> i64
+! CHECK: %[[zero:.*]] = arith.constant 0 : i64
+! CHECK: %[[isalloc:.*]] = arith.cmpi ne, %[[maskint]], %[[zero]] : i64
+! CHECK: arith.select %[[isalloc]], {{.*}} : !fir.box<!fir.array<?x!fir.logical<4>>>
+! CHECK: fir.call @_FortranAIParity4(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {{.*}}: (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
end function
diff --git a/flang/test/Lower/Intrinsics/is_contiguous.f90 b/flang/test/Lower/Intrinsics/is_contiguous.f90
index 67843d98b3d1f..7d8d5f415356d 100644
--- a/flang/test/Lower/Intrinsics/is_contiguous.f90
+++ b/flang/test/Lower/Intrinsics/is_contiguous.f90
@@ -1,21 +1,22 @@
-! RUN: bbc -emit-fir -hlfir=false %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir %s -o - | FileCheck %s
! CHECK-LABEL: func.func @_QPtest_is_contiguous(
! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "a"}) {
-! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.logical<4> {adapt.valuebyref}
-! CHECK: %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {adapt.valuebyref}
-! CHECK: %[[VAL_5:.*]] = fir.alloca !fir.box<!fir.ptr<!fir.array<?xf32>>> {bindc_name = "p", uniq_name = "_QFtest_is_contiguousEp"}
-! CHECK: %[[VAL_42:.*]] = fir.convert %[[VAL_0]] : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none>
+! CHECK: %[[ADECL:.*]]:2 = hlfir.declare %[[VAL_0]] {{.*}} {uniq_name = "_QFtest_is_contiguousEa"}
+! CHECK: %[[PDECL:.*]]:2 = hlfir.declare {{.*}} {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_is_contiguousEp"}
+! CHECK: %[[VAL_42:.*]] = fir.convert %[[ADECL]]#1 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none>
! CHECK: %[[VAL_43:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_42]]) {{.*}} : (!fir.box<none>) -> i1
! CHECK: %[[VAL_44:.*]] = fir.convert %[[VAL_43]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_44]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
-! CHECK: fir.call @_QPfoo1(%[[VAL_2]]) {{.*}} : (!fir.ref<!fir.logical<4>>) -> ()
-! CHECK: %[[VAL_45:.*]] = fir.load %[[VAL_5]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
+! CHECK: %[[ASSOC1:.*]]:3 = hlfir.associate %[[VAL_44]] {adapt.valuebyref}
+! CHECK: fir.call @_QPfoo1(%[[ASSOC1]]#0) {{.*}} : (!fir.ref<!fir.logical<4>>) -> ()
+! CHECK: hlfir.end_associate %[[ASSOC1]]#1, %[[ASSOC1]]#2
+! CHECK: %[[VAL_45:.*]] = fir.load %[[PDECL]]#0 : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
! CHECK: %[[VAL_46:.*]] = fir.convert %[[VAL_45]] : (!fir.box<!fir.ptr<!fir.array<?xf32>>>) -> !fir.box<none>
! CHECK: %[[VAL_47:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_46]]) {{.*}} : (!fir.box<none>) -> i1
! CHECK: %[[VAL_48:.*]] = fir.convert %[[VAL_47]] : (i1) -> !fir.logical<4>
-! CHECK: fir.store %[[VAL_48]] to %[[VAL_1]] : !fir.ref<!fir.logical<4>>
-! CHECK: fir.call @_QPfoo2(%[[VAL_1]]) {{.*}} : (!fir.ref<!fir.logical<4>>) -> ()
+! CHECK: %[[ASSOC2:.*]]:3 = hlfir.associate %[[VAL_48]] {adapt.valuebyref}
+! CHECK: fir.call @_QPfoo2(%[[ASSOC2]]#0) {{.*}} : (!fir.ref<!fir.logical<4>>) -> ()
+! CHECK: hlfir.end_associate %[[ASSOC2]]#1, %[[ASSOC2]]#2
! CHECK: return
! CHECK: }
diff --git a/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-adjustable-array.f90 b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-adjustable-array.f90
new file mode 100644
index 0000000000000..eac6580c18b99
--- /dev/null
+++ b/flang/test/Lower/OpenMP/DelayedPrivatization/target-private-adjustable-array.f90
@@ -0,0 +1,81 @@
+! Tests delayed privatization for `target ... private(...)` for adjustable arrays.
+! Tests the different allocation strategies for the private copy (CPU heap, GPU heap, and GPU stack).
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --enable-delayed-privatization-staging \
+! RUN: -mmlir --enable-gpu-heap-alloc -o - %s 2>&1 | FileCheck %s --check-prefix=CPU
+
+! RUN: %if amdgpu-registered-target %{ \
+! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir \
+! RUN: -fopenmp -fopenmp-is-target-device \
+! RUN: -mmlir --enable-delayed-privatization-staging \
+! RUN: -mmlir --enable-gpu-heap-alloc \
+! RUN: -o - %s 2>&1 | \
+! RUN: FileCheck %s --check-prefix=GPU-HEAP \
+! RUN: %}
+
+! RUN: %if amdgpu-registered-target %{ \
+! RUN: %flang_fc1 -triple amdgcn-amd-amdhsa -emit-hlfir \
+! RUN: -fopenmp -fopenmp-is-target-device \
+! RUN: -mmlir --enable-delayed-privatization-staging \
+! RUN: -o - %s 2>&1 | \
+! RUN: FileCheck %s --check-prefix=GPU-STACK \
+! RUN: %}
+
+subroutine target_adjustable_array(n_size)
+ implicit none
+ integer, intent(in) :: n_size
+ integer :: alloc_var(n_size)
+
+ !$omp target private(alloc_var)
+ alloc_var = 1
+ !$omp end target
+end subroutine target_adjustable_array
+
+! CPU-LABEL: omp.private {type = private}
+! CPU-SAME: @[[VAR_PRIVATIZER_SYM:.*]] : ![[DESC_TYPE:.*]] init {
+! CPU-NEXT: ^bb0(%[[PRIV_ARG:.*]]: ![[TYPE:.*]], %[[PRIV_ALLOC:.*]]: ![[TYPE]]):
+! CPU-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : ![[TYPE]]
+! CPU-NEXT: %[[C0:.*]] = arith.constant 0 : index
+! CPU-NEXT: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0]] : (![[DESC_TYPE]], index) -> (index, index, index)
+! CPU-NEXT: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
+! CPU-NEXT: %[[PRIVATE_MEM:.*]] = fir.allocmem !fir.array<?xi32>, %[[BOX_DIMS]]#1
+! CPU-NEXT: %4:2 = hlfir.declare %3(%2) {[[NAME_ATTR:.*]]} : (![[HEAP_ARRAY_TYPE:.*]], !fir.shape<1>) -> (![[DESC_TYPE]], ![[HEAP_ARRAY_TYPE]])
+! CPU: omp.yield(%[[PRIV_ALLOC]] : ![[TYPE]])
+! CPU-NEXT: } dealloc {
+! CPU-NEXT: ^bb0(%[[PRIV_ARG:.*]]: ![[TYPE]]):
+! CPU-NEXT: %[[PRIV_ARG_VAL1:.*]] = fir.load %[[PRIV_ARG]] : ![[TYPE]]
+! CPU-NEXT: %[[ALLOC_ADDR:.*]] = fir.box_addr %[[PRIV_ARG_VAL1]] : (![[DESC_TYPE]]) -> ![[REF_ARRAY_TYPE:.*]]
+! CPU: %[[CONV:.*]] = fir.convert %[[ALLOC_ADDR]] : (![[REF_ARRAY_TYPE]]) -> ![[HEAP_ARRAY_TYPE]]
+! CPU-NEXT: fir.freemem %[[CONV]] : ![[HEAP_ARRAY_TYPE]]
+! CPU: omp.yield
+! CPU-NEXT: }
+
+! GPU-HEAP-LABEL: omp.private {type = private}
+! GPU-HEAP-SAME: @[[VAR_PRIVATIZER_SYM:.*]] : ![[DESC_TYPE:.*]] init {
+! GPU-HEAP-NEXT: ^bb0(%[[PRIV_ARG:.*]]: ![[TYPE:.*]], %[[PRIV_ALLOC:.*]]: ![[TYPE]]):
+! GPU-HEAP-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : ![[TYPE]]
+! GPU-HEAP-NEXT: %[[C0:.*]] = arith.constant 0 : index
+! GPU-HEAP-NEXT: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0]] : (![[DESC_TYPE]], index) -> (index, index, index)
+! GPU-HEAP-NEXT: %[[SHAPE:.*]] = fir.shape %[[BOX_DIMS]]#1 : (index) -> !fir.shape<1>
+! GPU-HEAP-NEXT: %[[PRIVATE_MEM:.*]] = fir.allocmem !fir.array<?xi32>, %[[BOX_DIMS]]#1
+! GPU-HEAP-NEXT: %4:2 = hlfir.declare %3(%2) {[[NAME_ATTR:.*]]} : (![[HEAP_ARRAY_TYPE:.*]], !fir.shape<1>) -> (![[DESC_TYPE]], ![[HEAP_ARRAY_TYPE]])
+! GPU-HEAP: omp.yield(%[[PRIV_ALLOC]] : ![[TYPE]])
+! GPU-HEAP-NEXT: } dealloc {
+! GPU-HEAP-NEXT: ^bb0(%[[PRIV_ARG:.*]]: ![[TYPE]]):
+! GPU-HEAP-NEXT: %[[PRIV_ARG_VAL1:.*]] = fir.load %[[PRIV_ARG]] : ![[TYPE]]
+! GPU-HEAP-NEXT: %[[ALLOC_ADDR:.*]] = fir.box_addr %[[PRIV_ARG_VAL1]] : (![[DESC_TYPE]]) -> ![[REF_ARRAY_TYPE:.*]]
+! GPU-HEAP: %[[CONV:.*]] = fir.convert %[[ALLOC_ADDR]] : (![[REF_ARRAY_TYPE]]) -> ![[HEAP_ARRAY_TYPE]]
+! GPU-HEAP-NEXT: fir.freemem %[[CONV]] : ![[HEAP_ARRAY_TYPE]]
+! GPU-HEAP: omp.yield
+! GPU-HEAP-NEXT: }
+
+! GPU-STACK-LABEL: omp.private {type = private}
+! GPU-STACK-SAME: @[[VAR_PRIVATIZER_SYM:.*]] : ![[DESC_TYPE:.*]] init {
+! GPU-STACK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: ![[TYPE:.*]], %[[PRIV_ALLOC:.*]]: ![[TYPE]]):
+! GPU-STACK-NEXT: %[[PRIV_ARG_VAL:.*]] = fir.load %[[PRIV_ARG]] : ![[TYPE]]
+! GPU-STACK-NEXT: %[[C0:.*]] = arith.constant 0 : index
+! GPU-STACK-NEXT: %[[BOX_DIMS:.*]]:3 = fir.box_dims %[[PRIV_ARG_VAL]], %[[C0]] : (![[DESC_TYPE]], index) -> (index, index, index)
+! GPU-STACK-NOT: %[[PRIVATE_MEM:.*]] = fir.allocmem
+! GPU-STACK: %[[ALLOCA_ADDR:.*]] = fir.alloca !fir.array<?xi32>, %[[BOX_DIMS]]#1 {[[NAME_ATTR:.*]]}
+! GPU-STACK: omp.yield(%[[PRIV_ALLOC]] : ![[TYPE]])
+
diff --git a/flang/test/Lower/OpenMP/Todo/interchange.f90 b/flang/test/Lower/OpenMP/Todo/interchange.f90
new file mode 100644
index 0000000000000..123b8fc657ed9
--- /dev/null
+++ b/flang/test/Lower/OpenMP/Todo/interchange.f90
@@ -0,0 +1,15 @@
+! Tests that lowering the OpenMP INTERCHANGE construct reports a "not yet implemented" TODO.
+
+! RUN: %not_todo_cmd %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s
+
+subroutine foo
+ implicit none
+ integer :: j, i
+
+ !CHECK: not yet implemented: OpenMP Interchange
+ !$omp interchange
+ do i=1,10
+ do j=1,10
+ end do
+ end do
+end subroutine
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf b/flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf
deleted file mode 100644
index c62f1c4173145..0000000000000
--- a/flang/test/Lower/OpenMP/delayed-privatization-cuda-device-array.cuf
+++ /dev/null
@@ -1,31 +0,0 @@
-! Test that OpenMP privatization of CUDA Fortran device arrays uses cuf.alloc
-! instead of fir.allocmem so the private copy resides in device memory.
-
-! RUN: bbc -emit-hlfir -fcuda -fopenmp %s -o - | FileCheck %s
-
-subroutine omp_private_device_array()
- implicit none
- integer(4), device :: a(8)
-
- !$omp parallel private(a)
- a(1) = 42
- !$omp end parallel
-end subroutine
-
-! CHECK-LABEL: omp.private {type = private}
-! CHECK-SAME: @[[PRIVATIZER:.*]] : !fir.box<!fir.array<8xi32>> init {
-
-! CHECK-NEXT: ^bb0(%[[MOLD:.*]]: !fir.ref<!fir.box<!fir.array<8xi32>>>, %[[PRIV:.*]]: !fir.ref<!fir.box<!fir.array<8xi32>>>):
-! CHECK-NEXT: %[[C8:.*]] = arith.constant 8 : index
-! CHECK-NEXT: %[[SHAPE:.*]] = fir.shape %[[C8]]
-! CHECK-NEXT: %[[ALLOC:.*]] = cuf.alloc !fir.array<8xi32> {bindc_name = ".tmp", data_attr = #cuf.cuda<device>}
-! CHECK-NEXT: %[[DECL:.*]]:2 = hlfir.declare %[[ALLOC]](%[[SHAPE]]) {data_attr = #cuf.cuda<device>, uniq_name = ".tmp"}
-! CHECK: fir.embox
-! CHECK: fir.store
-! CHECK-NEXT: omp.yield
-
-! CHECK: } dealloc {
-! CHECK-NEXT: ^bb0(%[[DEALLOC_ARG:.*]]: !fir.ref<!fir.box<!fir.array<8xi32>>>):
-! CHECK: cuf.free %{{.*}} {data_attr = #cuf.cuda<device>}
-! CHECK: omp.yield
-! CHECK-NEXT: }
diff --git a/flang/test/Lower/PowerPC/ppc-vec-store-elem-order.f90 b/flang/test/Lower/PowerPC/ppc-vec-store-elem-order.f90
index 947c8b1c7eb2c..b7dd2620568d3 100644
--- a/flang/test/Lower/PowerPC/ppc-vec-store-elem-order.f90
+++ b/flang/test/Lower/PowerPC/ppc-vec-store-elem-order.f90
@@ -67,11 +67,11 @@ subroutine vec_xstd2_test(arg1, arg2, arg3, i)
! LLVMIR: %[[i:.*]] = load i32, ptr %3, align 4
! LLVMIR: %[[iext:.*]] = sext i32 %[[i]] to i64
-! LLVMIR: %[[isub:.*]] = sub nsw i64 %[[iext]], 1
-! LLVMIR: %[[imul1:.*]] = mul nsw i64 %[[isub]], 1
-! LLVMIR: %[[imul2:.*]] = mul nsw i64 %[[imul1]], 1
-! LLVMIR: %[[iadd:.*]] = add nsw i64 %[[imul2]], 0
-! LLVMIR: %[[gep1:.*]] = getelementptr <4 x float>, ptr %2, i64 %[[iadd]]
+! LLVMIR: %[[isub:.*]] = sub nuw nsw i64 %[[iext]], 1
+! LLVMIR: %[[imul1:.*]] = mul nuw nsw i64 %[[isub]], 1
+! LLVMIR: %[[imul2:.*]] = mul nuw nsw i64 %[[imul1]], 1
+! LLVMIR: %[[iadd:.*]] = add nuw nsw i64 %[[imul2]], 0
+! LLVMIR: %[[gep1:.*]] = getelementptr nusw nuw <4 x float>, ptr %2, i64 %[[iadd]]
! LLVMIR: %[[arg1:.*]] = load <4 x float>, ptr %0, align 16
! LLVMIR: %[[arg2:.*]] = load i16, ptr %1, align 2
! LLVMIR: %[[gep2:.*]] = getelementptr i8, ptr %[[gep1]], i16 %[[arg2]]
@@ -93,11 +93,11 @@ subroutine vec_xstw4_test(arg1, arg2, arg3, i)
! LLVMIR: %[[i:.*]] = load i32, ptr %3, align 4
! LLVMIR: %[[iext:.*]] = sext i32 %[[i]] to i64
-! LLVMIR: %[[isub:.*]] = sub nsw i64 %[[iext]], 1
-! LLVMIR: %[[imul1:.*]] = mul nsw i64 %[[isub]], 1
-! LLVMIR: %[[imul2:.*]] = mul nsw i64 %[[imul1]], 1
-! LLVMIR: %[[iadd:.*]] = add nsw i64 %[[imul2]], 0
-! LLVMIR: %[[gep1:.*]] = getelementptr <4 x float>, ptr %2, i64 %[[iadd]]
+! LLVMIR: %[[isub:.*]] = sub nuw nsw i64 %[[iext]], 1
+! LLVMIR: %[[imul1:.*]] = mul nuw nsw i64 %[[isub]], 1
+! LLVMIR: %[[imul2:.*]] = mul nuw nsw i64 %[[imul1]], 1
+! LLVMIR: %[[iadd:.*]] = add nuw nsw i64 %[[imul2]], 0
+! LLVMIR: %[[gep1:.*]] = getelementptr nusw nuw <4 x float>, ptr %2, i64 %[[iadd]]
! LLVMIR: %[[arg1:.*]] = load <4 x float>, ptr %0, align 16
! LLVMIR: %[[arg2:.*]] = load i16, ptr %1, align 2
! LLVMIR: %[[gep2:.*]] = getelementptr i8, ptr %[[gep1]], i16 %[[arg2]]
diff --git a/flang/test/Lower/PowerPC/ppc-vec-store.f90 b/flang/test/Lower/PowerPC/ppc-vec-store.f90
index 1c3ab9638f117..ca715f6fe525c 100644
--- a/flang/test/Lower/PowerPC/ppc-vec-store.f90
+++ b/flang/test/Lower/PowerPC/ppc-vec-store.f90
@@ -89,11 +89,11 @@ subroutine vec_st_vi4i4via4(arg1, arg2, arg3, i)
! LLVMIR: %[[i:.*]] = load i32, ptr %3, align 4
! LLVMIR: %[[iext:.*]] = sext i32 %[[i]] to i64
-! LLVMIR: %[[iextsub:.*]] = sub nsw i64 %[[iext]], 1
-! LLVMIR: %[[iextmul:.*]] = mul nsw i64 %[[iextsub]], 1
-! LLVMIR: %[[iextmul2:.*]] = mul nsw i64 %[[iextmul]], 1
-! LLVMIR: %[[iextadd:.*]] = add nsw i64 %[[iextmul2]], 0
-! LLVMIR: %[[gep1:.*]] = getelementptr <4 x i32>, ptr %2, i64 %[[iextadd]]
+! LLVMIR: %[[iextsub:.*]] = sub nuw nsw i64 %[[iext]], 1
+! LLVMIR: %[[iextmul:.*]] = mul nuw nsw i64 %[[iextsub]], 1
+! LLVMIR: %[[iextmul2:.*]] = mul nuw nsw i64 %[[iextmul]], 1
+! LLVMIR: %[[iextadd:.*]] = add nuw nsw i64 %[[iextmul2]], 0
+! LLVMIR: %[[gep1:.*]] = getelementptr nusw nuw <4 x i32>, ptr %2, i64 %[[iextadd]]
! LLVMIR: %[[arg1:.*]] = load <4 x i32>, ptr %0, align 16
! LLVMIR: %[[arg2:.*]] = load i32, ptr %1, align 4
! LLVMIR: %[[gep2:.*]] = getelementptr i8, ptr %[[gep1]], i32 %[[arg2]]
@@ -206,11 +206,11 @@ subroutine vec_ste_vi4i4ia4(arg1, arg2, arg3, i)
! LLVMIR: %[[i:.*]] = load i32, ptr %3, align 4
! LLVMIR: %[[iext:.*]] = sext i32 %[[i]] to i64
-! LLVMIR: %[[isub:.*]] = sub nsw i64 %[[iext]], 1
-! LLVMIR: %[[imul1:.*]] = mul nsw i64 %[[isub]], 1
-! LLVMIR: %[[imul2:.*]] = mul nsw i64 %[[imul1]], 1
-! LLVMIR: %[[iadd:.*]] = add nsw i64 %[[imul2]], 0
-! LLVMIR: %[[gep1:.*]] = getelementptr i32, ptr %2, i64 %[[iadd]]
+! LLVMIR: %[[isub:.*]] = sub nuw nsw i64 %[[iext]], 1
+! LLVMIR: %[[imul1:.*]] = mul nuw nsw i64 %[[isub]], 1
+! LLVMIR: %[[imul2:.*]] = mul nuw nsw i64 %[[imul1]], 1
+! LLVMIR: %[[iadd:.*]] = add nuw nsw i64 %[[imul2]], 0
+! LLVMIR: %[[gep1:.*]] = getelementptr nusw nuw i32, ptr %2, i64 %[[iadd]]
! LLVMIR: %[[arg1:.*]] = load <4 x i32>, ptr %0, align 16
! LLVMIR: %[[arg2:.*]] = load i32, ptr %1, align 4
! LLVMIR: %[[gep2:.*]] = getelementptr i8, ptr %[[gep1]], i32 %[[arg2]]
@@ -244,11 +244,11 @@ subroutine vec_stxv_test_vi4i8ia4(arg1, arg2, arg3, i)
! LLVMIR: %[[i:.*]] = load i32, ptr %3, align 4
! LLVMIR: %[[iext:.*]] = sext i32 %[[i]] to i64
-! LLVMIR: %[[isub:.*]] = sub nsw i64 %[[iext]], 1
-! LLVMIR: %[[imul1:.*]] = mul nsw i64 %[[isub]], 1
-! LLVMIR: %[[imul2:.*]] = mul nsw i64 %[[imul1]], 1
-! LLVMIR: %[[iadd:.*]] = add nsw i64 %[[imul2]], 0
-! LLVMIR: %[[gep1:.*]] = getelementptr i32, ptr %2, i64 %[[iadd]]
+! LLVMIR: %[[isub:.*]] = sub nuw nsw i64 %[[iext]], 1
+! LLVMIR: %[[imul1:.*]] = mul nuw nsw i64 %[[isub]], 1
+! LLVMIR: %[[imul2:.*]] = mul nuw nsw i64 %[[imul1]], 1
+! LLVMIR: %[[iadd:.*]] = add nuw nsw i64 %[[imul2]], 0
+! LLVMIR: %[[gep1:.*]] = getelementptr nusw nuw i32, ptr %2, i64 %[[iadd]]
! LLVMIR: %[[arg1:.*]] = load <4 x i32>, ptr %0, align 16
! LLVMIR: %[[arg2:.*]] = load i64, ptr %1, align 8
! LLVMIR: %[[gep2:.*]] = getelementptr i8, ptr %[[gep1]], i64 %[[arg2]]
@@ -278,11 +278,11 @@ subroutine vec_stxv_test_vi4i4vai4(arg1, arg2, arg3, i)
! LLVMIR: %[[i:.*]] = load i32, ptr %3, align 4
! LLVMIR: %[[iext:.*]] = sext i32 %[[i]] to i64
-! LLVMIR: %[[isub:.*]] = sub nsw i64 %[[iext]], 1
-! LLVMIR: %[[imul1:.*]] = mul nsw i64 %[[isub]], 1
-! LLVMIR: %[[imul2:.*]] = mul nsw i64 %[[imul1]], 1
-! LLVMIR: %[[iadd:.*]] = add nsw i64 %[[imul2]], 0
-! LLVMIR: %[[gep1:.*]] = getelementptr <4 x i32>, ptr %2, i64 %[[iadd]]
+! LLVMIR: %[[isub:.*]] = sub nuw nsw i64 %[[iext]], 1
+! LLVMIR: %[[imul1:.*]] = mul nuw nsw i64 %[[isub]], 1
+! LLVMIR: %[[imul2:.*]] = mul nuw nsw i64 %[[imul1]], 1
+! LLVMIR: %[[iadd:.*]] = add nuw nsw i64 %[[imul2]], 0
+! LLVMIR: %[[gep1:.*]] = getelementptr nusw nuw <4 x i32>, ptr %2, i64 %[[iadd]]
! LLVMIR: %[[arg1:.*]] = load <4 x i32>, ptr %0, align 16
! LLVMIR: %[[arg2:.*]] = load i32, ptr %1, align 4
! LLVMIR: %[[gep2:.*]] = getelementptr i8, ptr %[[gep1]], i32 %[[arg2]]
@@ -317,11 +317,11 @@ subroutine vec_xst_test_vi4i8ia4(arg1, arg2, arg3, i)
! LLVMIR: %[[i:.*]] = load i32, ptr %3, align 4
! LLVMIR: %[[iext:.*]] = sext i32 %[[i]] to i64
-! LLVMIR: %[[isub:.*]] = sub nsw i64 %[[iext]], 1
-! LLVMIR: %[[imul1:.*]] = mul nsw i64 %[[isub]], 1
-! LLVMIR: %[[imul2:.*]] = mul nsw i64 %[[imul1]], 1
-! LLVMIR: %[[iadd:.*]] = add nsw i64 %[[imul2]], 0
-! LLVMIR: %[[gep1:.*]] = getelementptr i32, ptr %2, i64 %[[iadd]]
+! LLVMIR: %[[isub:.*]] = sub nuw nsw i64 %[[iext]], 1
+! LLVMIR: %[[imul1:.*]] = mul nuw nsw i64 %[[isub]], 1
+! LLVMIR: %[[imul2:.*]] = mul nuw nsw i64 %[[imul1]], 1
+! LLVMIR: %[[iadd:.*]] = add nuw nsw i64 %[[imul2]], 0
+! LLVMIR: %[[gep1:.*]] = getelementptr nusw nuw i32, ptr %2, i64 %[[iadd]]
! LLVMIR: %[[arg1:.*]] = load <4 x i32>, ptr %0, align 16
! LLVMIR: %[[arg2:.*]] = load i64, ptr %1, align 8
! LLVMIR: %[[gep2:.*]] = getelementptr i8, ptr %[[gep1]], i64 %[[arg2]]
@@ -351,11 +351,11 @@ subroutine vec_xst_test_vi4i4vai4(arg1, arg2, arg3, i)
! LLVMIR: %[[i:.*]] = load i32, ptr %3, align 4
! LLVMIR: %[[iext:.*]] = sext i32 %[[i]] to i64
-! LLVMIR: %[[isub:.*]] = sub nsw i64 %[[iext]], 1
-! LLVMIR: %[[imul1:.*]] = mul nsw i64 %[[isub]], 1
-! LLVMIR: %[[imul2:.*]] = mul nsw i64 %[[imul1]], 1
-! LLVMIR: %[[iadd:.*]] = add nsw i64 %[[imul2]], 0
-! LLVMIR: %[[gep1:.*]] = getelementptr <4 x i32>, ptr %2, i64 %[[iadd]]
+! LLVMIR: %[[isub:.*]] = sub nuw nsw i64 %[[iext]], 1
+! LLVMIR: %[[imul1:.*]] = mul nuw nsw i64 %[[isub]], 1
+! LLVMIR: %[[imul2:.*]] = mul nuw nsw i64 %[[imul1]], 1
+! LLVMIR: %[[iadd:.*]] = add nuw nsw i64 %[[imul2]], 0
+! LLVMIR: %[[gep1:.*]] = getelementptr nusw nuw <4 x i32>, ptr %2, i64 %[[iadd]]
! LLVMIR: %[[arg1:.*]] = load <4 x i32>, ptr %0, align 16
! LLVMIR: %[[arg2:.*]] = load i32, ptr %1, align 4
! LLVMIR: %[[gep2:.*]] = getelementptr i8, ptr %[[gep1]], i32 %[[arg2]]
@@ -390,11 +390,11 @@ subroutine vec_xst_be_test_vi4i8ia4(arg1, arg2, arg3, i)
! LLVMIR: %[[i:.*]] = load i32, ptr %3, align 4
! LLVMIR: %[[iext:.*]] = sext i32 %[[i]] to i64
-! LLVMIR: %[[isub:.*]] = sub nsw i64 %[[iext]], 1
-! LLVMIR: %[[imul1:.*]] = mul nsw i64 %[[isub]], 1
-! LLVMIR: %[[imul2:.*]] = mul nsw i64 %[[imul1]], 1
-! LLVMIR: %[[iadd:.*]] = add nsw i64 %[[imul2]], 0
-! LLVMIR: %[[gep1:.*]] = getelementptr i32, ptr %2, i64 %[[iadd]]
+! LLVMIR: %[[isub:.*]] = sub nuw nsw i64 %[[iext]], 1
+! LLVMIR: %[[imul1:.*]] = mul nuw nsw i64 %[[isub]], 1
+! LLVMIR: %[[imul2:.*]] = mul nuw nsw i64 %[[imul1]], 1
+! LLVMIR: %[[iadd:.*]] = add nuw nsw i64 %[[imul2]], 0
+! LLVMIR: %[[gep1:.*]] = getelementptr nusw nuw i32, ptr %2, i64 %[[iadd]]
! LLVMIR: %[[arg1:.*]] = load <4 x i32>, ptr %0, align 16
! LLVMIR: %[[arg2:.*]] = load i64, ptr %1, align 8
! LLVMIR: %[[gep2:.*]] = getelementptr i8, ptr %[[gep1]], i64 %[[arg2]]
@@ -426,11 +426,11 @@ subroutine vec_xst_be_test_vi4i4vai4(arg1, arg2, arg3, i)
! LLVMIR: %[[i:.*]] = load i32, ptr %3, align 4
! LLVMIR: %[[iext:.*]] = sext i32 %[[i]] to i64
-! LLVMIR: %[[isub:.*]] = sub nsw i64 %[[iext]], 1
-! LLVMIR: %[[imul1:.*]] = mul nsw i64 %[[isub]], 1
-! LLVMIR: %[[imul2:.*]] = mul nsw i64 %[[imul1]], 1
-! LLVMIR: %[[iadd:.*]] = add nsw i64 %[[imul2]], 0
-! LLVMIR: %[[gep1:.*]] = getelementptr <4 x i32>, ptr %2, i64 %[[iadd]]
+! LLVMIR: %[[isub:.*]] = sub nuw nsw i64 %[[iext]], 1
+! LLVMIR: %[[imul1:.*]] = mul nuw nsw i64 %[[isub]], 1
+! LLVMIR: %[[imul2:.*]] = mul nuw nsw i64 %[[imul1]], 1
+! LLVMIR: %[[iadd:.*]] = add nuw nsw i64 %[[imul2]], 0
+! LLVMIR: %[[gep1:.*]] = getelementptr nusw nuw <4 x i32>, ptr %2, i64 %[[iadd]]
! LLVMIR: %[[arg1:.*]] = load <4 x i32>, ptr %0, align 16
! LLVMIR: %[[arg2:.*]] = load i32, ptr %1, align 4
! LLVMIR: %[[gep2:.*]] = getelementptr i8, ptr %[[gep1]], i32 %[[arg2]]
@@ -467,11 +467,11 @@ subroutine vec_xstd2_test_vi4i8ia4(arg1, arg2, arg3, i)
! LLVMIR: %[[i:.*]] = load i32, ptr %3, align 4
! LLVMIR: %[[iext:.*]] = sext i32 %[[i]] to i64
-! LLVMIR: %[[isub:.*]] = sub nsw i64 %[[iext]], 1
-! LLVMIR: %[[imul1:.*]] = mul nsw i64 %[[isub]], 1
-! LLVMIR: %[[imul2:.*]] = mul nsw i64 %[[imul1]], 1
-! LLVMIR: %[[iadd:.*]] = add nsw i64 %[[imul2]], 0
-! LLVMIR: %[[gep1:.*]] = getelementptr i32, ptr %2, i64 %[[iadd]]
+! LLVMIR: %[[isub:.*]] = sub nuw nsw i64 %[[iext]], 1
+! LLVMIR: %[[imul1:.*]] = mul nuw nsw i64 %[[isub]], 1
+! LLVMIR: %[[imul2:.*]] = mul nuw nsw i64 %[[imul1]], 1
+! LLVMIR: %[[iadd:.*]] = add nuw nsw i64 %[[imul2]], 0
+! LLVMIR: %[[gep1:.*]] = getelementptr nusw nuw i32, ptr %2, i64 %[[iadd]]
! LLVMIR: %[[arg1:.*]] = load <4 x i32>, ptr %0, align 16
! LLVMIR: %[[arg2:.*]] = load i64, ptr %1, align 8
! LLVMIR: %[[gep2:.*]] = getelementptr i8, ptr %[[gep1]], i64 %[[arg2]]
@@ -503,11 +503,11 @@ subroutine vec_xstd2_test_vi4i4vai4(arg1, arg2, arg3, i)
! LLVMIR: %[[i:.*]] = load i32, ptr %3, align 4
! LLVMIR: %[[iext:.*]] = sext i32 %[[i]] to i64
-! LLVMIR: %[[isub:.*]] = sub nsw i64 %[[iext]], 1
-! LLVMIR: %[[imul1:.*]] = mul nsw i64 %[[isub]], 1
-! LLVMIR: %[[imul2:.*]] = mul nsw i64 %[[imul1]], 1
-! LLVMIR: %[[iadd:.*]] = add nsw i64 %[[imul2]], 0
-! LLVMIR: %[[gep1:.*]] = getelementptr <4 x i32>, ptr %2, i64 %[[iadd]]
+! LLVMIR: %[[isub:.*]] = sub nuw nsw i64 %[[iext]], 1
+! LLVMIR: %[[imul1:.*]] = mul nuw nsw i64 %[[isub]], 1
+! LLVMIR: %[[imul2:.*]] = mul nuw nsw i64 %[[imul1]], 1
+! LLVMIR: %[[iadd:.*]] = add nuw nsw i64 %[[imul2]], 0
+! LLVMIR: %[[gep1:.*]] = getelementptr nusw nuw <4 x i32>, ptr %2, i64 %[[iadd]]
! LLVMIR: %[[arg1:.*]] = load <4 x i32>, ptr %0, align 16
! LLVMIR: %[[arg2:.*]] = load i32, ptr %1, align 4
! LLVMIR: %[[gep2:.*]] = getelementptr i8, ptr %[[gep1]], i32 %[[arg2]]
@@ -543,11 +543,11 @@ subroutine vec_xstw4_test_vi4i8ia4(arg1, arg2, arg3, i)
! LLVMIR: %[[i:.*]] = load i32, ptr %3, align 4
! LLVMIR: %[[iext:.*]] = sext i32 %[[i]] to i64
-! LLVMIR: %[[isub:.*]] = sub nsw i64 %[[iext]], 1
-! LLVMIR: %[[imul1:.*]] = mul nsw i64 %[[isub]], 1
-! LLVMIR: %[[imul2:.*]] = mul nsw i64 %[[imul1]], 1
-! LLVMIR: %[[iadd:.*]] = add nsw i64 %[[imul2]], 0
-! LLVMIR: %[[gep1:.*]] = getelementptr i32, ptr %2, i64 %[[iadd]]
+! LLVMIR: %[[isub:.*]] = sub nuw nsw i64 %[[iext]], 1
+! LLVMIR: %[[imul1:.*]] = mul nuw nsw i64 %[[isub]], 1
+! LLVMIR: %[[imul2:.*]] = mul nuw nsw i64 %[[imul1]], 1
+! LLVMIR: %[[iadd:.*]] = add nuw nsw i64 %[[imul2]], 0
+! LLVMIR: %[[gep1:.*]] = getelementptr nusw nuw i32, ptr %2, i64 %[[iadd]]
! LLVMIR: %[[arg1:.*]] = load <4 x i32>, ptr %0, align 16
! LLVMIR: %[[arg2:.*]] = load i64, ptr %1, align 8
! LLVMIR: %[[gep2:.*]] = getelementptr i8, ptr %[[gep1]], i64 %[[arg2]]
@@ -578,11 +578,11 @@ subroutine vec_xstw4_test_vi4i4vai4(arg1, arg2, arg3, i)
! LLVMIR: %[[i:.*]] = load i32, ptr %3, align 4
! LLVMIR: %[[iext:.*]] = sext i32 %[[i]] to i64
-! LLVMIR: %[[isub:.*]] = sub nsw i64 %[[iext]], 1
-! LLVMIR: %[[imul1:.*]] = mul nsw i64 %[[isub]], 1
-! LLVMIR: %[[imul2:.*]] = mul nsw i64 %[[imul1]], 1
-! LLVMIR: %[[iadd:.*]] = add nsw i64 %[[imul2]], 0
-! LLVMIR: %[[gep1:.*]] = getelementptr <4 x i32>, ptr %2, i64 %[[iadd]]
+! LLVMIR: %[[isub:.*]] = sub nuw nsw i64 %[[iext]], 1
+! LLVMIR: %[[imul1:.*]] = mul nuw nsw i64 %[[isub]], 1
+! LLVMIR: %[[imul2:.*]] = mul nuw nsw i64 %[[imul1]], 1
+! LLVMIR: %[[iadd:.*]] = add nuw nsw i64 %[[imul2]], 0
+! LLVMIR: %[[gep1:.*]] = getelementptr nusw nuw <4 x i32>, ptr %2, i64 %[[iadd]]
! LLVMIR: %[[arg1:.*]] = load <4 x i32>, ptr %0, align 16
! LLVMIR: %[[arg2:.*]] = load i32, ptr %1, align 4
! LLVMIR: %[[gep2:.*]] = getelementptr i8, ptr %[[gep1]], i32 %[[arg2]]
diff --git a/flang/test/Parser/OpenMP/do-interchange.f90 b/flang/test/Parser/OpenMP/do-interchange.f90
new file mode 100644
index 0000000000000..e5fbc288cef39
--- /dev/null
+++ b/flang/test/Parser/OpenMP/do-interchange.f90
@@ -0,0 +1,34 @@
+! RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=60 %s | FileCheck --ignore-case %s
+! RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=60 %s | FileCheck --check-prefix="PARSE-TREE" %s
+
+subroutine openmp_do_interchange(x)
+
+ integer :: x, y
+
+!CHECK: !$omp do
+!CHECK: !$omp interchange permutation
+!$omp do
+!$omp interchange permutation(2,1)
+!CHECK: do
+ do x = 1, 100
+ !CHECK: do
+ do y = 1, 100
+ call F1()
+ !CHECK: end do
+ end do
+!CHECK: end do
+ end do
+!CHECK: !$omp end interchange
+!$omp end interchange
+!$omp end do
+
+!PARSE-TREE:| | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+!PARSE-TREE:| | | OmpBeginLoopDirective
+!PARSE-TREE:| | | Block
+!PARSE-TREE:| | | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPLoopConstruct
+!PARSE-TREE:| | | | | OmpBeginLoopDirective
+!PARSE-TREE:| | | | | | OmpDirectiveName -> llvm::omp::Directive = interchange
+!PARSE-TREE:| | | | | Block
+!PARSE-TREE:| | | | | | ExecutionPartConstruct -> ExecutableConstruct -> DoConstruct
+
+END subroutine openmp_do_interchange
diff --git a/flang/test/Parser/OpenMP/interchange-fail.f90 b/flang/test/Parser/OpenMP/interchange-fail.f90
new file mode 100644
index 0000000000000..d83ef1746f30f
--- /dev/null
+++ b/flang/test/Parser/OpenMP/interchange-fail.f90
@@ -0,0 +1,31 @@
+! RUN: split-file %s %t
+! RUN: not %flang_fc1 -fsyntax-only -fopenmp -fopenmp-version=60 %t/stray_end1.f90 2>&1 | FileCheck %t/stray_end1.f90
+! RUN: not %flang_fc1 -fsyntax-only -fopenmp -fopenmp-version=60 %t/stray_end2.f90 2>&1 | FileCheck %t/stray_end2.f90
+! RUN: not %flang_fc1 -fsyntax-only -fopenmp -fopenmp-version=60 %t/stray_begin.f90 2>&1 | FileCheck %t/stray_begin.f90
+
+
+!--- stray_end1.f90
+! Parser error
+
+subroutine stray_end1
+ !CHECK: error: Misplaced OpenMP end-directive
+ !$omp end interchange
+end subroutine
+
+
+!--- stray_end2.f90
+
+subroutine stray_end2
+ print *
+ !CHECK: error: Misplaced OpenMP end-directive
+ !$omp end interchange
+end subroutine
+
+
+!--- stray_begin.f90
+
+subroutine stray_begin
+ !CHECK: error: This construct should contain a DO-loop or a loop-nest-generating OpenMP construct
+ !$omp interchange permutation(2,1)
+end subroutine
+
diff --git a/flang/test/Parser/OpenMP/interchange-permutation.f90 b/flang/test/Parser/OpenMP/interchange-permutation.f90
new file mode 100644
index 0000000000000..53392e050a662
--- /dev/null
+++ b/flang/test/Parser/OpenMP/interchange-permutation.f90
@@ -0,0 +1,35 @@
+! RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=60 %s | FileCheck --ignore-case %s
+! RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=60 %s | FileCheck --check-prefix="PARSE-TREE" %s
+
+subroutine openmp_interchange(x)
+
+ integer :: x, y
+
+!CHECK: !$omp interchange permutation(2_4,1_4)
+!$omp interchange permutation(2,1)
+!CHECK: do
+ do x = 1, 100
+ !CHECK: do
+ do y = 1, 100
+ call F1()
+ !CHECK: end do
+ end do
+!CHECK: end do
+ end do
+!CHECK: !$omp end interchange
+!$omp end interchange
+
+!PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct
+!PARSE-TREE: OmpBeginLoopDirective
+!PARSE-TREE: OmpDirectiveName -> llvm::omp::Directive = interchange
+!PARSE-TREE: OmpClauseList -> OmpClause -> Permutation -> Scalar -> Integer -> Constant -> Expr = '2_4'
+!PARSE-TREE: LiteralConstant -> IntLiteralConstant = '2'
+!PARSE-TREE: Scalar -> Integer -> Constant -> Expr = '1_4'
+!PARSE-TREE: LiteralConstant -> IntLiteralConstant = '1'
+!PARSE-TREE: Flags = {}
+!PARSE-TREE: DoConstruct
+!PARSE-TREE: EndDoStmt
+!PARSE-TREE: OmpEndLoopDirective
+!PARSE-TREE: OmpDirectiveName -> llvm::omp::Directive = interchange
+
+END subroutine openmp_interchange
diff --git a/flang/test/Parser/OpenMP/interchange.f90 b/flang/test/Parser/OpenMP/interchange.f90
new file mode 100644
index 0000000000000..8aba562724428
--- /dev/null
+++ b/flang/test/Parser/OpenMP/interchange.f90
@@ -0,0 +1,30 @@
+! RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=60 %s | FileCheck --ignore-case %s
+! RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=60 %s | FileCheck --check-prefix="PARSE-TREE" %s
+
+subroutine openmp_interchange(x)
+
+ integer :: x, y
+
+!CHECK: !$omp interchange
+!$omp interchange
+!CHECK: do
+ do x = 1, 100
+ !CHECK: do
+ do y = 1, 100
+ call F1()
+ !CHECK: end do
+ end do
+!CHECK: end do
+ end do
+!CHECK: !$omp end interchange
+!$omp end interchange
+
+!PARSE-TREE: OpenMPConstruct -> OpenMPLoopConstruct
+!PARSE-TREE: OmpBeginLoopDirective
+!PARSE-TREE: OmpDirectiveName -> llvm::omp::Directive = interchange
+!PARSE-TREE: DoConstruct
+!PARSE-TREE: EndDoStmt
+!PARSE-TREE: OmpEndLoopDirective
+!PARSE-TREE: OmpDirectiveName -> llvm::omp::Directive = interchange
+
+END subroutine openmp_interchange
diff --git a/flang/test/Parser/shared-line-program-units.f90 b/flang/test/Parser/shared-line-program-units.f90
new file mode 100644
index 0000000000000..acdf3cc87d789
--- /dev/null
+++ b/flang/test/Parser/shared-line-program-units.f90
@@ -0,0 +1,51 @@
+!RUN: %flang_fc1 -fdebug-dump-parse-tree-no-sema %s 2>&1 | FileCheck %s
+!RUN: not %flang_fc1 -pedantic -Werror -fdebug-dump-parse-tree-no-sema %s 2>&1 | FileCheck %s --check-prefix=ERROR
+! CHECK: Program -> ProgramUnit -> SubroutineSubprogram
+! CHECK: ProgramUnit -> FunctionSubprogram
+! CHECK: ProgramUnit -> MainProgram
+! ERROR: portability: nonstandard usage: end of program unit not terminated by new line
+! ERROR: portability: nonstandard usage: end of program unit not terminated by new line
+subroutine sub; end; function fn(); end; program p; end;
+! CHECK: ProgramUnit -> SubroutineSubprogram
+! CHECK: ProgramUnit -> MainProgram
+! CHECK: ProgramUnit -> MainProgram
+! CHECK: ProgramUnit -> Module
+! ERROR: portability: nonstandard usage: end of program unit not terminated by new line
+! ERROR: portability: nonstandard usage: end of program unit not terminated by new line
+! ERROR: portability: nonstandard usage: end of program unit not terminated by new line
+subroutine sub2; end; end program; end program; module m; end
+! CHECK: ProgramUnit -> BlockData
+! CHECK: ProgramUnit -> BlockData
+! CHECK: ProgramUnit -> BlockData
+! ERROR: portability: nonstandard usage: end of program unit not terminated by new line
+! ERROR: portability: nonstandard usage: end of program unit not terminated by new line
+block data bd; end; block data bd2; end; block data bd3; end
+! CHECK: ProgramUnit -> Module
+! CHECK: ProgramUnit -> Submodule
+! CHECK: ProgramUnit -> Submodule
+! ERROR: portability: nonstandard usage: end of program unit not terminated by new line
+! ERROR: portability: nonstandard usage: end of program unit not terminated by new line
+module sm; end; submodule (sm) sm2; end; submodule (sm:sm2) sm3; end
+! CHECK: ProgramUnit -> MainProgram
+! CHECK: ProgramUnit -> MainProgram
+! ERROR: portability: nonstandard usage: end of program unit not terminated by new line
+program p; end; use sm; print *, "Hello, World!"; end
+! CHECK: ProgramUnit -> MainProgram
+! CHECK: ProgramUnit -> MainProgram
+! ERROR: portability: nonstandard usage: end of program unit not terminated by new line
+program p; end; use sm;
+ print *, "Hello, World!"; end
+! CHECK: ProgramUnit -> MainProgram
+! CHECK: ProgramUnit -> MainProgram
+! ERROR: portability: nonstandard usage: end of program unit not terminated by new line
+program p; end; use sm; print *, "Hello, World!";
+end
+! CHECK: ProgramUnit -> FunctionSubprogram
+! CHECK: ProgramUnit -> MainProgram
+function fn(); end
+10 print *, "1"; 20 print *, "2";
+end program;
+! CHECK: ProgramUnit -> FunctionSubprogram
+! CHECK: ProgramUnit -> MainProgram
+! ERROR: portability: nonstandard usage: end of program unit not terminated by new line
+function fn(); end; 10 print *, "1"; 20 print *, "2"; end program;
diff --git a/flang/test/Parser/shared-line-program-units.reject.0.f90 b/flang/test/Parser/shared-line-program-units.reject.0.f90
new file mode 100644
index 0000000000000..15e11455c372d
--- /dev/null
+++ b/flang/test/Parser/shared-line-program-units.reject.0.f90
@@ -0,0 +1,4 @@
+!RUN: not %flang_fc1 -fdebug-dump-parse-tree-no-sema %s 2>&1 | FileCheck %s
+! CHECK: error: Could not parse
+! CHECK: 4:21: error: expected '('
+program p; function fn(); end; end;
diff --git a/flang/test/Parser/shared-line-program-units.reject.1.f90 b/flang/test/Parser/shared-line-program-units.reject.1.f90
new file mode 100644
index 0000000000000..df471c815f2e3
--- /dev/null
+++ b/flang/test/Parser/shared-line-program-units.reject.1.f90
@@ -0,0 +1,4 @@
+!RUN: not %flang_fc1 -fdebug-dump-parse-tree-no-sema %s 2>&1 | FileCheck %s
+! CHECK: error: Could not parse
+! CHECK: 4:36: error: end of file
+function fn(); end; function fn2();
diff --git a/flang/test/Semantics/OpenMP/do-collapse.f90 b/flang/test/Semantics/OpenMP/do-collapse.f90
index ec6a3bdad3686..70a84c333236a 100644
--- a/flang/test/Semantics/OpenMP/do-collapse.f90
+++ b/flang/test/Semantics/OpenMP/do-collapse.f90
@@ -3,7 +3,8 @@
! 2.7.1 Collapse Clause
program omp_doCollapse
integer:: i,j
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+ !BECAUSE: COLLAPSE clause was specified with argument 3
!$omp do collapse(3)
do i = 1,10
do j = 1, 10
@@ -14,7 +15,8 @@ program omp_doCollapse
do i = 1,10
do j = 1, 10
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: COLLAPSE clause was specified with argument 2
!$omp do collapse(2)
do k = 1, 10
print *, "hello"
@@ -23,6 +25,8 @@ program omp_doCollapse
end do
end do
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: COLLAPSE clause was specified with argument 2
!$omp parallel do collapse(2)
do i = 1, 3
!ERROR: Loop control is not present in the DO LOOP
@@ -31,7 +35,8 @@ program omp_doCollapse
end do
end do
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: COLLAPSE clause was specified with argument 2
!ERROR: At most one COLLAPSE clause can appear on the SIMD directive
!$omp simd collapse(2) collapse(1)
do i = 1, 4
diff --git a/flang/test/Semantics/OpenMP/do-concurrent-collapse.f90 b/flang/test/Semantics/OpenMP/do-concurrent-collapse.f90
index 355626f6e73b9..3e382eb3cc4d4 100644
--- a/flang/test/Semantics/OpenMP/do-concurrent-collapse.f90
+++ b/flang/test/Semantics/OpenMP/do-concurrent-collapse.f90
@@ -1,6 +1,8 @@
!RUN: %python %S/../test_errors.py %s %flang -fopenmp
integer :: i, j
+! ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+! BECAUSE: COLLAPSE clause was specified with argument 2
! ERROR: DO CONCURRENT loops cannot be used with the COLLAPSE clause.
!$omp parallel do collapse(2)
do i = 1, 1
@@ -30,6 +32,8 @@
print *, j
end do
+! ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+! BECAUSE: COLLAPSE clause was specified with argument 2
! ERROR: DO CONCURRENT loops cannot be used with the COLLAPSE clause.
!$omp loop collapse(2)
do i = 1, 1
diff --git a/flang/test/Semantics/OpenMP/do-ordered.f90 b/flang/test/Semantics/OpenMP/do-ordered.f90
index 79ded3e1b6fe6..00fdd2dc966b5 100644
--- a/flang/test/Semantics/OpenMP/do-ordered.f90
+++ b/flang/test/Semantics/OpenMP/do-ordered.f90
@@ -4,7 +4,8 @@
program omp_doOrdered
integer:: i,j
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+ !BECAUSE: ORDERED clause was specified with argument 3
!$omp do ordered(3)
do i = 1,10
do j = 1, 10
@@ -15,7 +16,8 @@ program omp_doOrdered
do i = 1,10
do j = 1, 10
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: ORDERED clause was specified with argument 2
!$omp do ordered(2)
do k = 1, 10
print *, "hello"
@@ -24,7 +26,8 @@ program omp_doOrdered
end do
end do
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: ORDERED clause was specified with argument 2
!$omp do ordered(2)
do i = 1,10
!ERROR: An ORDERED directive without the DEPEND clause must be closely nested in a worksharing-loop (or worksharing-loop SIMD) region with ORDERED clause without the parameter
@@ -36,7 +39,8 @@ program omp_doOrdered
end do
!$omp end do
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+ !BECAUSE: ORDERED clause was specified with argument 3
!$omp do collapse(1) ordered(3)
do i = 1,10
do j = 1, 10
@@ -46,7 +50,8 @@ program omp_doOrdered
!$omp end do
!$omp parallel num_threads(4)
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: ORDERED clause was specified with argument 2
!$omp do ordered(2) collapse(1)
do i = 1,10
!ERROR: An ORDERED directive without the DEPEND clause must be closely nested in a worksharing-loop (or worksharing-loop SIMD) region with ORDERED clause without the parameter
diff --git a/flang/test/Semantics/OpenMP/do08.f90 b/flang/test/Semantics/OpenMP/do08.f90
index 5143dff0dd315..8ab02a0d9acbf 100644
--- a/flang/test/Semantics/OpenMP/do08.f90
+++ b/flang/test/Semantics/OpenMP/do08.f90
@@ -7,7 +7,8 @@ program omp
logical cond(10,10,10)
cond = .false.
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: COLLAPSE clause was specified with argument 3
!$omp do collapse(3)
do i = 0, 10
!ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
@@ -20,7 +21,8 @@ program omp
end do
!$omp end do
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+ !BECAUSE: COLLAPSE clause was specified with argument 3
!$omp do collapse(3)
do i = 0, 10
do j = 0, 10
@@ -33,7 +35,8 @@ program omp
end do
!$omp end do
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: COLLAPSE clause was specified with argument 2
!$omp do collapse(2)
do i = 0, 10
!ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
@@ -47,7 +50,8 @@ program omp
!$omp end do
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: COLLAPSE clause was specified with argument 2
!$omp do collapse(2)
foo: do i = 0, 10
!ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
@@ -61,7 +65,8 @@ program omp
!$omp end do
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+ !BECAUSE: COLLAPSE clause was specified with argument 3
!$omp do collapse(3)
do 60 i=2,200,2
do j=1,10
@@ -124,7 +129,8 @@ program omp
end do foo
!$omp end do
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+ !BECAUSE: ORDERED clause was specified with argument 3
!$omp do collapse(2) ordered(3)
foo: do i = 0, 10
foo1: do j = 0, 10
diff --git a/flang/test/Semantics/OpenMP/do10.f90 b/flang/test/Semantics/OpenMP/do10.f90
index 936d94d591369..b609567c4d93d 100644
--- a/flang/test/Semantics/OpenMP/do10.f90
+++ b/flang/test/Semantics/OpenMP/do10.f90
@@ -15,7 +15,8 @@ program omp_do
end do
!$omp end do
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+ !BECAUSE: COLLAPSE clause was specified with argument 3
!$omp do collapse(3)
!ERROR: The DO loop iteration variable must be of integer type
do i = 1, 10
diff --git a/flang/test/Semantics/OpenMP/do13.f90 b/flang/test/Semantics/OpenMP/do13.f90
index 6e9d1dddade4c..895724e0a10d5 100644
--- a/flang/test/Semantics/OpenMP/do13.f90
+++ b/flang/test/Semantics/OpenMP/do13.f90
@@ -5,7 +5,8 @@
program omp
integer i, j, k
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: COLLAPSE clause was specified with argument 3
!$omp do collapse(3)
do i = 0, 10
!ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
@@ -18,7 +19,8 @@ program omp
end do
!$omp end do
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+ !BECAUSE: COLLAPSE clause was specified with argument 3
!$omp do collapse(3)
do i = 0, 10
do j = 0, 10
@@ -31,7 +33,8 @@ program omp
end do
!$omp end do
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: COLLAPSE clause was specified with argument 2
!$omp do collapse(2)
do i = 0, 10
!ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
@@ -45,7 +48,8 @@ program omp
!$omp end do
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: COLLAPSE clause was specified with argument 2
!$omp do collapse(2)
foo: do i = 0, 10
!ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
@@ -59,7 +63,8 @@ program omp
!$omp end do
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+ !BECAUSE: COLLAPSE clause was specified with argument 3
!$omp do collapse(3)
do 60 i=1,10
do j=1,10
@@ -160,7 +165,8 @@ program omp
!$omp end parallel
!$omp parallel
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+ !BECAUSE: ORDERED clause was specified with argument 3
!$omp parallel do ordered(3) collapse(2)
foo: do i = 0, 10
foo1: do j = 0, 10
diff --git a/flang/test/Semantics/OpenMP/do15.f90 b/flang/test/Semantics/OpenMP/do15.f90
index 45c591e66361c..939d7bfde303e 100644
--- a/flang/test/Semantics/OpenMP/do15.f90
+++ b/flang/test/Semantics/OpenMP/do15.f90
@@ -5,7 +5,8 @@
program omp
integer i, j, k
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: COLLAPSE clause was specified with argument 3
!$omp do collapse(3)
do i = 0, 10
if (i .lt. 1) then
@@ -20,7 +21,8 @@ program omp
end do
!$omp end do
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+ !BECAUSE: COLLAPSE clause was specified with argument 3
!$omp do collapse(3)
do i = 0, 10
do j = 0, 10
@@ -35,7 +37,6 @@ program omp
end do
!$omp end do
- !!ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
!$omp do collapse(2)
foo: do i = 0, 10
foo1: do j = 0, 10
@@ -53,7 +54,8 @@ program omp
!$omp end do
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+ !BECAUSE: COLLAPSE clause was specified with argument 3
!$omp do collapse(3)
foo: do i = 0, 10
foo1: do j = 0, 10
@@ -64,10 +66,10 @@ program omp
!ERROR: CYCLE statement to non-innermost associated loop of an OpenMP DO construct
cycle foo1
end if
- foo2: do k = 0, 10
- print *, i, j, k
- end do foo2
- end do foo1
+ foo2: do k = 0, 10
+ print *, i, j, k
+ end do foo2
+ end do foo1
end do foo
!$omp end do
diff --git a/flang/test/Semantics/OpenMP/do16.f90 b/flang/test/Semantics/OpenMP/do16.f90
index 15d13f683cf12..e671c73783e1c 100644
--- a/flang/test/Semantics/OpenMP/do16.f90
+++ b/flang/test/Semantics/OpenMP/do16.f90
@@ -5,7 +5,8 @@
program omp
integer i, j, k
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: COLLAPSE clause was specified with argument 3
!$omp do collapse(3)
do i = 0, 10
select case (i)
@@ -21,7 +22,8 @@ program omp
end do
!$omp end do
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 3, but the associated nest is a perfect nest of depth 2
+ !BECAUSE: COLLAPSE clause was specified with argument 3
!$omp do collapse(3)
do i = 0, 10
do j = 0, 10
diff --git a/flang/test/Semantics/OpenMP/do22.f90 b/flang/test/Semantics/OpenMP/do22.f90
index 9d96d3af54e5c..dc38bd5d23253 100644
--- a/flang/test/Semantics/OpenMP/do22.f90
+++ b/flang/test/Semantics/OpenMP/do22.f90
@@ -4,7 +4,8 @@
subroutine do_imperfectly_nested_before
integer i, j
- !ERROR: The value of the parameter in the COLLAPSE or ORDERED clause must not be larger than the number of nested loops following the construct.
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: COLLAPSE clause was specified with argument 2
!$omp do collapse(2)
do i = 1, 10
print *, i
@@ -19,7 +20,8 @@ subroutine do_imperfectly_nested_before
subroutine do_imperfectly_nested_behind
integer i, j
- !ERROR: Canonical loop nest must be perfectly nested.
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: COLLAPSE clause was specified with argument 2
!$omp do collapse(2)
do i = 1, 10
do j = 1, 10
diff --git a/flang/test/Semantics/OpenMP/fuse1.f90 b/flang/test/Semantics/OpenMP/fuse1.f90
index 0616a3c52786d..4dab01ca3ec26 100644
--- a/flang/test/Semantics/OpenMP/fuse1.f90
+++ b/flang/test/Semantics/OpenMP/fuse1.f90
@@ -7,7 +7,8 @@ subroutine f
integer :: i
!$omp do
- !ERROR: The specified loop range requires 2 loops, but the loop sequence has a length of 1
+ !ERROR: This construct requires a sequence of 2 loops, but the loop sequence has a length of 1
+ !BECAUSE: LOOPRANGE clause was specified with a count of 2 starting at loop 1
!$omp fuse looprange(1, 2)
!$omp fuse
do i = 1, 10
diff --git a/flang/test/Semantics/OpenMP/interchange-permutation.f90 b/flang/test/Semantics/OpenMP/interchange-permutation.f90
new file mode 100644
index 0000000000000..4a187803e37e6
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/interchange-permutation.f90
@@ -0,0 +1,109 @@
+! Test the semantics of the PERMUTATION clause on the INTERCHANGE construct.
+!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=60
+
+
+subroutine double_permutation
+ implicit none
+ integer i, j
+
+ !ERROR: At most one PERMUTATION clause can appear on the INTERCHANGE directive
+ !$omp interchange permutation(2,1) permutation(2,1)
+ do i = 1, 5
+ do j = 1, 5
+ print *, i
+ end do
+ end do
+end subroutine
+
+subroutine zero_parameter
+ implicit none
+ integer i, j
+
+ !ERROR: The parameter of the PERMUTATION clause must be a constant positive integer expression
+ !$omp interchange permutation(0,1)
+ do i = 1, 5
+ do j = 1, 5
+ print *, i
+ end do
+ end do
+end subroutine
+
+
+subroutine negative_parameter
+ implicit none
+ integer i, j
+
+ !ERROR: The parameter of the PERMUTATION clause must be a constant positive integer expression
+ !$omp interchange permutation(2,-1)
+ do i = 1, 5
+ do j = 1, 5
+ print *, i
+ end do
+ end do
+end subroutine
+
+
+subroutine constant_parameter
+ implicit none
+ integer i, j, a
+
+ !ERROR: Must be a constant value
+ !$omp interchange permutation(2,a)
+ do i = 1, 5
+ do j = 1, 5
+ print *, i
+ end do
+ end do
+end subroutine
+
+subroutine insufficient_loops
+ implicit none
+ integer i
+
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !ERROR: PERMUTATION clause was specified with 2 arguments
+ !$omp interchange permutation(2, 1)
+ do i = 1, 5
+ print *, i
+ end do
+end subroutine
+
+subroutine minimum_parameters
+ implicit none
+ integer i, j
+
+ !ERROR: The PERMUTATION clause must have a length of at least two
+ !$omp interchange permutation(1)
+ do i = 1, 5
+ do j = 1, 5
+ print *, i
+ end do
+ end do
+end subroutine
+
+subroutine parameter_number
+ implicit none
+ integer i, j
+
+ !ERROR: Every integer from 1 must appear in the PERMUTATION clause
+ !$omp interchange permutation(1,1)
+ do i = 1, 5
+ do j = 1, 5
+ print *, i
+ end do
+ end do
+end subroutine
+
+subroutine parameter_number2
+ implicit none
+ integer i, j
+
+ !ERROR: Every integer from 1 must appear in the PERMUTATION clause
+ !$omp interchange permutation(1,3)
+ do i = 1, 5
+ do j = 1, 5
+ print *, i
+ end do
+ end do
+end subroutine
+
diff --git a/flang/test/Semantics/OpenMP/interchange01.f90 b/flang/test/Semantics/OpenMP/interchange01.f90
new file mode 100644
index 0000000000000..0bbd5335dca87
--- /dev/null
+++ b/flang/test/Semantics/OpenMP/interchange01.f90
@@ -0,0 +1,44 @@
+! Testing the Semantics of interchange
+!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=51
+
+
+subroutine on_unroll
+ implicit none
+ integer i, j
+
+ !ERROR: OpenMP loop construct cannot apply to a fully unrolled loop
+ !$omp interchange
+ !$omp unroll
+ do i = 1, 5
+ do j = 1, 5
+ print *, i
+ end do
+ end do
+end subroutine
+
+subroutine loop_assoc
+ implicit none
+ integer :: i, j
+
+ !$omp interchange
+ !ERROR: The associated loop of a loop-associated directive cannot be a DO WHILE.
+ do while (i <= 10)
+ do j = 1, 5
+ i = i + 1
+ print *, i
+ end do
+ end do
+end subroutine
+
+subroutine insufficient_loops
+ implicit none
+ integer i
+
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: PERMUTATION clause was not specified, PERMUTATION(2, 1) was assumed
+ !$omp interchange
+ do i = 1, 5
+ print *, i
+ end do
+end subroutine
+
diff --git a/flang/test/Semantics/OpenMP/loop-transformation-clauses01.f90 b/flang/test/Semantics/OpenMP/loop-transformation-clauses01.f90
index 5e3d32d7c6eff..8e7398fdaaecf 100644
--- a/flang/test/Semantics/OpenMP/loop-transformation-clauses01.f90
+++ b/flang/test/Semantics/OpenMP/loop-transformation-clauses01.f90
@@ -20,7 +20,8 @@ subroutine loop_transformation_construct1
end do
!$omp end fuse
- !ERROR: The specified loop range requires 6 loops, but the loop sequence has a length of 2
+ !ERROR: This construct requires a sequence of 6 loops, but the loop sequence has a length of 2
+ !BECAUSE: LOOPRANGE clause was specified with a count of 2 starting at loop 5
!$omp fuse looprange(5,2)
do x = 1, i
v(x) = x * 2
diff --git a/flang/test/Semantics/OpenMP/tile05.f90 b/flang/test/Semantics/OpenMP/tile05.f90
index 70c43811a5832..36c4b299a7c54 100644
--- a/flang/test/Semantics/OpenMP/tile05.f90
+++ b/flang/test/Semantics/OpenMP/tile05.f90
@@ -6,7 +6,8 @@ subroutine insufficient_loops
implicit none
integer i
- !ERROR: The SIZES clause has more entries than there are nested canonical loops.
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: SIZES clause was specified with 2 arguments
!$omp tile sizes(2, 2)
do i = 1, 5
print *, i
diff --git a/flang/test/Semantics/OpenMP/tile07.f90 b/flang/test/Semantics/OpenMP/tile07.f90
index 70a6f5fc529a4..9642fe10013fd 100644
--- a/flang/test/Semantics/OpenMP/tile07.f90
+++ b/flang/test/Semantics/OpenMP/tile07.f90
@@ -6,7 +6,8 @@ subroutine non_perfectly_nested_loop_behind
implicit none
integer i, j
- !ERROR: Canonical loop nest must be perfectly nested.
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: SIZES clause was specified with 2 arguments
!$omp tile sizes(2,2)
do i = 1, 5
do j = 1, 42
@@ -21,7 +22,8 @@ subroutine non_perfectly_nested_loop_before
implicit none
integer i, j
- !ERROR: The SIZES clause has more entries than there are nested canonical loops.
+ !ERROR: This construct requires a perfect nest of depth 2, but the associated nest is a perfect nest of depth 1
+ !BECAUSE: SIZES clause was specified with 2 arguments
!$omp tile sizes(2,2)
do i = 1, 5
print *, i
diff --git a/flang/test/Semantics/negate-literal-typedexpr.f90 b/flang/test/Semantics/negate-literal-typedexpr.f90
new file mode 100644
index 0000000000000..282839377f4a9
--- /dev/null
+++ b/flang/test/Semantics/negate-literal-typedexpr.f90
@@ -0,0 +1,20 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+! The GenericExprWrapper for the negated literal 2 is nullptr. Usually
+! it would be non-null, but contain std::nullopt. Make sure we don't
+! crash on this.
+
+!CHECK: omp.teams
+
+subroutine f(array)
+ implicit none
+ real :: array(:)
+ integer s
+ !$omp target teams distribute parallel do
+ do s = 1, 3
+ !The "2" in "Negate 2" does not have TypedExpr.
+ array(-2 + s) = 1.0
+ end do
+ !$omp end target teams distribute parallel do
+end
+
diff --git a/flang/test/Semantics/test_errors.py b/flang/test/Semantics/test_errors.py
index 45684764a00e4..05ae6b8309de6 100755
--- a/flang/test/Semantics/test_errors.py
+++ b/flang/test/Semantics/test_errors.py
@@ -15,12 +15,47 @@
from difflib import unified_diff
+# When messages are attached together, the source locations to which they
+# refer are not necessarily monotonically increasing. For example
+# error: foo.f90:10: There is a problem here # line 10
+# because: foo.f90:12: This thing is invalid # line 12 (attached)
+# error: foo.f90:11: There is another problem here # line 11
+# There is no way to represent that in the source file via ERROR annotations,
+# so before running unified_diff "canonicalize" the list of messages into an
+# order that corresponds to the line numbers.
+#
+# This also eliminates the issue with multiple messages emitted for the same
+# line: they can now be "expected" in the test file in any order, e.g.
+# !ERROR: Not enough arguments in a call to foo
+# !ERROR: `foo` is a subroutine, not a function
+# a = foo()
+# has the same effect as:
+# !ERROR: `foo` is a subroutine, not a function
+# !ERROR: Not enough arguments in a call to foo
+# a = foo()
+
+
+def join_per_line_map(m):
+ """Take a map {"line_no:": [message1, message2, ...], ...} and convert
+ it into a newline-separated string that follows the line ordering.
+ """
+ # Sort messages for each line, and prepend the line number to each
+ # message. Use numeric values of line numbers as keys to allow them
+ # to be sorted numerically.
+ sorted_lines_map = {
+ int(k.rstrip(":")): [k + s for s in sorted(m[k])] for k in m.keys()
+ }
+
+ joined_lines_list = []
+ for line in sorted(sorted_lines_map.keys()):
+ joined_lines_list.append("\n".join(sorted_lines_map[line]))
+ return "\n".join(joined_lines_list)
+
+
cm.check_args(sys.argv)
srcdir = cm.set_source(sys.argv[1])
with open(srcdir, "r", encoding="utf-8") as f:
src = f.readlines()
-actual = ""
-expect = ""
diffs = ""
log = ""
@@ -48,14 +83,19 @@
sys.exit(1)
# Cleans up the output from the compilation process to be easier to process
+actual_per_line = dict()
for line in log.split("\n"):
m = re.search(r"[^:]*:(\d+:).*(?:error|warning|portability|because):(.*)", line)
if m:
if re.search(r"warning: .*fold.*host", line):
continue # ignore host-dependent folding warnings
- actual += m.expand(r"\1\2\n")
+ line_colon = m.expand(r"\1")
+ actual_per_line[line_colon] = actual_per_line.get(line_colon, []) + [
+ m.expand(r"\2")
+ ]
# Gets the expected errors and their line numbers
+expect_per_line = dict()
errors = []
for i, line in enumerate(src, 1):
m = re.search(r"(?:^\s*!\s*(?:ERROR|WARNING|PORTABILITY|BECAUSE): )(.*)", line)
@@ -63,10 +103,12 @@
errors.append(m.group(1))
continue
if errors:
- for x in errors:
- expect += f"{i}: {x}\n"
+ expect_per_line[f"{i}:"] = [f" {x}" for x in errors]
errors = []
+actual = join_per_line_map(actual_per_line)
+expect = join_per_line_map(expect_per_line)
+
# Compares the expected errors with the compiler errors
for line in unified_diff(actual.split("\n"), expect.split("\n"), n=0):
line = re.sub(r"(^\-)(\d+:)", r"\nactual at \g<2>", line)
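
As a side note for reviewers, here is a minimal standalone Python sketch (not part of the patch) of how the canonicalization above behaves: the join_per_line_map logic is condensed from the change, and the sample diagnostics are made up for illustration. Messages that arrive out of source order, such as an attached "because" note that points at a later line, are regrouped by line number and sorted within each line.

def join_per_line_map(m):
    # Key by the numeric line number so ordering is numeric, not lexical.
    sorted_lines_map = {
        int(k.rstrip(":")): [k + s for s in sorted(m[k])] for k in m.keys()
    }
    return "\n".join(
        "\n".join(sorted_lines_map[line]) for line in sorted(sorted_lines_map)
    )

# Diagnostics as parsed from a compiler log: the note for line 12 was
# emitted between the messages for lines 10 and 11.
actual_per_line = {
    "10:": [" There is a problem here"],
    "12:": [" This thing is invalid"],
    "11:": [" There is another problem here"],
}
print(join_per_line_map(actual_per_line))
# 10: There is a problem here
# 11: There is another problem here
# 12: This thing is invalid
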
diff --git a/libc/Maintainers.rst b/libc/Maintainers.rst
index b17d2560fc2c1..efb585095b63b 100644
--- a/libc/Maintainers.rst
+++ b/libc/Maintainers.rst
@@ -37,6 +37,7 @@ Math
----
| Tue Ly
| lntue\@google.com (email), `lntue <https://github.com/lntue>`_ (github)
+
| Nicolas Celik
| its.overmighty\@gmail.com (email), `OverMighty <https://github.com/overmighty>`_ (github)
@@ -45,6 +46,11 @@ Threading
| Yifan Zhu
| yifanzhu\@rochester.edu (email), `Schrodinger ZHU Yifan <https://github.com/schrodingerzhu>`_ (github)
+Runtime Safety, Threading, Math
+-------------------------------
+| Muhammad Bassiouni
+| muhammad.m.bassiouni\@gmail.com (email), `bassiounix <https://github.com/bassiounix>`_ (github)
+
RISC-V
------
| Mikhail R. Gadelha
@@ -55,6 +61,11 @@ Public Headers / hdrgen
| Roland McGrath
| mcgrathr\@google.com (email), `frobtech <https://github.com/frobtech>`_ (github)
+General Maintenance and Documentation
+-------------------------------------
+| Jeff Bailey
+| jbailey\@raspberryginger.com (email), `kaladron <https://github.com/kaladron>`_ (github), kaladron (discourse), kaladron725 (discord)
+
Inactive Maintainers
====================
diff --git a/libc/docs/contributing.rst b/libc/docs/contributing.rst
index a674290cf6dc0..eea349c45b5cc 100644
--- a/libc/docs/contributing.rst
+++ b/libc/docs/contributing.rst
@@ -38,10 +38,11 @@ a list of open projects that one can start with:
implemented.
#. **Update the clang-tidy lint rules and use them in the build and/or CI** -
- Currently, the :ref:`clang_tidy_checks` have gone stale and are mostly unused
- by the developers and on the CI builders. This project is about updating
- them and reintegrating them back with the build and running them on the
- CI builders.
+ The libc project has a set of clang-tidy checks (see :ref:`clang_tidy_checks`)
+ but they are not enabled by default. They can be enabled by configuring with
+ ``-DLLVM_LIBC_ENABLE_LINTING=ON`` (or by setting ``LLVM_LIBC_CLANG_TIDY``) and
+ running the ``libc-lint`` build target. This project is about keeping the
+ checks up to date and reintegrating them into the build and CI.
#. **double and higher precision math functions** - These are under active
development but you can take a shot at those not yet implemented. See
diff --git a/libc/docs/dev/clang_tidy_checks.rst b/libc/docs/dev/clang_tidy_checks.rst
index 91d415a2e0d47..1ef2f60a472b1 100644
--- a/libc/docs/dev/clang_tidy_checks.rst
+++ b/libc/docs/dev/clang_tidy_checks.rst
@@ -3,17 +3,25 @@
LLVM libc clang-tidy checks
===========================
+Configuration
+-------------
-.. warning::
- This page is severely out of date. Much of the information it contains may be
- incorrect. Please only remove this warning once the page has been updated.
+LLVM libc uses layered ``.clang-tidy`` configuration files:
-These are the clang-tidy checks designed to help enforce implementation
-standards.
-The configuration file is ``src/.clang-tidy``.
+- ``libc/.clang-tidy``: baseline checks for the ``libc`` subtree (currently
+ focused on identifier naming conventions).
+- ``libc/src/.clang-tidy``: adds LLVM-libc-specific checks (``llvmlibc-*``) for
+ implementation code under ``libc/src`` and also enables
+ ``readability-identifier-naming`` and ``llvm-header-guard``. Diagnostics from
+ ``llvmlibc-*`` checks are treated as errors.
+
+LLVM-libc checks
+----------------
+
+restrict-system-libc-headers
+----------------------------
+Check name: ``llvmlibc-restrict-system-libc-headers``.
-restrict-system-libc-header
----------------------------
One of libc-project’s design goals is to use kernel headers and compiler
provided headers to prevent code duplication on a per platform basis. This
presents a problem when writing implementations since system libc headers are
@@ -31,17 +39,17 @@ libc implementation.
#include <stddef.h> // Allowed because it is provided by the compiler.
#include "internal/stdio.h" // Allowed because it is NOT part of system libc.
-
implementation-in-namespace
---------------------------
+Check name: ``llvmlibc-implementation-in-namespace``.
It is part of our implementation standards that all implementation pieces live
under the ``LIBC_NAMESPACE_DECL`` namespace. This prevents pollution of the
global namespace. Without a formal check to ensure this, an implementation
might compile and pass unit tests, but not produce a usable libc function.
-This check that ensures any function call resolves to a function within the
-``LIBC_NAMESPACE_DECL`` namespace.
+This check ensures that top-level declarations in a translation unit are
+enclosed within the ``LIBC_NAMESPACE_DECL`` namespace.
.. code-block:: c++
@@ -64,14 +72,10 @@ This check that ensures any function call resolves to a function within the
void LLVM_LIBC_ENTRYPOINT(strcpy)(char *dest, const char *src) {}
}
-..
- TODO(97655): The clang-tidy check should be updated to ensure the namespace
- declaration uses LIBC_NAMESPACE_DECL as opposed to LIBC_NAMESPACE. The former
- should be used for accessing globals in LIBC_NAMESPACE rather than declaration.
-
-
callee-namespace
----------------
+Check name: ``llvmlibc-callee-namespace``.
+
LLVM-libc is distinct because it is designed to maintain interoperability with
other libc libraries, including the one that lives on the system. This feature
creates some uncertainty about which library a call resolves to especially when
@@ -105,3 +109,12 @@ are always external and can be intercepted.
::malloc(10);
} // namespace LIBC_NAMESPACE_DECL
+
+
+inline-function-decl
+--------------------
+Check name: ``llvmlibc-inline-function-decl``.
+
+LLVM libc uses the ``LIBC_INLINE`` macro to tag inline function declarations in
+headers. This check enforces that any inline function declaration in a header
+begins with ``LIBC_INLINE`` and provides a fix-it to insert the macro.
diff --git a/libc/docs/dev/code_style.rst b/libc/docs/dev/code_style.rst
index f88d82feea367..7c53ad30a5d4d 100644
--- a/libc/docs/dev/code_style.rst
+++ b/libc/docs/dev/code_style.rst
@@ -289,7 +289,3 @@ Example usage:
Having hidden visibility on the namespace ensures extern declarations in a given TU
have known visibility and never generate GOT indirections. The attribute guarantees
this independently of global compile options and build systems.
-
-..
- TODO(97655): We should have a clang-tidy check to enforce this and a
- fixit implementation.
diff --git a/libc/fuzzing/__support/freelist_heap_fuzz.cpp b/libc/fuzzing/__support/freelist_heap_fuzz.cpp
index b342b21895a08..3675e6c6b7adc 100644
--- a/libc/fuzzing/__support/freelist_heap_fuzz.cpp
+++ b/libc/fuzzing/__support/freelist_heap_fuzz.cpp
@@ -26,6 +26,7 @@ asm(R"(
__llvm_libc_heap_limit:
)");
+using LIBC_NAMESPACE::Block;
using LIBC_NAMESPACE::FreeListHeap;
using LIBC_NAMESPACE::inline_memset;
using LIBC_NAMESPACE::cpp::nullopt;
diff --git a/libc/fuzzing/string/strlen_fuzz.cpp b/libc/fuzzing/string/strlen_fuzz.cpp
index dd72c19b7fdc7..01b6bd14db230 100644
--- a/libc/fuzzing/string/strlen_fuzz.cpp
+++ b/libc/fuzzing/string/strlen_fuzz.cpp
@@ -11,8 +11,8 @@
//===----------------------------------------------------------------------===//
#include "src/string/strlen.h"
-#include <cstdint>
-#include <cstring>
+#include <stdint.h>
+#include <string.h>
// always null terminate the data
extern "C" size_t LLVMFuzzerMutate(uint8_t *data, size_t size, size_t max_size);
diff --git a/libc/hdr/func/aligned_alloc.h b/libc/hdr/func/aligned_alloc.h
index b3436dfee1f23..3c197f4671a76 100644
--- a/libc/hdr/func/aligned_alloc.h
+++ b/libc/hdr/func/aligned_alloc.h
@@ -11,7 +11,11 @@
#ifdef LIBC_FULL_BUILD
#include "hdr/types/size_t.h"
+#ifdef __cplusplus
+extern "C" void *aligned_alloc(size_t, size_t) noexcept;
+#else
extern "C" void *aligned_alloc(size_t, size_t);
+#endif
#else // Overlay mode
diff --git a/libc/hdr/stdint_proxy.h b/libc/hdr/stdint_proxy.h
index 8e815679a4e24..d5c600d5a28bf 100644
--- a/libc/hdr/stdint_proxy.h
+++ b/libc/hdr/stdint_proxy.h
@@ -13,6 +13,10 @@
// that is `libc.include.stdint` is added to the dependency of all targets
// that use <stdint.h> header.
+#ifdef LIBC_FULL_BUILD
+#include "include/llvm-libc-macros/stdint-macros.h"
+#else
#include <stdint.h>
+#endif
#endif // LLVM_LIBC_HDR_STDINT_PROXY_H
diff --git a/libc/shared/math.h b/libc/shared/math.h
index 61c3ddffd74e5..c110f6a696b79 100644
--- a/libc/shared/math.h
+++ b/libc/shared/math.h
@@ -34,6 +34,7 @@
#include "math/atanf16.h"
#include "math/atanhf.h"
#include "math/atanhf16.h"
+#include "math/atanpif16.h"
#include "math/bf16add.h"
#include "math/bf16addf.h"
#include "math/bf16addf128.h"
@@ -41,6 +42,7 @@
#include "math/bf16div.h"
#include "math/bf16divf.h"
#include "math/bf16divl.h"
+#include "math/bf16fma.h"
#include "math/bf16fmaf.h"
#include "math/bf16fmaf128.h"
#include "math/bf16fmal.h"
@@ -180,6 +182,7 @@
#include "math/log2.h"
#include "math/log2f.h"
#include "math/log2f16.h"
+#include "math/log_bf16.h"
#include "math/logb.h"
#include "math/logbf.h"
#include "math/logbf128.h"
diff --git a/libc/shared/math/atanpif16.h b/libc/shared/math/atanpif16.h
new file mode 100644
index 0000000000000..c79c2979b32e5
--- /dev/null
+++ b/libc/shared/math/atanpif16.h
@@ -0,0 +1,29 @@
+//===-- Shared atanpif16 function -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_ATANPIF16_H
+#define LLVM_LIBC_SHARED_MATH_ATANPIF16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "shared/libc_common.h"
+#include "src/__support/math/atanpif16.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::atanpif16;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SHARED_MATH_ATANPIF16_H
diff --git a/libc/shared/math/bf16fma.h b/libc/shared/math/bf16fma.h
new file mode 100644
index 0000000000000..d024062ec11ca
--- /dev/null
+++ b/libc/shared/math/bf16fma.h
@@ -0,0 +1,23 @@
+//===-- Shared bf16fma function --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_BF16FMA_H
+#define LLVM_LIBC_SHARED_MATH_BF16FMA_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/bf16fma.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::bf16fma;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_BF16FMA_H
diff --git a/libc/shared/math/log_bf16.h b/libc/shared/math/log_bf16.h
new file mode 100644
index 0000000000000..d431133cfaa42
--- /dev/null
+++ b/libc/shared/math/log_bf16.h
@@ -0,0 +1,23 @@
+//===-- Shared log_bf16 function --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SHARED_MATH_LOG_BF16_H
+#define LLVM_LIBC_SHARED_MATH_LOG_BF16_H
+
+#include "shared/libc_common.h"
+#include "src/__support/math/log_bf16.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace shared {
+
+using math::log_bf16;
+
+} // namespace shared
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SHARED_MATH_LOG_BF16_H
diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h
index cc0710fbf7b02..018429b30272a 100644
--- a/libc/src/__support/FPUtil/dyadic_float.h
+++ b/libc/src/__support/FPUtil/dyadic_float.h
@@ -219,6 +219,7 @@ template <size_t Bits> struct DyadicFloat {
underflow = true;
} else if (unbiased_exp == -FPBits::EXP_BIAS - FPBits::FRACTION_LEN) {
round = true;
+ underflow = true;
MantissaType sticky_mask = (MantissaType(1) << (Bits - 1)) - 1;
sticky = (mantissa & sticky_mask) != 0;
} else {
diff --git a/libc/src/__support/math/CMakeLists.txt b/libc/src/__support/math/CMakeLists.txt
index 8bb935d863a9b..a8bd09e925b38 100644
--- a/libc/src/__support/math/CMakeLists.txt
+++ b/libc/src/__support/math/CMakeLists.txt
@@ -346,6 +346,23 @@ add_header_library(
libc.src.__support.macros.optimization
)
+add_header_library(
+ atanpif16
+ HDRS
+ atanpif16.h
+ DEPENDS
+ libc.hdr.fenv_macros
+ libc.include.llvm-libc-macros.float16_macros
+ libc.src.__support.FPUtil.cast
+ libc.src.__support.FPUtil.fenv_impl
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.polyeval
+ libc.src.__support.FPUtil.sqrt
+ libc.src.__support.macros.config
+ libc.src.__support.macros.optimization
+)
+
add_header_library(
asinf
HDRS
@@ -452,6 +469,16 @@ add_header_library(
libc.src.__support.macros.config
)
+add_header_library(
+ bf16fma
+ HDRS
+ bf16fma.h
+ DEPENDS
+ libc.src.__support.FPUtil.bfloat16
+ libc.src.__support.FPUtil.fma
+ libc.src.__support.macros.config
+)
+
add_header_library(
bf16fmaf
HDRS
@@ -2829,6 +2856,21 @@ add_header_library(
libc.src.__support.uint128
)
+add_header_library(
+ log_bf16
+ HDRS
+ log_bf16.h
+ DEPENDS
+ libc.src.__support.FPUtil.fp_bits
+ libc.src.__support.FPUtil.bfloat16
+ libc.src.__support.FPUtil.cast
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.common
+ libc.src.__support.macros.config
+ libc.src.__support.macros.optimization
+ libc.src.__support.macros.properties.cpu_features
+)
+
add_header_library(
pow
HDRS
diff --git a/libc/src/__support/math/atanpif16.h b/libc/src/__support/math/atanpif16.h
new file mode 100644
index 0000000000000..a744d8e62a79b
--- /dev/null
+++ b/libc/src/__support/math/atanpif16.h
@@ -0,0 +1,182 @@
+//===-- Implementation header for atanpif16 ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_ATANPIF16_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_ATANPIF16_H
+
+#include "include/llvm-libc-macros/float16-macros.h"
+
+#ifdef LIBC_TYPES_HAS_FLOAT16
+
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/sqrt.h"
+#include "src/__support/macros/optimization.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace math {
+
+// Using Python's SymPy library, we can obtain the polynomial approximation of
+// arctan(x)/pi. The steps are as follows:
+// >>> from sympy import *
+// >>> import math
+// >>> x = symbols('x')
+// >>> print(series(atan(x)/math.pi, x, 0, 17))
+//
+// Output:
+// 0.318309886183791*x - 0.106103295394597*x**3 + 0.0636619772367581*x**5 -
+// 0.0454728408833987*x**7 + 0.0353677651315323*x**9 - 0.0289372623803446*x**11
+// + 0.0244853758602916*x**13 - 0.0212206590789194*x**15 + O(x**17)
+//
+// We will assign this degree-15 Taylor polynomial as g(x). This polynomial
+// approximation is accurate for arctan(x)/pi when |x| is in the range [0, 0.5].
+//
+//
+// To compute arctan(x) for all real x, we divide the domain into the following
+// cases:
+//
+// * Case 1: |x| <= 0.5
+// In this range, the direct polynomial approximation is used:
+// arctan(x)/pi = sign(x) * g(|x|)
+// or equivalently, arctan(x) = sign(x) * pi * g(|x|).
+//
+// * Case 2: 0.5 < |x| <= 1
+// We use the double-angle identity for the tangent function, specifically:
+// arctan(x) = 2 * arctan(x / (1 + sqrt(1 + x^2))).
+// Applying this, we have:
+// arctan(x)/pi = sign(x) * 2 * arctan(x')/pi,
+// where x' = |x| / (1 + sqrt(1 + x^2)).
+// Thus, arctan(x)/pi = sign(x) * 2 * g(x')
+//
+// When |x| is in (0.5, 1], the value of x' will always fall within the
+// interval [0.207, 0.414], which is within the accurate range of g(x).
+//
+// * Case 3: |x| > 1
+// For values of |x| greater than 1, we use the reciprocal transformation
+// identity:
+// arctan(x) = pi/2 - arctan(1/x) for x > 0.
+// For any x (real number), this generalizes to:
+// arctan(x)/pi = sign(x) * (1/2 - arctan(1/|x|)/pi).
+// Then, using g(x) for arctan(1/|x|)/pi:
+// arctan(x)/pi = sign(x) * (1/2 - g(1/|x|)).
+//
+// Note that if 1/|x| still falls outside g(x)'s primary range of accuracy
+// (i.e., if 0.5 < 1/|x| <= 1), the rule from Case 2 must be applied
+// recursively to 1/|x|.
+
+LIBC_INLINE float16 atanpif16(float16 x) {
+ using FPBits = fputil::FPBits<float16>;
+
+ FPBits xbits(x);
+ bool is_neg = xbits.is_neg();
+
+ auto signed_result = [is_neg](double r) -> float16 {
+ return fputil::cast<float16>(is_neg ? -r : r);
+ };
+
+ if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) {
+ if (xbits.is_nan()) {
+ if (xbits.is_signaling_nan()) {
+ fputil::raise_except_if_required(FE_INVALID);
+ return FPBits::quiet_nan().get_val();
+ }
+ return x;
+ }
+ // atanpi(±∞) = ±0.5
+ return signed_result(0.5);
+ }
+
+ double x_abs = fputil::cast<double>(xbits.abs().get_val());
+
+ // evaluate atan(x)/pi using polynomial approximation, valid for |x| <= 0.5
+ constexpr auto atanpi_eval = [](double x) -> double {
+ // polynomial coefficients for atan(x)/pi taylor series
+ // generated using sympy: series(atan(x)/pi, x, 0, 17)
+ constexpr static double POLY_COEFFS[] = {
+ 0x1.45f306dc9c889p-2, // x^1: 1/pi
+ -0x1.b2995e7b7b60bp-4, // x^3: -1/(3*pi)
+ 0x1.04c26be3b06ccp-4, // x^5: 1/(5*pi)
+ -0x1.7483758e69c08p-5, // x^7: -1/(7*pi)
+ 0x1.21bb945252403p-5, // x^9: 1/(9*pi)
+ -0x1.da1bace3cc68ep-6, // x^11: -1/(11*pi)
+ 0x1.912b1c2336cf2p-6, // x^13: 1/(13*pi)
+ -0x1.5bade52f95e7p-6, // x^15: -1/(15*pi)
+ };
+ double x_sq = x * x;
+ return x * fputil::polyeval(x_sq, POLY_COEFFS[0], POLY_COEFFS[1],
+ POLY_COEFFS[2], POLY_COEFFS[3], POLY_COEFFS[4],
+ POLY_COEFFS[5], POLY_COEFFS[6], POLY_COEFFS[7]);
+ };
+
+ // Case 1: |x| <= 0.5 - Direct polynomial evaluation
+ if (LIBC_LIKELY(x_abs <= 0.5)) {
+
+ if (LIBC_UNLIKELY(xbits.is_zero()))
+ return x;
+
+ if (LIBC_UNLIKELY(xbits.abs().uintval() == 0x0a48)) {
+ int rounding = fputil::quick_get_round();
+ if (!is_neg) {
+ if (rounding == FE_UPWARD)
+ return fputil::cast<float16>(0x1p-14f);
+ return fputil::cast<float16>(0x1.ffd7ap-15f);
+ } else {
+ if (rounding == FE_DOWNWARD)
+ return fputil::cast<float16>(-0x1p-14f);
+ return fputil::cast<float16>(-0x1.ffd7ap-15f);
+ }
+ }
+ double result = atanpi_eval(x_abs);
+ float16 s_result = signed_result(result);
+ return s_result;
+ }
+
+ if (LIBC_UNLIKELY(x_abs == 1.0))
+ return signed_result(0.25);
+
+ // case 2: 0.5 < |x| < 1 - use double-angle reduction
+ // atan(x) = 2 * atan(x / (1 + sqrt(1 + x^2)))
+ // so atanpi(x) = 2 * atanpi(x') where x' = x / (1 + sqrt(1 + x^2))
+ if (x_abs < 1.0) {
+ double x_abs_sq = x_abs * x_abs;
+ double sqrt_term = fputil::sqrt<double>(1.0 + x_abs_sq);
+ double x_prime = x_abs / (1.0 + sqrt_term);
+ double result = 2.0 * atanpi_eval(x_prime);
+ return signed_result(result);
+ }
+
+ // case 3: |x| > 1 - use reciprocal transformation
+ // atan(x) = pi/2 - atan(1/x) for x > 0
+ // so atanpi(x) = 1/2 - atanpi(1/x)
+ double x_recip = 1.0 / x_abs;
+ double result;
+
+ // if 1/|x| > 0.5, we need to apply Case 2 transformation to 1/|x|
+ if (x_recip > 0.5) {
+ double x_recip_sq = x_recip * x_recip;
+ double sqrt_term = fputil::sqrt<double>(1.0 + x_recip_sq);
+ double x_prime = x_recip / (1.0 + sqrt_term);
+ result = fputil::multiply_add(-2.0, atanpi_eval(x_prime), 0.5);
+ } else {
+ // direct evaluation since 1/|x| <= 0.5
+ result = 0.5 - atanpi_eval(x_recip);
+ }
+
+ return signed_result(result);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LIBC_TYPES_HAS_FLOAT16
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_ATANPIF16_H
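
For readers following the case analysis in the header above, here is a standalone Python sketch (not part of the patch) that mirrors the same three-way reduction with the same polynomial coefficients and checks it against math.atan. The helper names (g, atanpi) are illustrative only; the case boundaries and the recursive Case 2 step for 1/|x| come directly from the comments above.

import math

# Degree-15 Taylor coefficients of atan(x)/pi, same hex values as POLY_COEFFS.
COEFFS = [float.fromhex(h) for h in (
    "0x1.45f306dc9c889p-2", "-0x1.b2995e7b7b60bp-4", "0x1.04c26be3b06ccp-4",
    "-0x1.7483758e69c08p-5", "0x1.21bb945252403p-5", "-0x1.da1bace3cc68ep-6",
    "0x1.912b1c2336cf2p-6", "-0x1.5bade52f95e7p-6")]

def g(t):
    # Horner evaluation in t^2; accurate for |t| <= 0.5.
    t2, acc = t * t, 0.0
    for c in reversed(COEFFS):
        acc = acc * t2 + c
    return t * acc

def atanpi(x):
    s, a = (-1.0 if x < 0 else 1.0), abs(x)
    if a <= 0.5:                       # Case 1: direct polynomial evaluation
        return s * g(a)
    if a <= 1.0:                       # Case 2: double-angle reduction
        return s * 2.0 * g(a / (1.0 + math.sqrt(1.0 + a * a)))
    r = 1.0 / a                        # Case 3: reciprocal transformation
    if r > 0.5:                        # reduce 1/|x| via Case 2 again
        return s * (0.5 - 2.0 * g(r / (1.0 + math.sqrt(1.0 + r * r))))
    return s * (0.5 - g(r))

for x in (0.25, -0.75, 1.0, 3.0, -42.0):
    assert abs(atanpi(x) - math.atan(x) / math.pi) < 1e-6
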
diff --git a/libc/src/__support/math/bf16fma.h b/libc/src/__support/math/bf16fma.h
new file mode 100644
index 0000000000000..da88d5753f6bb
--- /dev/null
+++ b/libc/src/__support/math/bf16fma.h
@@ -0,0 +1,26 @@
+//===-- Implementation header for bf16fma ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_BF16FMA_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_BF16FMA_H
+
+#include "src/__support/FPUtil/FMA.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+namespace math {
+
+LIBC_INLINE bfloat16 bf16fma(double x, double y, double z) {
+ return fputil::fma<bfloat16>(x, y, z);
+}
+
+} // namespace math
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_BF16FMA_H
diff --git a/libc/src/__support/math/log_bf16.h b/libc/src/__support/math/log_bf16.h
new file mode 100644
index 0000000000000..f809fdc0750e4
--- /dev/null
+++ b/libc/src/__support/math/log_bf16.h
@@ -0,0 +1,145 @@
+//===-- Implementation header for log_bf16 ----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_MATH_LOG_BF16_H
+#define LLVM_LIBC_SRC___SUPPORT_MATH_LOG_BF16_H
+
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/bfloat16.h"
+#include "src/__support/FPUtil/cast.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/common.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/optimization.h"
+#include "src/__support/macros/properties/cpu_features.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace math {
+
+LIBC_INLINE bfloat16 log_bf16(bfloat16 x) {
+
+ // Generated by Sollya with the following commands:
+ // > display = hexadecimal;
+ // > round(log(2), SG, RN);
+ constexpr float BF16_LOGF_2 = 0x1.62e43p-1f;
+
+ // Generated by Sollya with the following commands:
+ // > display = hexadecimal;
+ // > for i from 0 to 127 do print(round(log(1 + i * 2^-7), SG, RN));
+ constexpr float LOG_1_PLUS_M[128] = {
+ 0x0.0p0f, 0x1.fe02a6p-8f, 0x1.fc0a8cp-7f, 0x1.7b91bp-6f,
+ 0x1.f829bp-6f, 0x1.39e87cp-5f, 0x1.77459p-5f, 0x1.b42dd8p-5f,
+ 0x1.f0a30cp-5f, 0x1.16536ep-4f, 0x1.341d7ap-4f, 0x1.51b074p-4f,
+ 0x1.6f0d28p-4f, 0x1.8c345ep-4f, 0x1.a926d4p-4f, 0x1.c5e548p-4f,
+ 0x1.e27076p-4f, 0x1.fec914p-4f, 0x1.0d77e8p-3f, 0x1.1b72aep-3f,
+ 0x1.29553p-3f, 0x1.371fc2p-3f, 0x1.44d2b6p-3f, 0x1.526e5ep-3f,
+ 0x1.5ff308p-3f, 0x1.6d60fep-3f, 0x1.7ab89p-3f, 0x1.87fa06p-3f,
+ 0x1.9525aap-3f, 0x1.a23bc2p-3f, 0x1.af3c94p-3f, 0x1.bc2868p-3f,
+ 0x1.c8ff7cp-3f, 0x1.d5c216p-3f, 0x1.e27076p-3f, 0x1.ef0adcp-3f,
+ 0x1.fb9186p-3f, 0x1.04025ap-2f, 0x1.0a324ep-2f, 0x1.1058cp-2f,
+ 0x1.1675cap-2f, 0x1.1c898cp-2f, 0x1.22942p-2f, 0x1.2895a2p-2f,
+ 0x1.2e8e2cp-2f, 0x1.347ddap-2f, 0x1.3a64c6p-2f, 0x1.404308p-2f,
+ 0x1.4618bcp-2f, 0x1.4be5fap-2f, 0x1.51aad8p-2f, 0x1.576772p-2f,
+ 0x1.5d1bdcp-2f, 0x1.62c83p-2f, 0x1.686c82p-2f, 0x1.6e08eap-2f,
+ 0x1.739d8p-2f, 0x1.792a56p-2f, 0x1.7eaf84p-2f, 0x1.842d1ep-2f,
+ 0x1.89a338p-2f, 0x1.8f11e8p-2f, 0x1.947942p-2f, 0x1.99d958p-2f,
+ 0x1.9f323ep-2f, 0x1.a4840ap-2f, 0x1.a9cecap-2f, 0x1.af1294p-2f,
+ 0x1.b44f78p-2f, 0x1.b9858ap-2f, 0x1.beb4dap-2f, 0x1.c3dd7ap-2f,
+ 0x1.c8ff7cp-2f, 0x1.ce1afp-2f, 0x1.d32fe8p-2f, 0x1.d83e72p-2f,
+ 0x1.dd46ap-2f, 0x1.e24882p-2f, 0x1.e74426p-2f, 0x1.ec399ep-2f,
+ 0x1.f128f6p-2f, 0x1.f6124p-2f, 0x1.faf588p-2f, 0x1.ffd2ep-2f,
+ 0x1.02552ap-1f, 0x1.04bdfap-1f, 0x1.0723e6p-1f, 0x1.0986f4p-1f,
+ 0x1.0be72ep-1f, 0x1.0e4498p-1f, 0x1.109f3ap-1f, 0x1.12f71ap-1f,
+ 0x1.154c3ep-1f, 0x1.179eacp-1f, 0x1.19ee6cp-1f, 0x1.1c3b82p-1f,
+ 0x1.1e85f6p-1f, 0x1.20cdcep-1f, 0x1.23130ep-1f, 0x1.2555bcp-1f,
+ 0x1.2795e2p-1f, 0x1.29d38p-1f, 0x1.2c0e9ep-1f, 0x1.2e4744p-1f,
+ 0x1.307d74p-1f, 0x1.32b134p-1f, 0x1.34e28ap-1f, 0x1.37117cp-1f,
+ 0x1.393e0ep-1f, 0x1.3b6844p-1f, 0x1.3d9026p-1f, 0x1.3fb5b8p-1f,
+ 0x1.41d8fep-1f, 0x1.43f9fep-1f, 0x1.4618bcp-1f, 0x1.48353ep-1f,
+ 0x1.4a4f86p-1f, 0x1.4c679ap-1f, 0x1.4e7d82p-1f, 0x1.50913cp-1f,
+ 0x1.52a2d2p-1f, 0x1.54b246p-1f, 0x1.56bf9ep-1f, 0x1.58cadcp-1f,
+ 0x1.5ad404p-1f, 0x1.5cdb1ep-1f, 0x1.5ee02ap-1f, 0x1.60e33p-1f,
+ };
+ using FPBits = fputil::FPBits<bfloat16>;
+ FPBits x_bits(x);
+
+ uint16_t x_u = x_bits.uintval();
+
+ // If x <= 0, or x is 1, or x is +inf, or x is NaN.
+ if (LIBC_UNLIKELY(x_u == 0U || x_u == 0x3f80U || x_u >= 0x7f80U)) {
+ // log(NaN) = NaN
+ if (x_bits.is_nan()) {
+ if (x_bits.is_signaling_nan()) {
+ fputil::raise_except_if_required(FE_INVALID);
+ return FPBits::quiet_nan().get_val();
+ }
+
+ return x;
+ }
+
+ // log(+/-0) = −inf
+ if ((x_u & 0x7fffU) == 0U) {
+ fputil::raise_except_if_required(FE_DIVBYZERO);
+ return FPBits::inf(Sign::NEG).get_val();
+ }
+
+ // log(1) = 0
+ if (x_u == 0x3f80U)
+ return FPBits::zero().get_val();
+
+ // x < 0
+ if (x_u > 0x8000U) {
+ fputil::set_errno_if_required(EDOM);
+ fputil::raise_except_if_required(FE_INVALID);
+ return FPBits::quiet_nan().get_val();
+ }
+
+ // log(+inf) = +inf
+ return FPBits::inf().get_val();
+ }
+
+#ifndef LIBC_TARGET_CPU_HAS_FMA
+ // log(0.00000000000000171390679426508540927898138761520386)
+ // ~= -34.00000095
+ if (LIBC_UNLIKELY(x_u == 0x26F7U))
+ return bfloat16(-34.0000009);
+#endif // LIBC_TARGET_CPU_HAS_FMA
+
+ int e = -FPBits::EXP_BIAS;
+
+ // When x is subnormal, normalize it.
+ if ((x_u & FPBits::EXP_MASK) == 0U) {
+ // Can't pass an integer to fputil::cast directly.
+ constexpr float NORMALIZE_EXP = 1U << FPBits::FRACTION_LEN;
+ x_bits = FPBits(x_bits.get_val() * fputil::cast<bfloat16>(NORMALIZE_EXP));
+ x_u = x_bits.uintval();
+ e -= FPBits::FRACTION_LEN;
+ }
+
+ // To compute log(x), we perform the following range reduction:
+ // x = 2^e * (1 + m),
+ // log(x) = e * log(2) + log(1 + m).
+ // For bfloat16, the mantissa has at most 7 explicit bits, so we look up
+ // log(1 + m) in the LOG_1_PLUS_M table using `m` as the key.
+
+ // Get the 7-bit mantissa directly as the table index
+ uint16_t m = x_bits.get_mantissa();
+
+ // Get unbiased exponent
+ e += x_u >> FPBits::FRACTION_LEN;
+
+ return fputil::cast<bfloat16>(fputil::multiply_add(
+ static_cast<float>(e), BF16_LOGF_2, LOG_1_PLUS_M[m]));
+}
+
+} // namespace math
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_MATH_LOG_BF16_H
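
A standalone Python sketch (not part of the patch) of the same log range reduction: x = 2^e * (1 + m) and log(x) = e * log(2) + log(1 + m), with a 128-entry table indexed by the top 7 fraction bits. It uses an idealized double-precision table and math.frexp instead of the rounded single-precision constants and FPBits manipulation above, so it only illustrates the shape of the algorithm.

import math

LOG2 = math.log(2.0)
TABLE = [math.log(1.0 + i / 128.0) for i in range(128)]  # log(1 + m) lookup

def log_via_table(x):
    assert x > 0.0 and math.isfinite(x)
    m, e = math.frexp(x)           # x = m * 2^e with m in [0.5, 1)
    m, e = 2.0 * m, e - 1          # renormalize so x = 2^e * m, m in [1, 2)
    idx = int((m - 1.0) * 128.0)   # top 7 fraction bits pick the table entry
    return e * LOG2 + TABLE[idx]

for x in (1.0, 2.5, 0.125, 1234.5):
    # Truncating to the table entry below costs at most log(1 + 1/128) ~ 0.0078.
    assert abs(log_via_table(x) - math.log(x)) < 1.0 / 128.0
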
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 07c48c9a04c98..1c41a8f4980b0 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1947,13 +1947,7 @@ add_entrypoint_object(
HDRS
../log_bf16.h
DEPENDS
- libc.src.__support.common
- libc.src.__support.FPUtil.bfloat16
- libc.src.__support.FPUtil.cast
- libc.src.__support.FPUtil.fp_bits
- libc.src.__support.FPUtil.multiply_add
- libc.src.__support.macros.config
- libc.src.__support.macros.optimization
+ libc.src.__support.math.log_bf16
)
add_entrypoint_object(
@@ -4076,16 +4070,7 @@ add_entrypoint_object(
HDRS
../atanpif16.h
DEPENDS
- libc.hdr.errno_macros
- libc.hdr.fenv_macros
- libc.src.__support.FPUtil.cast
- libc.src.__support.FPUtil.fenv_impl
- libc.src.__support.FPUtil.fp_bits
- libc.src.__support.FPUtil.multiply_add
- libc.src.__support.FPUtil.polyeval
- libc.src.__support.FPUtil.sqrt
- libc.src.__support.macros.optimization
- libc.src.__support.macros.properties.types
+ libc.src.__support.math.atanpif16
)
add_entrypoint_object(
@@ -5155,11 +5140,7 @@ add_entrypoint_object(
HDRS
../bf16fma.h
DEPENDS
- libc.src.__support.common
- libc.src.__support.FPUtil.bfloat16
- libc.src.__support.FPUtil.fma
- libc.src.__support.macros.config
- libc.src.__support.macros.properties.types
+ libc.src.__support.math.bf16fma
)
add_entrypoint_object(
diff --git a/libc/src/math/generic/atanpif16.cpp b/libc/src/math/generic/atanpif16.cpp
index 52cff4606608c..c6b7355ebc8f1 100644
--- a/libc/src/math/generic/atanpif16.cpp
+++ b/libc/src/math/generic/atanpif16.cpp
@@ -7,165 +7,12 @@
//===----------------------------------------------------------------------===//
#include "src/math/atanpif16.h"
-#include "hdr/errno_macros.h"
-#include "hdr/fenv_macros.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/cast.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/FPUtil/sqrt.h"
-#include "src/__support/macros/optimization.h"
+#include "src/__support/math/atanpif16.h"
namespace LIBC_NAMESPACE_DECL {
-// Using Python's SymPy library, we can obtain the polynomial approximation of
-// arctan(x)/pi. The steps are as follows:
-// >>> from sympy import *
-// >>> import math
-// >>> x = symbols('x')
-// >>> print(series(atan(x)/math.pi, x, 0, 17))
-//
-// Output:
-// 0.318309886183791*x - 0.106103295394597*x**3 + 0.0636619772367581*x**5 -
-// 0.0454728408833987*x**7 + 0.0353677651315323*x**9 - 0.0289372623803446*x**11
-// + 0.0244853758602916*x**13 - 0.0212206590789194*x**15 + O(x**17)
-//
-// We will assign this degree-15 Taylor polynomial as g(x). This polynomial
-// approximation is accurate for arctan(x)/pi when |x| is in the range [0, 0.5].
-//
-//
-// To compute arctan(x) for all real x, we divide the domain into the following
-// cases:
-//
-// * Case 1: |x| <= 0.5
-// In this range, the direct polynomial approximation is used:
-// arctan(x)/pi = sign(x) * g(|x|)
-// or equivalently, arctan(x) = sign(x) * pi * g(|x|).
-//
-// * Case 2: 0.5 < |x| <= 1
-// We use the double-angle identity for the tangent function, specifically:
-// arctan(x) = 2 * arctan(x / (1 + sqrt(1 + x^2))).
-// Applying this, we have:
-// arctan(x)/pi = sign(x) * 2 * arctan(x')/pi,
-// where x' = |x| / (1 + sqrt(1 + x^2)).
-// Thus, arctan(x)/pi = sign(x) * 2 * g(x')
-//
-// When |x| is in (0.5, 1], the value of x' will always fall within the
-// interval [0.207, 0.414], which is within the accurate range of g(x).
-//
-// * Case 3: |x| > 1
-// For values of |x| greater than 1, we use the reciprocal transformation
-// identity:
-// arctan(x) = pi/2 - arctan(1/x) for x > 0.
-// For any x (real number), this generalizes to:
-// arctan(x)/pi = sign(x) * (1/2 - arctan(1/|x|)/pi).
-// Then, using g(x) for arctan(1/|x|)/pi:
-// arctan(x)/pi = sign(x) * (1/2 - g(1/|x|)).
-//
-// Note that if 1/|x| still falls outside the
-// g(x)'s primary range of accuracy (i.e., if 0.5 < 1/|x| <= 1), the rule
-// from Case 2 must be applied recursively to 1/|x|.
-
LLVM_LIBC_FUNCTION(float16, atanpif16, (float16 x)) {
- using FPBits = fputil::FPBits<float16>;
-
- FPBits xbits(x);
- bool is_neg = xbits.is_neg();
-
- auto signed_result = [is_neg](double r) -> float16 {
- return fputil::cast<float16>(is_neg ? -r : r);
- };
-
- if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) {
- if (xbits.is_nan()) {
- if (xbits.is_signaling_nan()) {
- fputil::raise_except_if_required(FE_INVALID);
- return FPBits::quiet_nan().get_val();
- }
- return x;
- }
- // atanpi(±∞) = ±0.5
- return signed_result(0.5);
- }
-
- double x_abs = fputil::cast<double>(xbits.abs().get_val());
-
- // evaluate atan(x)/pi using polynomial approximation, valid for |x| <= 0.5
- constexpr auto atanpi_eval = [](double x) -> double {
- // polynomial coefficients for atan(x)/pi taylor series
- // generated using sympy: series(atan(x)/pi, x, 0, 17)
- constexpr static double POLY_COEFFS[] = {
- 0x1.45f306dc9c889p-2, // x^1: 1/pi
- -0x1.b2995e7b7b60bp-4, // x^3: -1/(3*pi)
- 0x1.04c26be3b06ccp-4, // x^5: 1/(5*pi)
- -0x1.7483758e69c08p-5, // x^7: -1/(7*pi)
- 0x1.21bb945252403p-5, // x^9: 1/(9*pi)
- -0x1.da1bace3cc68ep-6, // x^11: -1/(11*pi)
- 0x1.912b1c2336cf2p-6, // x^13: 1/(13*pi)
- -0x1.5bade52f95e7p-6, // x^15: -1/(15*pi)
- };
- double x_sq = x * x;
- return x * fputil::polyeval(x_sq, POLY_COEFFS[0], POLY_COEFFS[1],
- POLY_COEFFS[2], POLY_COEFFS[3], POLY_COEFFS[4],
- POLY_COEFFS[5], POLY_COEFFS[6], POLY_COEFFS[7]);
- };
-
- // Case 1: |x| <= 0.5 - Direct polynomial evaluation
- if (LIBC_LIKELY(x_abs <= 0.5)) {
-
- if (LIBC_UNLIKELY(xbits.is_zero()))
- return x;
-
- if (LIBC_UNLIKELY(xbits.abs().uintval() == 0x0a48)) {
- int rounding = fputil::quick_get_round();
- if (!is_neg) {
- if (rounding == FE_UPWARD)
- return fputil::cast<float16>(0x1p-14f);
- return fputil::cast<float16>(0x1.ffd7ap-15f);
- } else {
- if (rounding == FE_DOWNWARD)
- return fputil::cast<float16>(-0x1p-14f);
- return fputil::cast<float16>(-0x1.ffd7ap-15f);
- }
- }
- double result = atanpi_eval(x_abs);
- float16 s_result = signed_result(result);
- return s_result;
- }
-
- if (LIBC_UNLIKELY(x_abs == 1.0))
- return signed_result(0.25);
-
- // case 2: 0.5 < |x| < 1 - use double-angle reduction
- // atan(x) = 2 * atan(x / (1 + sqrt(1 + x^2)))
- // so atanpi(x) = 2 * atanpi(x') where x' = x / (1 + sqrt(1 + x^2))
- if (x_abs < 1.0) {
- double x_abs_sq = x_abs * x_abs;
- double sqrt_term = fputil::sqrt<double>(1.0 + x_abs_sq);
- double x_prime = x_abs / (1.0 + sqrt_term);
- double result = 2.0 * atanpi_eval(x_prime);
- return signed_result(result);
- }
-
- // case 3: |x| > 1 - use reciprocal transformation
- // atan(x) = pi/2 - atan(1/x) for x > 0
- // so atanpi(x) = 1/2 - atanpi(1/x)
- double x_recip = 1.0 / x_abs;
- double result;
-
- // if 1/|x| > 0.5, we need to apply Case 2 transformation to 1/|x|
- if (x_recip > 0.5) {
- double x_recip_sq = x_recip * x_recip;
- double sqrt_term = fputil::sqrt<double>(1.0 + x_recip_sq);
- double x_prime = x_recip / (1.0 + sqrt_term);
- result = fputil::multiply_add(-2.0, atanpi_eval(x_prime), 0.5);
- } else {
- // direct evaluation since 1/|x| <= 0.5
- result = 0.5 - atanpi_eval(x_recip);
- }
-
- return signed_result(result);
+ return math::atanpif16(x);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/bf16fma.cpp b/libc/src/math/generic/bf16fma.cpp
index 0f0fe8658dde0..a9719abc38c6d 100644
--- a/libc/src/math/generic/bf16fma.cpp
+++ b/libc/src/math/generic/bf16fma.cpp
@@ -7,15 +7,12 @@
//===----------------------------------------------------------------------===//
#include "src/math/bf16fma.h"
-#include "src/__support/FPUtil/FMA.h"
-#include "src/__support/FPUtil/bfloat16.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
+#include "src/__support/math/bf16fma.h"
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(bfloat16, bf16fma, (double x, double y, double z)) {
- return fputil::fma<bfloat16>(x, y, z);
+ return math::bf16fma(x, y, z);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/generic/log_bf16.cpp b/libc/src/math/generic/log_bf16.cpp
index 213dccca0fb9e..53a02320328b2 100644
--- a/libc/src/math/generic/log_bf16.cpp
+++ b/libc/src/math/generic/log_bf16.cpp
@@ -7,131 +7,12 @@
//===----------------------------------------------------------------------===//
#include "src/math/log_bf16.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/bfloat16.h"
-#include "src/__support/FPUtil/cast.h"
-#include "src/__support/FPUtil/multiply_add.h"
-#include "src/__support/common.h"
-#include "src/__support/macros/config.h"
-#include "src/__support/macros/optimization.h"
-#include "src/__support/macros/properties/cpu_features.h"
+#include "src/__support/math/log_bf16.h"
namespace LIBC_NAMESPACE_DECL {
-// Generated by Sollya with the following commands:
-// > display = hexadecimal;
-// > round(log(2), SG, RN);
-static constexpr float LOGF_2 = 0x1.62e43p-1f;
-
-// Generated by Sollya with the following commands:
-// > display = hexadecimal;
-// > for i from 0 to 127 do print(round(log(1 + i * 2^-7), SG, RN));
-static constexpr float LOG_1_PLUS_M[128] = {
- 0x0.0p0f, 0x1.fe02a6p-8f, 0x1.fc0a8cp-7f, 0x1.7b91bp-6f,
- 0x1.f829bp-6f, 0x1.39e87cp-5f, 0x1.77459p-5f, 0x1.b42dd8p-5f,
- 0x1.f0a30cp-5f, 0x1.16536ep-4f, 0x1.341d7ap-4f, 0x1.51b074p-4f,
- 0x1.6f0d28p-4f, 0x1.8c345ep-4f, 0x1.a926d4p-4f, 0x1.c5e548p-4f,
- 0x1.e27076p-4f, 0x1.fec914p-4f, 0x1.0d77e8p-3f, 0x1.1b72aep-3f,
- 0x1.29553p-3f, 0x1.371fc2p-3f, 0x1.44d2b6p-3f, 0x1.526e5ep-3f,
- 0x1.5ff308p-3f, 0x1.6d60fep-3f, 0x1.7ab89p-3f, 0x1.87fa06p-3f,
- 0x1.9525aap-3f, 0x1.a23bc2p-3f, 0x1.af3c94p-3f, 0x1.bc2868p-3f,
- 0x1.c8ff7cp-3f, 0x1.d5c216p-3f, 0x1.e27076p-3f, 0x1.ef0adcp-3f,
- 0x1.fb9186p-3f, 0x1.04025ap-2f, 0x1.0a324ep-2f, 0x1.1058cp-2f,
- 0x1.1675cap-2f, 0x1.1c898cp-2f, 0x1.22942p-2f, 0x1.2895a2p-2f,
- 0x1.2e8e2cp-2f, 0x1.347ddap-2f, 0x1.3a64c6p-2f, 0x1.404308p-2f,
- 0x1.4618bcp-2f, 0x1.4be5fap-2f, 0x1.51aad8p-2f, 0x1.576772p-2f,
- 0x1.5d1bdcp-2f, 0x1.62c83p-2f, 0x1.686c82p-2f, 0x1.6e08eap-2f,
- 0x1.739d8p-2f, 0x1.792a56p-2f, 0x1.7eaf84p-2f, 0x1.842d1ep-2f,
- 0x1.89a338p-2f, 0x1.8f11e8p-2f, 0x1.947942p-2f, 0x1.99d958p-2f,
- 0x1.9f323ep-2f, 0x1.a4840ap-2f, 0x1.a9cecap-2f, 0x1.af1294p-2f,
- 0x1.b44f78p-2f, 0x1.b9858ap-2f, 0x1.beb4dap-2f, 0x1.c3dd7ap-2f,
- 0x1.c8ff7cp-2f, 0x1.ce1afp-2f, 0x1.d32fe8p-2f, 0x1.d83e72p-2f,
- 0x1.dd46ap-2f, 0x1.e24882p-2f, 0x1.e74426p-2f, 0x1.ec399ep-2f,
- 0x1.f128f6p-2f, 0x1.f6124p-2f, 0x1.faf588p-2f, 0x1.ffd2ep-2f,
- 0x1.02552ap-1f, 0x1.04bdfap-1f, 0x1.0723e6p-1f, 0x1.0986f4p-1f,
- 0x1.0be72ep-1f, 0x1.0e4498p-1f, 0x1.109f3ap-1f, 0x1.12f71ap-1f,
- 0x1.154c3ep-1f, 0x1.179eacp-1f, 0x1.19ee6cp-1f, 0x1.1c3b82p-1f,
- 0x1.1e85f6p-1f, 0x1.20cdcep-1f, 0x1.23130ep-1f, 0x1.2555bcp-1f,
- 0x1.2795e2p-1f, 0x1.29d38p-1f, 0x1.2c0e9ep-1f, 0x1.2e4744p-1f,
- 0x1.307d74p-1f, 0x1.32b134p-1f, 0x1.34e28ap-1f, 0x1.37117cp-1f,
- 0x1.393e0ep-1f, 0x1.3b6844p-1f, 0x1.3d9026p-1f, 0x1.3fb5b8p-1f,
- 0x1.41d8fep-1f, 0x1.43f9fep-1f, 0x1.4618bcp-1f, 0x1.48353ep-1f,
- 0x1.4a4f86p-1f, 0x1.4c679ap-1f, 0x1.4e7d82p-1f, 0x1.50913cp-1f,
- 0x1.52a2d2p-1f, 0x1.54b246p-1f, 0x1.56bf9ep-1f, 0x1.58cadcp-1f,
- 0x1.5ad404p-1f, 0x1.5cdb1ep-1f, 0x1.5ee02ap-1f, 0x1.60e33p-1f,
-};
-
LLVM_LIBC_FUNCTION(bfloat16, log_bf16, (bfloat16 x)) {
- using FPBits = fputil::FPBits<bfloat16>;
- FPBits x_bits(x);
-
- uint16_t x_u = x_bits.uintval();
-
- // If x <= 0, or x is 1, or x is +inf, or x is NaN.
- if (LIBC_UNLIKELY(x_u == 0U || x_u == 0x3f80U || x_u >= 0x7f80U)) {
- // log(NaN) = NaN
- if (x_bits.is_nan()) {
- if (x_bits.is_signaling_nan()) {
- fputil::raise_except_if_required(FE_INVALID);
- return FPBits::quiet_nan().get_val();
- }
-
- return x;
- }
-
- // log(+/-0) = −inf
- if ((x_u & 0x7fffU) == 0U) {
- fputil::raise_except_if_required(FE_DIVBYZERO);
- return FPBits::inf(Sign::NEG).get_val();
- }
-
- // log(1) = 0
- if (x_u == 0x3f80U)
- return FPBits::zero().get_val();
-
- // x < 0
- if (x_u > 0x8000U) {
- fputil::set_errno_if_required(EDOM);
- fputil::raise_except_if_required(FE_INVALID);
- return FPBits::quiet_nan().get_val();
- }
-
- // log(+inf) = +inf
- return FPBits::inf().get_val();
- }
-
-#ifndef LIBC_TARGET_CPU_HAS_FMA
- // log(0.00000000000000171390679426508540927898138761520386)
- // ~= -34.00000095
- if (LIBC_UNLIKELY(x_u == 0x26F7U))
- return bfloat16(-34.0000009);
-#endif // LIBC_TARGET_CPU_HAS_FMA
-
- int e = -FPBits::EXP_BIAS;
-
- // When x is subnormal, normalize it.
- if ((x_u & FPBits::EXP_MASK) == 0U) {
- // Can't pass an integer to fputil::cast directly.
- constexpr float NORMALIZE_EXP = 1U << FPBits::FRACTION_LEN;
- x_bits = FPBits(x_bits.get_val() * fputil::cast<bfloat16>(NORMALIZE_EXP));
- x_u = x_bits.uintval();
- e -= FPBits::FRACTION_LEN;
- }
-
- // To compute log(x), we perform the following range reduction:
- // x = 2^e * (1 + m),
- // log(x) = e * log(2) + log(1 + m).
- // for BFloat16, mantissa is at most 7 explicit bits, so we lookup
- // log(1 + m) in LOG_1_PLUS_M table using `m` as key.
-
- // Get the 7-bit mantissa directly as the table index
- uint16_t m = x_bits.get_mantissa();
-
- // Get unbiased exponent
- e += x_u >> FPBits::FRACTION_LEN;
-
- return fputil::cast<bfloat16>(
- fputil::multiply_add(static_cast<float>(e), LOGF_2, LOG_1_PLUS_M[m]));
+ return math::log_bf16(x);
}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/shared/CMakeLists.txt b/libc/test/shared/CMakeLists.txt
index 2e351d8051bed..789286c25444f 100644
--- a/libc/test/shared/CMakeLists.txt
+++ b/libc/test/shared/CMakeLists.txt
@@ -31,6 +31,7 @@ add_fp_unittest(
libc.src.__support.math.atanf16
libc.src.__support.math.atanhf
libc.src.__support.math.atanhf16
+ libc.src.__support.math.atanpif16
libc.src.__support.math.bf16add
libc.src.__support.math.bf16addf
libc.src.__support.math.bf16addl
@@ -38,6 +39,7 @@ add_fp_unittest(
libc.src.__support.math.bf16div
libc.src.__support.math.bf16divf
libc.src.__support.math.bf16divl
+ libc.src.__support.math.bf16fma
libc.src.__support.math.bf16fmaf
libc.src.__support.math.bf16fmaf128
libc.src.__support.math.bf16fmal
@@ -176,6 +178,7 @@ add_fp_unittest(
libc.src.__support.math.logbf128
libc.src.__support.math.logbf16
libc.src.__support.math.logf
+ libc.src.__support.math.log_bf16
libc.src.__support.math.ldexpf
libc.src.__support.math.ldexpf128
libc.src.__support.math.ldexpf16
diff --git a/libc/test/shared/shared_math_test.cpp b/libc/test/shared/shared_math_test.cpp
index 4eb950f030a1a..c67078f4d5ac4 100644
--- a/libc/test/shared/shared_math_test.cpp
+++ b/libc/test/shared/shared_math_test.cpp
@@ -27,6 +27,7 @@ TEST(LlvmLibcSharedMathTest, AllFloat16) {
EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::atan2f16(0.0f16, 0.0f16));
EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::atanf16(0.0f16));
EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::atanhf16(0.0f16));
+ EXPECT_FP_EQ(0x0p+0f16, LIBC_NAMESPACE::shared::atanpif16(0.0f16));
EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::cosf16(0.0f16));
EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::coshf16(0.0f16));
EXPECT_FP_EQ(0x1p+0f16, LIBC_NAMESPACE::shared::cospif16(0.0f16));
@@ -437,6 +438,8 @@ TEST(LlvmLibcSharedMathTest, AllBFloat16) {
EXPECT_FP_EQ(bfloat16(0.0),
LIBC_NAMESPACE::shared::fmaxbf16(bfloat16(0.0), bfloat16(0.0)));
+ EXPECT_FP_EQ(bfloat16(10.0), LIBC_NAMESPACE::shared::bf16fma(2.0, 3.0, 4.0));
+
bfloat16 getpayloadbf16_x = bfloat16(0.0);
EXPECT_FP_EQ(bfloat16(-1.0),
LIBC_NAMESPACE::shared::getpayloadbf16(&getpayloadbf16_x));
@@ -452,6 +455,9 @@ TEST(LlvmLibcSharedMathTest, AllBFloat16) {
EXPECT_EQ(1, LIBC_NAMESPACE::shared::setpayloadsigbf16(&setpayloadsigbf16_res,
bfloat16(0.0)));
EXPECT_FP_EQ(bfloat16(0.0), setpayloadsigbf16_res);
+
+ EXPECT_FP_EQ(bfloat16(0.0), LIBC_NAMESPACE::shared::log_bf16(bfloat16(1.0)));
+
bfloat16 neg_min_denormal = FPBits::min_subnormal(Sign::NEG).get_val();
EXPECT_FP_EQ(neg_min_denormal,
LIBC_NAMESPACE::shared::nextdownbf16(bfloat16(0.0)));
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index d92e6b728b63e..12f378f7c1128 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -4719,6 +4719,7 @@ add_fp_unittest(
atanpif16_test.cpp
DEPENDS
libc.src.math.atanpif16
+ libc.hdr.errno_macros
)
add_fp_unittest(
diff --git a/libc/test/src/math/smoke/atanpif16_test.cpp b/libc/test/src/math/smoke/atanpif16_test.cpp
index ffc8ad7296309..4521948973419 100644
--- a/libc/test/src/math/smoke/atanpif16_test.cpp
+++ b/libc/test/src/math/smoke/atanpif16_test.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "hdr/errno_macros.h"
#include "src/math/atanpif16.h"
#include "test/UnitTest/FPMatcher.h"
@@ -61,3 +62,9 @@ TEST_F(LlvmLibcAtanpif16Test, MonotonicityProperty) {
EXPECT_TRUE(result1 < result2);
}
}
+
+TEST_F(LlvmLibcAtanpif16Test, Underflow) {
+ EXPECT_FP_EQ_WITH_EXCEPTION(0x1p-24f16, LIBC_NAMESPACE::atanpif16(0x1p-23f16),
+ FE_UNDERFLOW | FE_INEXACT);
+ EXPECT_MATH_ERRNO(ERANGE);
+}
diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 1103711298ce3..77979e1b92add 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -48,7 +48,7 @@ if( LIBCLC_STANDALONE_BUILD OR CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DI
message( STATUS "libclc LLVM version: ${LLVM_PACKAGE_VERSION}" )
- foreach( tool IN ITEMS llvm-link opt )
+ foreach( tool IN ITEMS llvm-link llvm-dis opt )
find_program( LLVM_TOOL_${tool} ${tool} PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH )
set( ${tool}_exe ${LLVM_TOOL_${tool}} )
set( ${tool}_target )
@@ -73,6 +73,7 @@ else()
endif()
get_host_tool_path( llvm-link LLVM_LINK llvm-link_exe llvm-link_target )
+ get_host_tool_path( llvm-dis LLVM_DIS llvm-dis_exe llvm-dis_target )
get_host_tool_path( opt OPT opt_exe opt_target )
# Setup the paths where libclc runtimes should be stored. By default, in an
diff --git a/libclc/clc/include/clc/math/clc_ep_decl.inc b/libclc/clc/include/clc/math/clc_ep_decl.inc
index d29cfdc6346ba..9bb06cba91a69 100644
--- a/libclc/clc/include/clc/math/clc_ep_decl.inc
+++ b/libclc/clc/include/clc/math/clc_ep_decl.inc
@@ -6,12 +6,16 @@
//
//===----------------------------------------------------------------------===//
-#ifdef __CLC_SCALAR
+typedef struct __CLC_XCONCAT(__clc_ep_pair_, __CLC_GENTYPE) {
+ __CLC_GENTYPE lo, hi;
+} __CLC_XCONCAT(__clc_ep_pair_, __CLC_GENTYPE);
-#define __CLC_EP_PAIR __CLC_XCONCAT(__CLC_GENTYPE, 2)
+#define __CLC_EP_PAIR __CLC_XCONCAT(__clc_ep_pair_, __CLC_GENTYPE)
_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_absv(__CLC_EP_PAIR a);
+_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_neg(__CLC_EP_PAIR a);
+
_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR
__clc_ep_conditional_sign_match(__CLC_EP_PAIR a, __CLC_EP_PAIR b);
@@ -128,4 +132,7 @@ _CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_sqrt(__CLC_GENTYPE a);
_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_sqrt(__CLC_EP_PAIR a);
+#if __CLC_FPSIZE == 32 || __CLC_FPSIZE == 64
+_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_ep_exp(__CLC_EP_PAIR a);
+_CLC_DECL _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ln(__CLC_GENTYPE a);
#endif
diff --git a/libclc/clc/include/clc/math/clc_exp2_fast.h b/libclc/clc/include/clc/math/clc_exp2_fast.h
new file mode 100644
index 0000000000000..a42e6c9b7fd48
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_exp2_fast.h
@@ -0,0 +1,19 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_EXP2_FAST_H__
+#define __CLC_MATH_CLC_EXP2_FAST_H__
+
+#define __CLC_FUNCTION __clc_exp2_fast
+#define __CLC_BODY <clc/shared/unary_decl.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_EXP2_FAST_H__
diff --git a/libclc/clc/include/clc/math/clc_log2_fast.h b/libclc/clc/include/clc/math/clc_log2_fast.h
new file mode 100644
index 0000000000000..5160afbedebf7
--- /dev/null
+++ b/libclc/clc/include/clc/math/clc_log2_fast.h
@@ -0,0 +1,19 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_MATH_CLC_LOG2_FAST_H__
+#define __CLC_MATH_CLC_LOG2_FAST_H__
+
+#define __CLC_FUNCTION __clc_log2_fast
+#define __CLC_BODY <clc/shared/unary_decl.inc>
+
+#include <clc/math/gentype.inc>
+
+#undef __CLC_FUNCTION
+
+#endif // __CLC_MATH_CLC_LOG2_FAST_H__
diff --git a/libclc/clc/include/clc/math/clc_pow.h b/libclc/clc/include/clc/math/clc_pow.h
index 5e37e5bf6da65..f7399873a4994 100644
--- a/libclc/clc/include/clc/math/clc_pow.h
+++ b/libclc/clc/include/clc/math/clc_pow.h
@@ -11,9 +11,14 @@
#define __CLC_BODY <clc/shared/binary_decl.inc>
#define __CLC_FUNCTION __clc_pow
-
#include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
+#define __CLC_FLOAT_ONLY
+#define __CLC_BODY <clc/shared/binary_decl.inc>
+#define __CLC_FUNCTION __clc_pow_fast
+#include <clc/math/gentype.inc>
#undef __CLC_FUNCTION
+#undef __CLC_FLOAT_ONLY
#endif // __CLC_MATH_CLC_POW_H__
diff --git a/libclc/clc/include/clc/math/clc_pown.h b/libclc/clc/include/clc/math/clc_pown.h
index 30628efb19001..3e2b359468b48 100644
--- a/libclc/clc/include/clc/math/clc_pown.h
+++ b/libclc/clc/include/clc/math/clc_pown.h
@@ -9,11 +9,16 @@
#ifndef __CLC_MATH_CLC_POWN_H__
#define __CLC_MATH_CLC_POWN_H__
-#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
#define __CLC_FUNCTION __clc_pown
-
+#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
#include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_pown_fast
+#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
+#include <clc/math/gentype.inc>
#undef __CLC_FUNCTION
+#undef __CLC_FLOAT_ONLY
#endif // __CLC_MATH_CLC_POWN_H__
diff --git a/libclc/clc/include/clc/math/clc_powr.h b/libclc/clc/include/clc/math/clc_powr.h
index baa494cce6989..67c591ca6aa82 100644
--- a/libclc/clc/include/clc/math/clc_powr.h
+++ b/libclc/clc/include/clc/math/clc_powr.h
@@ -9,11 +9,16 @@
#ifndef __CLC_MATH_CLC_POWR_H__
#define __CLC_MATH_CLC_POWR_H__
-#define __CLC_BODY <clc/shared/binary_decl.inc>
#define __CLC_FUNCTION __clc_powr
-
+#define __CLC_BODY <clc/shared/binary_decl.inc>
#include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_powr_fast
+#define __CLC_BODY <clc/shared/binary_decl.inc>
+#include <clc/math/gentype.inc>
#undef __CLC_FUNCTION
+#undef __CLC_FLOAT_ONLY
#endif // __CLC_MATH_CLC_POWR_H__
diff --git a/libclc/clc/include/clc/math/clc_rootn.h b/libclc/clc/include/clc/math/clc_rootn.h
index 90a25ad52d867..26d111a8671d4 100644
--- a/libclc/clc/include/clc/math/clc_rootn.h
+++ b/libclc/clc/include/clc/math/clc_rootn.h
@@ -9,11 +9,16 @@
#ifndef __CLC_MATH_CLC_ROOTN_H__
#define __CLC_MATH_CLC_ROOTN_H__
-#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
#define __CLC_FUNCTION __clc_rootn
-
+#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
#include <clc/math/gentype.inc>
+#undef __CLC_FUNCTION
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_rootn_fast
+#define __CLC_BODY <clc/shared/binary_decl_with_int_second_arg.inc>
+#include <clc/math/gentype.inc>
#undef __CLC_FUNCTION
+#undef __CLC_FLOAT_ONLY
#endif // __CLC_MATH_CLC_ROOTN_H__
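
Note: the four headers above all follow the same shape: define __CLC_FUNCTION and __CLC_BODY, include gentype.inc to stamp out one declaration per gentype, then repeat the include under __CLC_FLOAT_ONLY so that only the float types pick up the new *_fast overload. A rough plain-C illustration of that repeated-expansion technique; the DECL_BINARY macro and the op/op_fast names are invented for this sketch and are not the libclc machinery:

#include <stdio.h>

/* One "body" expanded once per type; a second, float-only pass adds _fast. */
#define DECL_BINARY(TYPE, NAME)                                                \
  static TYPE NAME##_##TYPE(TYPE x, TYPE y) { return x + y; }

DECL_BINARY(float, op)      /* pass 1: every type gets the plain entry point */
DECL_BINARY(double, op)
DECL_BINARY(float, op_fast) /* pass 2: only float gets the _fast flavour */

int main(void) {
  printf("%g %g %g\n", op_float(1.0f, 2.0f), op_double(1.0, 2.0),
         op_fast_float(1.0f, 2.0f));
  return 0;
}
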
diff --git a/libclc/clc/include/clc/shared/binary_def_scalarize_loop.inc b/libclc/clc/include/clc/shared/binary_def_scalarize_loop.inc
new file mode 100644
index 0000000000000..e2d84a5776c4c
--- /dev/null
+++ b/libclc/clc/include/clc/shared/binary_def_scalarize_loop.inc
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/utils.h"
+
+#ifndef __CLC_IMPL_FUNCTION
+#define __CLC_IMPL_FUNCTION __CLC_FUNCTION
+#endif
+
+#ifndef __CLC_RET_SCALAR_TYPE
+#define __CLC_RET_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#ifndef __CLC_ARG1_SCALAR_TYPE
+#define __CLC_ARG1_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#ifndef __CLC_ARG2_SCALAR_TYPE
+#define __CLC_ARG2_SCALAR_TYPE __CLC_SCALAR_GENTYPE
+#endif
+
+#define __CLC_RET_TYPE __CLC_XCONCAT(__CLC_RET_SCALAR_TYPE, __CLC_VECSIZE)
+#define __CLC_ARG1_TYPE __CLC_XCONCAT(__CLC_ARG1_SCALAR_TYPE, __CLC_VECSIZE)
+#define __CLC_ARG2_TYPE __CLC_XCONCAT(__CLC_ARG2_SCALAR_TYPE, __CLC_VECSIZE)
+
+#if __CLC_VECSIZE_OR_1 >= 2
+
+_CLC_OVERLOAD _CLC_DEF __CLC_RET_TYPE __CLC_FUNCTION(__CLC_ARG1_TYPE x,
+ __CLC_ARG2_TYPE y) {
+ union {
+ __CLC_ARG1_TYPE vec;
+ __CLC_ARG1_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+ } u_x;
+
+ union {
+ __CLC_ARG2_TYPE vec;
+ __CLC_ARG2_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+ } u_y;
+
+ union {
+ __CLC_RET_TYPE vec;
+ __CLC_RET_SCALAR_TYPE arr[__CLC_VECSIZE_OR_1];
+ } u_result;
+
+ u_x.vec = x;
+ u_y.vec = y;
+ for (int i = 0; i < __CLC_VECSIZE_OR_1; ++i) {
+ u_result.arr[i] = __CLC_IMPL_FUNCTION(u_x.arr[i], u_y.arr[i]);
+ }
+
+ return u_result.vec;
+}
+
+#endif // __CLC_VECSIZE_OR_1 >= 2
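
Note: the new binary_def_scalarize_loop.inc builds each vector overload by punning the vector through a union and invoking the scalar implementation lane by lane. A host-side sketch of the same trick, assuming the Clang/GCC vector extension in place of OpenCL's built-in float4 and libm's powf as the scalar kernel; none of these names come from libclc:

#include <math.h>
#include <stdio.h>

typedef float float4 __attribute__((vector_size(16)));

static float4 vec_powf(float4 x, float4 y) {
  /* Reinterpret the vectors as arrays, apply the scalar kernel per lane. */
  union {
    float4 vec;
    float arr[4];
  } u_x = {x}, u_y = {y}, u_r;

  for (int i = 0; i < 4; ++i)
    u_r.arr[i] = powf(u_x.arr[i], u_y.arr[i]);
  return u_r.vec;
}

int main(void) {
  float4 x = {2.0f, 3.0f, 4.0f, 5.0f};
  float4 y = {2.0f, 2.0f, 0.5f, 1.0f};
  union {
    float4 vec;
    float arr[4];
  } r = {vec_powf(x, y)};

  for (int i = 0; i < 4; ++i)
    printf("%g ", r.arr[i]); /* 4 9 2 5 */
  printf("\n");
  return 0;
}
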
diff --git a/libclc/clc/lib/amdgpu/CMakeLists.txt b/libclc/clc/lib/amdgpu/CMakeLists.txt
index 9b6c9a231ade0..daccc00b841b3 100644
--- a/libclc/clc/lib/amdgpu/CMakeLists.txt
+++ b/libclc/clc/lib/amdgpu/CMakeLists.txt
@@ -3,6 +3,7 @@ libclc_configure_source_list(CLC_AMDGPU_SOURCES
address_space/clc_qualifier.cl
math/clc_exp.cl
math/clc_exp2.cl
+ math/clc_exp2_fast.cl
math/clc_exp10.cl
math/clc_frexp.cl
math/clc_half_exp.cl
@@ -15,6 +16,7 @@ libclc_configure_source_list(CLC_AMDGPU_SOURCES
math/clc_half_rsqrt.cl
math/clc_half_sqrt.cl
math/clc_ldexp.cl
+ math/clc_log2_fast.cl
math/clc_native_exp.cl
math/clc_native_exp2.cl
math/clc_native_log10.cl
diff --git a/libclc/clc/lib/amdgpu/math/clc_exp2_fast.cl b/libclc/clc/lib/amdgpu/math/clc_exp2_fast.cl
new file mode 100644
index 0000000000000..b73bc9f6e260b
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/math/clc_exp2_fast.cl
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_MIN_VECSIZE 1
+#define __CLC_FUNCTION __clc_exp2_fast
+#define __CLC_IMPL_FUNCTION(x) __builtin_amdgcn_exp2f(x)
+#define __CLC_BODY <clc/shared/unary_def_scalarize.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_FLOAT_ONLY
+
+#define __CLC_HALF_ONLY
+#define __CLC_IMPL_FUNCTION(x) __clc_exp2
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_HALF_ONLY
+
+#define __CLC_DOUBLE_ONLY
+#define __CLC_IMPL_FUNCTION(x) __clc_exp2
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_DOUBLE_ONLY
diff --git a/libclc/clc/lib/amdgpu/math/clc_log2_fast.cl b/libclc/clc/lib/amdgpu/math/clc_log2_fast.cl
new file mode 100644
index 0000000000000..a47fc84b26a00
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/math/clc_log2_fast.cl
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_MIN_VECSIZE 1
+#define __CLC_FUNCTION __clc_log2_fast
+#define __CLC_IMPL_FUNCTION(x) __builtin_amdgcn_logf(x)
+#define __CLC_BODY <clc/shared/unary_def_scalarize.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_FLOAT_ONLY
+
+#define __CLC_HALF_ONLY
+#define __CLC_IMPL_FUNCTION(x) __clc_log2
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_HALF_ONLY
+
+#define __CLC_DOUBLE_ONLY
+#define __CLC_IMPL_FUNCTION(x) __clc_log2
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include <clc/math/gentype.inc>
+#undef __CLC_IMPL_FUNCTION
+#undef __CLC_DOUBLE_ONLY
diff --git a/libclc/clc/lib/generic/CMakeLists.txt b/libclc/clc/lib/generic/CMakeLists.txt
index 07200536328f3..bda2ec67a55c3 100644
--- a/libclc/clc/lib/generic/CMakeLists.txt
+++ b/libclc/clc/lib/generic/CMakeLists.txt
@@ -80,6 +80,7 @@ libclc_configure_source_list(CLC_GENERIC_SOURCES
math/clc_erfc.cl
math/clc_exp.cl
math/clc_exp2.cl
+ math/clc_exp2_fast.cl
math/clc_exp10.cl
math/clc_exp_helper.cl
math/clc_expm1.cl
@@ -114,6 +115,7 @@ libclc_configure_source_list(CLC_GENERIC_SOURCES
math/clc_lgamma_r.cl
math/clc_log.cl
math/clc_log2.cl
+ math/clc_log2_fast.cl
math/clc_log10.cl
math/clc_log1p.cl
math/clc_logb.cl
@@ -163,6 +165,7 @@ libclc_configure_source_list(CLC_GENERIC_SOURCES
math/clc_tanpi.cl
math/clc_tgamma.cl
math/clc_trunc.cl
+ mem_fence/clc_mem_fence.cl
misc/clc_shuffle.cl
misc/clc_shuffle2.cl
relational/clc_all.cl
diff --git a/libclc/clc/lib/generic/math/clc_cos.inc b/libclc/clc/lib/generic/math/clc_cos.inc
index 4b8108c086090..34cbbc69614a5 100644
--- a/libclc/clc/lib/generic/math/clc_cos.inc
+++ b/libclc/clc/lib/generic/math/clc_cos.inc
@@ -9,6 +9,8 @@
#if __CLC_FPSIZE == 32
_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN __clc_cos(__CLC_FLOATN x) {
+ x = __clc_isinf(x) ? __CLC_GENTYPE_NAN : x;
+
__CLC_FLOATN absx = __clc_fabs(x);
__CLC_FLOATN r0, r1;
@@ -18,11 +20,7 @@ _CLC_OVERLOAD _CLC_DEF __CLC_FLOATN __clc_cos(__CLC_FLOATN x) {
__CLC_FLOATN cc = __clc_cosf_piby4(r0, r1);
__CLC_FLOATN c = (regn & 1) != 0 ? ss : cc;
- c = __CLC_AS_FLOATN(__CLC_AS_INTN(c) ^ ((regn > 1) << 31));
-
- c = __clc_select(c, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x));
-
- return c;
+ return __CLC_AS_FLOATN(__CLC_AS_INTN(c) ^ ((regn > 1) << 31));
}
#elif __CLC_FPSIZE == 16
@@ -34,6 +32,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cos(__CLC_GENTYPE x) {
#elif __CLC_FPSIZE == 64
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cos(__CLC_GENTYPE x) {
+ x = __clc_isinf(x) ? __CLC_GENTYPE_NAN : x;
+
__CLC_GENTYPE absx = __clc_fabs(x);
__CLC_BIT_INTN is_medium = absx < 0x1.0p+47;
@@ -56,8 +56,7 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_cos(__CLC_GENTYPE x) {
__CLC_AS_LONGN(__CLC_CONVERT_BIT_INTN((regn & 1) != 0) ? sinval : cosval);
c ^= __CLC_CONVERT_BIT_INTN(regn > 1) << 63;
- return __clc_isnan(absx) | __clc_isinf(absx) ? __CLC_GENTYPE_NAN
- : __CLC_AS_GENTYPE(c);
+ return __CLC_AS_GENTYPE(c);
}
#endif
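
Note: the clc_cos change above hoists the special-case handling onto the input: cos(±inf) must be NaN, and once an infinite argument is rewritten to NaN the NaN simply propagates through the range reduction and polynomial, so the isnan/isinf select at the end of the old code becomes redundant. A tiny plain-C check of that propagation argument; the arithmetic below is a stand-in, not the real reduction:

#include <math.h>
#include <stdio.h>

static float cos_like(float x) {
  if (isinf(x))
    x = NAN;              /* hoisted handling, as in the new code */
  return x * 0.5f + 1.0f; /* stand-in for the reduction + polynomial */
}

int main(void) {
  /* finite input -> finite result; inf and NaN inputs both come out as NaN */
  printf("%f %f %f\n", cos_like(1.0f), cos_like(INFINITY), cos_like(NAN));
  return 0;
}
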
diff --git a/libclc/clc/lib/generic/math/clc_ep.cl b/libclc/clc/lib/generic/math/clc_ep.cl
index f0e3020f65f6f..56d902fd9b2aa 100644
--- a/libclc/clc/lib/generic/math/clc_ep.cl
+++ b/libclc/clc/lib/generic/math/clc_ep.cl
@@ -9,29 +9,15 @@
#include "clc/clc_convert.h"
#include "clc/math/clc_div_fast.h"
#include "clc/math/clc_ep.h"
+#include "clc/math/clc_exp.h"
#include "clc/math/clc_fma.h"
+#include "clc/math/clc_frexp.h"
#include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_mad.h"
#include "clc/math/clc_recip_fast.h"
#include "clc/math/clc_sqrt_fast.h"
#include "clc/relational/clc_isinf.h"
#include "clc/relational/clc_signbit.h"
-#ifdef cl_khr_fp16
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST static half ep_high_fp_bits(half x) {
- return __clc_as_half((ushort)(__clc_as_ushort(x) & (ushort)0xffc0U));
-}
-#endif
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST static float ep_high_fp_bits(float x) {
- return __clc_as_float(__clc_as_uint(x) & 0xfffff000U);
-}
-
-#ifdef cl_khr_fp64
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST static double ep_high_fp_bits(double x) {
- return __clc_as_double(__clc_as_ulong(x) & 0xfffffffff8000000UL);
-}
-#endif
-
#define __CLC_BODY <clc_ep.inc>
#include <clc/math/gentype.inc>
diff --git a/libclc/clc/lib/generic/math/clc_ep.inc b/libclc/clc/lib/generic/math/clc_ep.inc
index 38fa513c46aac..fc45c892b6c80 100644
--- a/libclc/clc/lib/generic/math/clc_ep.inc
+++ b/libclc/clc/lib/generic/math/clc_ep.inc
@@ -6,8 +6,6 @@
//
//===----------------------------------------------------------------------===//
-#ifdef __CLC_SCALAR
-
#if __CLC_FPSIZE == 16
#define __CLC_EP_USE_FMA true
#elif __CLC_FPSIZE == 32
@@ -27,24 +25,69 @@
#pragma OPENCL FP_CONTRACT OFF
+#if __CLC_FPSIZE == 16
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST static __CLC_GENTYPE
+ep_high_fp_bits(__CLC_GENTYPE x) {
+ return __CLC_AS_GENTYPE(__CLC_CONVERT_U_GENTYPE(
+ (__CLC_AS_U_GENTYPE(x) & (__CLC_U_GENTYPE)0xffc0U)));
+}
+
+#elif __CLC_FPSIZE == 32
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST static __CLC_GENTYPE
+ep_high_fp_bits(__CLC_GENTYPE x) {
+ return __CLC_AS_GENTYPE(__CLC_AS_U_GENTYPE(x) & 0xfffff000U);
+}
+
+#elif __CLC_FPSIZE == 64
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST static __CLC_GENTYPE
+ep_high_fp_bits(__CLC_GENTYPE x) {
+ return __CLC_AS_GENTYPE(__CLC_AS_U_GENTYPE(x) & 0xfffffffff8000000UL);
+}
+
+#endif
+
+typedef struct __CLC_XCONCAT(__clc_ep_int_pair_, __CLC_GENTYPE) {
+ __CLC_INTN lo, hi;
+} __CLC_XCONCAT(__clc_ep_int_pair_, __CLC_GENTYPE);
+
+#define __CLC_EP_INT_PAIR __CLC_XCONCAT(__clc_ep_int_pair_, __CLC_GENTYPE)
+
_CLC_OVERLOAD
-static bool samesign(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+static __CLC_S_GENTYPE samesign(__CLC_GENTYPE x, __CLC_GENTYPE y) {
return __clc_signbit(x) == __clc_signbit(y);
}
_CLC_DEF
_CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_absv(__CLC_EP_PAIR a) {
- return __clc_signbit(a.hi) ? -a : a;
+ __CLC_EP_PAIR ret;
+ __CLC_EP_PAIR neg_a = __clc_ep_neg(a);
+ ret.lo = __clc_signbit(a.hi) ? neg_a.lo : a.lo;
+ ret.hi = __clc_signbit(a.hi) ? neg_a.hi : a.hi;
+ return ret;
+}
+
+_CLC_DEF
+_CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_neg(__CLC_EP_PAIR a) {
+ a.lo = -a.lo;
+ a.hi = -a.hi;
+ return a;
}
_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR
__clc_ep_conditional_sign_match(__CLC_EP_PAIR a, __CLC_EP_PAIR b) {
- return samesign(a.hi, b.hi) ? a : -a;
+ __CLC_S_GENTYPE ss = samesign(a.hi, b.hi);
+ __CLC_EP_PAIR neg_a = __clc_ep_neg(a);
+
+ __CLC_EP_PAIR ret = {ss ? a.lo : neg_a.lo, ss ? a.hi : neg_a.hi};
+ return ret;
}
_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR
__clc_ep_make_pair(__CLC_GENTYPE a, __CLC_GENTYPE b) {
- return (__CLC_EP_PAIR)(b, a);
+ __CLC_EP_PAIR ret = {b, a};
+ return ret;
}
_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR
@@ -387,5 +430,85 @@ _CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_sqrt(__CLC_EP_PAIR a) {
a.hi == __CLC_FP_LIT(0.0) ? __CLC_FP_LIT(0.0) : slo);
}
-#undef __CLC_EP_USE_FMA
+#if __CLC_FPSIZE == 32
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_ep_exp(__CLC_EP_PAIR x) {
+ __CLC_GENTYPE d = x.hi == 0x1.62e430p+6f ? 0x1.0p-17f : 0.0f;
+ x.hi -= d;
+ x.lo += d;
+ __CLC_GENTYPE z = __clc_exp(x.hi);
+ __CLC_GENTYPE zz = __clc_fma(z, x.lo, z);
+ return __clc_isinf(z) ? z : zz;
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ln(__CLC_GENTYPE a) {
+ __CLC_INTN a_exp;
+ __CLC_GENTYPE m = __clc_frexp(a, &a_exp);
+ __CLC_INTN b = m < (2.0f / 3.0f);
+ m = __clc_ldexp(m, b);
+ __CLC_INTN e = a_exp - b;
+
+ __CLC_EP_PAIR x = __clc_ep_div(m - 1.0f, __clc_ep_fast_add(1.0f, m));
+ __CLC_EP_PAIR s = __clc_ep_sqr(x);
+ __CLC_GENTYPE t = s.hi;
+ __CLC_GENTYPE p = __clc_mad(t, __clc_mad(t, 0x1.ed89c2p-3f, 0x1.23e988p-2f),
+ 0x1.999bdep-2f);
+
+ // ln(2)*e + 2*x + x^3(c3 + x^2*p)
+ __CLC_EP_PAIR r = __clc_ep_add(
+ __clc_ep_mul(__clc_ep_make_pair(__CLC_FP_LIT(0x1.62e430p-1),
+ __CLC_FP_LIT(-0x1.05c610p-29)),
+ __CLC_CONVERT_GENTYPE(e)),
+ __clc_ep_fast_add(
+ __clc_ep_ldexp(x, 1),
+ __clc_ep_mul(__clc_ep_mul(s, x),
+ __clc_ep_fast_add(
+ __clc_ep_make_pair(__CLC_FP_LIT(0x1.555554p-1),
+ __CLC_FP_LIT(0x1.e72020p-29)),
+ __clc_ep_mul(s, p)))));
+ return r;
+}
+
+#elif __CLC_FPSIZE == 64
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_ep_exp(__CLC_EP_PAIR x) {
+ __CLC_GENTYPE z = __clc_exp(x.hi);
+ __CLC_GENTYPE zz = __clc_mad(z, x.lo, z);
+ return __clc_isinf(z) ? z : zz;
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_EP_PAIR __clc_ep_ln(__CLC_DOUBLEN a) {
+ __CLC_INTN a_exp;
+ __CLC_DOUBLEN m = __clc_frexp(a, &a_exp);
+ __CLC_INTN b = __CLC_CONVERT_INTN(m < __CLC_FP_LIT(2.0 / 3.0));
+ m = __clc_ldexp(m, b);
+ __CLC_INTN e = a_exp - b;
+
+ __CLC_EP_PAIR x = __clc_ep_div(m - 1.0, __clc_ep_fast_add(1.0, m));
+ __CLC_EP_PAIR s = __clc_ep_sqr(x);
+ __CLC_DOUBLEN t = s.hi;
+ __CLC_DOUBLEN p = __clc_mad(t, __clc_mad(t, __clc_mad(t, __clc_mad(t,
+ __clc_mad(t, __clc_mad(t, __clc_mad(t, __clc_mad(t,
+ 0x1.dee674222de17p-4, 0x1.a6564968915a9p-4), 0x1.e25e43abe935ap-4), 0x1.110ef47e6c9c2p-3),
+ 0x1.3b13bcfa74449p-3), 0x1.745d171bf3c30p-3), 0x1.c71c71c7792cep-3), 0x1.24924924920dap-2),
+ 0x1.999999999999cp-2);
+
+ // ln(2)*e + 2*x + x^3(c3 + x^2*p)
+ __CLC_EP_PAIR r = __clc_ep_add(
+ __clc_ep_mul(__clc_ep_make_pair(__CLC_FP_LIT(0x1.62e42fefa39efp-1),
+ __CLC_FP_LIT(0x1.abc9e3b39803fp-56)),
+ __CLC_CONVERT_GENTYPE(e)),
+ __clc_ep_fast_add(
+ __clc_ep_ldexp(x, 1),
+ __clc_ep_mul(
+ __clc_ep_mul(s, x),
+ __clc_ep_fast_add(
+ __clc_ep_make_pair(__CLC_FP_LIT(0x1.5555555555555p-1),
+ __CLC_FP_LIT(0x1.543b0d5df274dp-55)),
+ __clc_ep_mul(s, p)))));
+ return r;
+}
+
#endif
+
+#undef __CLC_EP_USE_FMA
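
Note: the new __clc_ep_exp above evaluates exp of a hi/lo pair as fma(exp(hi), lo, exp(hi)): exp(hi + lo) = exp(hi) * exp(lo), and since |lo| is tiny, exp(lo) ~= 1 + lo, so one fma folds the low word back in; the isinf guard keeps an overflowed exp(hi) from being corrupted by that fma. A plain-C check of the identity with arbitrarily chosen values, not the libclc code:

#include <math.h>
#include <stdio.h>

int main(void) {
  double hi = 0.7, lo = 3e-18; /* pair value is hi + lo */
  double z = exp(hi);
  double zz = fma(z, lo, z);   /* exp(hi) * (1 + lo) */
  /* Both lines print the same double: the identity holds to full precision. */
  printf("%.17g\n%.17g\n", zz, exp(hi + lo));
  return 0;
}
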
diff --git a/libclc/clc/lib/generic/math/clc_exp2_fast.cl b/libclc/clc/lib/generic/math/clc_exp2_fast.cl
new file mode 100644
index 0000000000000..e09bd65d7e02a
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_exp2_fast.cl
@@ -0,0 +1,15 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+
+#define __CLC_FUNCTION __clc_exp2_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_exp2
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_log2_fast.cl b/libclc/clc/lib/generic/math/clc_log2_fast.cl
new file mode 100644
index 0000000000000..2aad63967e888
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_log2_fast.cl
@@ -0,0 +1,15 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+
+#define __CLC_FUNCTION __clc_log2_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_log2
+#define __CLC_BODY <clc/shared/unary_def.inc>
+#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_log_base.h b/libclc/clc/lib/generic/math/clc_log_base.h
index e7879c9dccedb..51c92fafdad2c 100644
--- a/libclc/clc/lib/generic/math/clc_log_base.h
+++ b/libclc/clc/lib/generic/math/clc_log_base.h
@@ -193,13 +193,14 @@ __clc_log(double a)
m = __clc_ldexp(m, b);
int e = a_exp - b;
- double2 x = __clc_ep_div(m - 1.0, __clc_ep_fast_add(1.0, m));
+ __clc_ep_pair_double x = __clc_ep_div(m - 1.0, __clc_ep_fast_add(1.0, m));
double s = x.hi * x.hi;
double p = __clc_mad(s, __clc_mad(s, __clc_mad(s,
__clc_mad(s, __clc_mad(s, __clc_mad(s, 0x1.3ab76bf559e2bp-3, 0x1.385386b47b09ap-3),
0x1.7474dd7f4df2ep-3), 0x1.c71c016291751p-3),
0x1.249249b27acf1p-2), 0x1.99999998ef7b6p-2), 0x1.5555555555780p-1);
- double2 r = __clc_ep_fast_add(__clc_ep_ldexp(x, 1), s * x.hi * p);
+ __clc_ep_pair_double r =
+ __clc_ep_fast_add(__clc_ep_ldexp(x, 1), s * x.hi * p);
#if defined COMPILING_LOG2
r = __clc_ep_add(
diff --git a/libclc/clc/lib/generic/math/clc_pow.cl b/libclc/clc/lib/generic/math/clc_pow.cl
index 70d3d614a8d36..eba5daa1a1e41 100644
--- a/libclc/clc/lib/generic/math/clc_pow.cl
+++ b/libclc/clc/lib/generic/math/clc_pow.cl
@@ -6,16 +6,35 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/clc_convert.h>
-#include <clc/internal/clc.h>
-#include <clc/math/clc_fabs.h>
-#include <clc/math/clc_fma.h>
-#include <clc/math/clc_ldexp.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/clc_subnormal_config.h>
-#include <clc/math/math.h>
-#include <clc/math/tables.h>
-#include <clc/relational/clc_select.h>
+#include "clc/clc_convert.h"
+#include "clc/float/definitions.h"
+#include "clc/internal/clc.h"
+#include "clc/math/clc_copysign.h"
+#include "clc/math/clc_ep.h"
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+#include "clc/math/clc_fabs.h"
+#include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_log.h"
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+#include "clc/math/clc_mad.h"
+#include "clc/math/clc_recip_fast.h"
+#include "clc/math/clc_trunc.h"
+#include "clc/math/math.h"
+#include "clc/relational/clc_isinf.h"
+#include "clc/relational/clc_isunordered.h"
-#define __CLC_BODY <clc_pow.inc>
-#include <clc/math/gentype.inc>
+#define __CLC_COMPILING_POW
+#define __CLC_BODY "clc_pow_base.inc"
+#include "clc/math/gentype.inc"
+
+#define __CLC_FUNCTION __clc_pow
+#define __CLC_BODY "clc/shared/binary_def_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_pow_fast
+#define __CLC_BODY "clc/shared/binary_def_scalarize.inc"
+#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_pow.inc b/libclc/clc/lib/generic/math/clc_pow.inc
deleted file mode 100644
index 35cbcdae8ffff..0000000000000
--- a/libclc/clc/lib/generic/math/clc_pow.inc
+++ /dev/null
@@ -1,438 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Computes pow using log and exp
-//
-// x^y = exp(y * log(x))
-//
-// We take care not to lose precision in the intermediate steps
-//
-// When computing log, calculate it in splits:
-//
-// r = f * (p_invead + p_inv_tail)
-// r = rh + rt
-//
-// Calculate log polynomial using r, in end addition, do:
-//
-// poly = poly + ((rh-r) + rt)
-//
-// lth = -r
-// ltt = ((xexp * log2_t) - poly) + logT
-// lt = lth + ltt
-//
-// lh = (xexp * log2_h) + logH
-// l = lh + lt
-//
-// Calculate final log answer as gh and gt:
-//
-// gh = l & higher-half bits
-// gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
-//
-// yh = y & higher-half bits
-// yt = y - yh
-//
-// Before entering computation of exp:
-//
-// vs = ((yt*gt + yt*gh) + yh*gt)
-// v = vs + yh*gh
-// vt = ((yh*gh - v) + vs)
-//
-// In calculation of exp, add vt to r that is used for poly.
-//
-// At the end of exp, do
-//
-// ((((expT * poly) + expT) + expH*poly) + expH)
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
- __CLC_GENTYPE y) {
- __CLC_GENTYPE absx = __clc_fabs(x);
- __CLC_INTN ix = __CLC_AS_INTN(x);
- __CLC_INTN ax = __CLC_AS_INTN(absx);
- __CLC_INTN xpos = ix == ax;
-
- __CLC_INTN iy = __CLC_AS_INTN(y);
- __CLC_INTN ay = __CLC_AS_INTN(__clc_fabs(y));
- __CLC_INTN ypos = iy == ay;
-
- /* Extra precise log calculation
- * First handle case that x is close to 1
- */
- __CLC_GENTYPE r = 1.0f - absx;
- __CLC_INTN near1 = __clc_fabs(r) < 0x1.0p-4f;
- __CLC_GENTYPE r2 = r * r;
-
- /* Coefficients are just 1/3, 1/4, 1/5 and 1/6 */
- __CLC_GENTYPE poly = __clc_mad(
- r,
- __clc_mad(r,
- __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
- 0x1.99999ap-3f),
- 0x1.000000p-2f),
- 0x1.555556p-2f);
-
- poly *= r2 * r;
-
- __CLC_GENTYPE lth_near1 = -r2 * 0.5f;
- __CLC_GENTYPE ltt_near1 = -poly;
- __CLC_GENTYPE lt_near1 = lth_near1 + ltt_near1;
- __CLC_GENTYPE lh_near1 = -r;
- __CLC_GENTYPE l_near1 = lh_near1 + lt_near1;
-
- /* Computations for x not near 1 */
- __CLC_INTN m = __CLC_CONVERT_INTN(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
- __CLC_GENTYPE mf = __CLC_CONVERT_GENTYPE(m);
- __CLC_INTN ixs = __CLC_AS_INTN(__CLC_AS_GENTYPE(ax | 0x3f800000) - 1.0f);
- __CLC_GENTYPE mfs = __CLC_CONVERT_GENTYPE((ixs >> EXPSHIFTBITS_SP32) - 253);
- __CLC_INTN c = m == -127;
- __CLC_INTN ixn = c ? ixs : ax;
- __CLC_GENTYPE mfn = c ? mfs : mf;
-
- __CLC_INTN indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
- /* F - Y */
- __CLC_GENTYPE f = __CLC_AS_GENTYPE(0x3f000000 | indx) -
- __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32));
-
- indx = indx >> 16;
- __CLC_GENTYPE rh = f * __CLC_USE_TABLE(log_inv_tbl_ep_head, indx);
- __CLC_GENTYPE rt = f * __CLC_USE_TABLE(log_inv_tbl_ep_tail, indx);
- r = rh + rt;
-
- poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
- (r * r);
- poly += (rh - r) + rt;
-
- const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f; /* 0.693115234 */
- const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; /* 0.0000319461833 */
- __CLC_GENTYPE logel = __CLC_USE_TABLE(loge_tbl_lo, indx);
- __CLC_GENTYPE logeh = __CLC_USE_TABLE(loge_tbl_hi, indx);
- __CLC_GENTYPE lth = -r;
- __CLC_GENTYPE ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + logeh;
- __CLC_GENTYPE lt = lth + ltt;
- __CLC_GENTYPE lh = __clc_mad(mfn, LOG2_HEAD, logel);
- __CLC_GENTYPE l = lh + lt;
-
- /* Select near 1 or not */
- lth = near1 ? lth_near1 : lth;
- ltt = near1 ? ltt_near1 : ltt;
- lt = near1 ? lt_near1 : lt;
- lh = near1 ? lh_near1 : lh;
- l = near1 ? l_near1 : l;
-
- __CLC_GENTYPE gh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(l) & 0xfffff000);
- __CLC_GENTYPE gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
- __CLC_GENTYPE yh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(iy) & 0xfffff000);
-
- __CLC_GENTYPE yt = y - yh;
-
- __CLC_GENTYPE ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
- __CLC_GENTYPE ylogx = __clc_mad(yh, gh, ylogx_s);
- __CLC_GENTYPE ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
-
- /* Extra precise exp of ylogx */
- /* 64/log2 : 92.332482616893657 */
- const __CLC_GENTYPE R_64_BY_LOG2 = 0x1.715476p+6f;
- __CLC_INTN n = __CLC_CONVERT_INTN(ylogx * R_64_BY_LOG2);
- __CLC_GENTYPE nf = __CLC_CONVERT_GENTYPE(n);
-
- __CLC_INTN j = n & 0x3f;
- m = n >> 6;
- __CLC_INTN m2 = m << EXPSHIFTBITS_SP32;
-
- /* log2/64 lead: 0.0108032227 */
- const __CLC_GENTYPE R_LOG2_BY_64_LD = 0x1.620000p-7f;
- /* log2/64 tail: 0.0000272020388 */
- const __CLC_GENTYPE R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
- r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
- ylogx_t;
-
- /* Truncated Taylor series for e^r */
- poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
- 0x1.000000p-1f),
- r * r, r);
-
- __CLC_GENTYPE exp_head = __CLC_USE_TABLE(exp_tbl_ep_head, j);
- __CLC_GENTYPE exp_tail = __CLC_USE_TABLE(exp_tbl_ep_tail, j);
-
- __CLC_GENTYPE expylogx =
- __clc_mad(exp_head, poly, __clc_mad(exp_tail, poly, exp_tail)) + exp_head;
- __CLC_GENTYPE sexpylogx =
- expylogx * __CLC_AS_GENTYPE((__CLC_UINTN)0x1 << (m + 149));
- __CLC_GENTYPE texpylogx = __CLC_AS_GENTYPE(__CLC_AS_INTN(expylogx) + m2);
- expylogx = m < -125 ? sexpylogx : texpylogx;
-
- /* Result is +-Inf if (ylogx + ylogx_t) > 128*log2 */
- expylogx =
- __clc_select(expylogx, __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32),
- ylogx > 0x1.62e430p+6f ||
- (ylogx == 0x1.62e430p+6f && ylogx_t > -0x1.05c610p-22f));
-
- /* Result is 0 if ylogx < -149*log2 */
- expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
- /* Classify y:
- * inty = 0 means not an integer.
- * inty = 1 means odd integer.
- * inty = 2 means even integer.
- */
-
- __CLC_INTN yexp =
- __CLC_CONVERT_INTN(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1;
- __CLC_INTN mask = ((__CLC_INTN)1 << (24 - yexp)) - 1;
- __CLC_INTN yodd = ((iy >> (24 - yexp)) & 0x1) != 0;
- __CLC_INTN inty = yodd ? 1 : 2;
- inty = (iy & mask) != 0 ? 0 : inty;
- inty = yexp < 1 ? 0 : inty;
- inty = yexp > 24 ? 2 : inty;
-
- __CLC_GENTYPE signval =
- __CLC_AS_GENTYPE((__CLC_AS_UINTN(expylogx) ^ SIGNBIT_SP32));
- expylogx = ((inty == 1) && !xpos) ? signval : expylogx;
- __CLC_INTN ret = __CLC_AS_INTN(expylogx);
-
- /* Corner case handling */
- ret = (!xpos && (inty == 0)) ? QNANBITPATT_SP32 : ret;
- ret = ax < 0x3f800000 && iy == (__CLC_INTN)NINFBITPATT_SP32 ? PINFBITPATT_SP32
- : ret;
- ret = ax > 0x3f800000 && iy == (__CLC_INTN)NINFBITPATT_SP32 ? 0 : ret;
- ret = ax < 0x3f800000 && iy == (__CLC_INTN)PINFBITPATT_SP32 ? 0 : ret;
- ret = ax > 0x3f800000 && iy == (__CLC_INTN)PINFBITPATT_SP32 ? PINFBITPATT_SP32
- : ret;
- __CLC_BIT_INTN x_is_ninf = ix == (__CLC_INTN)NINFBITPATT_SP32;
- __CLC_BIT_INTN x_is_pinf = ix == (__CLC_INTN)PINFBITPATT_SP32;
- __CLC_INTN xinf =
- xpos ? (__CLC_INTN)PINFBITPATT_SP32 : (__CLC_INTN)NINFBITPATT_SP32;
-
- ret = ((ax == 0) && !ypos && (inty == 1)) ? xinf : ret;
- ret = ((ax == 0) && !ypos && (inty != 1)) ? PINFBITPATT_SP32 : ret;
- __CLC_INTN xzero = xpos ? (__CLC_INTN)0 : (__CLC_INTN)0x80000000;
- ret = ((ax == 0) && ypos && (inty == 1)) ? xzero : ret;
- ret = ((ax == 0) && ypos && (inty != 1)) ? 0 : ret;
- ret = ((ax == 0) && (iy == (__CLC_INTN)NINFBITPATT_SP32)) ? PINFBITPATT_SP32
- : ret;
- ret = (ix == (__CLC_INTN)0xbf800000 && ay == PINFBITPATT_SP32) ? 0x3f800000
- : ret;
- ret = (x_is_ninf && !ypos && (inty == 1)) ? (__CLC_INTN)0x80000000 : ret;
- ret = (x_is_ninf && !ypos && (inty != 1)) ? 0 : ret;
- ret = (x_is_ninf && ypos && (inty == 1)) ? (__CLC_INTN)NINFBITPATT_SP32 : ret;
- ret = (x_is_ninf && ypos && (inty != 1)) ? (__CLC_INTN)PINFBITPATT_SP32 : ret;
- ret = (x_is_pinf && !ypos) ? 0 : ret;
- ret = (x_is_pinf && ypos) ? PINFBITPATT_SP32 : ret;
- ret = (ax > PINFBITPATT_SP32) ? ix : ret;
- ret = (ay > PINFBITPATT_SP32) ? iy : ret;
- ret = ay == 0 ? 0x3f800000 : ret;
- ret = ix == 0x3f800000 ? 0x3f800000 : ret;
-
- return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 64
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
- __CLC_GENTYPE y) {
- const __CLC_GENTYPE real_log2_tail = 5.76999904754328540596e-08;
- const __CLC_GENTYPE real_log2_lead = 6.93147122859954833984e-01;
-
- __CLC_LONGN ux = __CLC_AS_LONGN(x);
- __CLC_LONGN ax = __CLC_AS_LONGN(__clc_fabs(x));
- __CLC_BIT_INTN xpos = ax == ux;
-
- __CLC_LONGN uy = __CLC_AS_LONGN(y);
- __CLC_LONGN ay = __CLC_AS_LONGN(__clc_fabs(y));
- __CLC_BIT_INTN ypos = ay == uy;
-
- // Extended precision log
- __CLC_GENTYPE v, vt;
- {
- __CLC_INTN exp = __CLC_CONVERT_INTN(ax >> 52) - 1023;
- __CLC_INTN mask_exp_1023 = exp == (__CLC_INTN)-1023;
- __CLC_GENTYPE xexp = __CLC_CONVERT_GENTYPE(exp);
- __CLC_LONGN mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
- __CLC_LONGN temp_ux =
- __CLC_AS_LONGN(__CLC_AS_GENTYPE(0x3ff0000000000000L | mantissa) - 1.0);
- exp = __CLC_CONVERT_INTN((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
- __CLC_GENTYPE xexp1 = __CLC_CONVERT_GENTYPE(exp);
- __CLC_LONGN mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
- xexp = __CLC_CONVERT_LONGN(mask_exp_1023) ? xexp1 : xexp;
- mantissa = __CLC_CONVERT_LONGN(mask_exp_1023) ? mantissa1 : mantissa;
-
- __CLC_LONGN rax = (mantissa & 0x000ff00000000000) +
- ((mantissa & 0x0000080000000000) << 1);
- __CLC_INTN index = __CLC_CONVERT_INTN(rax >> 44);
-
- __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L);
- __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L);
- __CLC_GENTYPE f = F - Y;
- __CLC_GENTYPE log_h = __CLC_USE_TABLE(log_f_inv_tbl_head, index);
- __CLC_GENTYPE log_t = __CLC_USE_TABLE(log_f_inv_tbl_tail, index);
- __CLC_GENTYPE f_inv = (log_h + log_t) * f;
- __CLC_GENTYPE r1 =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L);
- __CLC_GENTYPE r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
- __CLC_GENTYPE r = r1 + r2;
-
- __CLC_GENTYPE poly = __clc_fma(
- r,
- __clc_fma(r,
- __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
- 1.0 / 4.0),
- 1.0 / 3.0);
- poly = poly * r * r * r;
-
- __CLC_GENTYPE hr1r1 = 0.5 * r1 * r1;
- __CLC_GENTYPE poly0h = r1 + hr1r1;
- __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1;
- poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;
-
- log_h = __CLC_USE_TABLE(powlog_tbl_head, index);
- log_t = __CLC_USE_TABLE(powlog_tbl_tail, index);
-
- __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
- __CLC_GENTYPE resT = resT_t - poly0h;
- __CLC_GENTYPE resH = __clc_fma(xexp, real_log2_lead, log_h);
- __CLC_GENTYPE resT_h = poly0h;
-
- __CLC_GENTYPE H = resT + resH;
- __CLC_GENTYPE H_h =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(H) & 0xfffffffff8000000L);
- __CLC_GENTYPE T =
- (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
- H = H_h;
-
- __CLC_GENTYPE y_head =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(uy) & 0xfffffffff8000000L);
- __CLC_GENTYPE y_tail = y - y_head;
-
- __CLC_GENTYPE temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
- v = __clc_fma(y_head, H, temp);
- vt = __clc_fma(y_head, H, -v) + temp;
- }
-
- // Now calculate exp of (v,vt)
-
- __CLC_GENTYPE expv;
- {
- const __CLC_GENTYPE max_exp_arg = 709.782712893384;
- const __CLC_GENTYPE min_exp_arg = -745.1332191019411;
- const __CLC_GENTYPE sixtyfour_by_lnof2 = 92.33248261689366;
- const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081;
- const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10;
-
- // If v is so large that we need to return INFINITY, or so small that we
- // need to return 0, set v to known values that will produce that result. Do
- // not try to continue the computation with the original v and patch it up
- // afterwards because v may be so large that temp is out of range of int, in
- // which case that conversion, and a value based on that conversion being
- // passed to __clc_ldexp, results in undefined behavior.
- v = v > max_exp_arg ? 1000.0 : v;
- v = v < min_exp_arg ? -1000.0 : v;
-
- __CLC_GENTYPE temp = v * sixtyfour_by_lnof2;
- __CLC_INTN n = __CLC_CONVERT_INTN(temp);
- __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n);
- __CLC_INTN j = n & 0x0000003f;
- __CLC_INTN m = n >> 6;
-
- __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j);
- __CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j);
- __CLC_GENTYPE f = f1 + f2;
-
- __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v);
- __CLC_GENTYPE r2 = dn * lnof2_by_64_tail;
- __CLC_GENTYPE r = (r1 + r2) + vt;
-
- __CLC_GENTYPE q =
- __clc_fma(r,
- __clc_fma(r,
- __clc_fma(r,
- __clc_fma(r, 1.38889490863777199667e-03,
- 8.33336798434219616221e-03),
- 4.16666666662260795726e-02),
- 1.66666666665260878863e-01),
- 5.00000000000000008883e-01);
- q = __clc_fma(r * r, q, r);
-
- expv = __clc_fma(f, q, f2) + f1;
- expv = __clc_ldexp(expv, m);
- }
-
- // See whether y is an integer.
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- __CLC_LONGN inty;
- {
- __CLC_INTN yexp =
- __CLC_CONVERT_INTN(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1;
- inty = __CLC_CONVERT_LONGN(yexp < 1 ? 0 : 2);
- inty = __CLC_CONVERT_LONGN(yexp > 53) ? 2 : inty;
- __CLC_LONGN mask = ((__CLC_LONGN)1L << (53 - yexp)) - 1L;
- __CLC_LONGN inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1L : 2L;
- inty1 = (ay & mask) != 0 ? 0 : inty1;
- inty = __CLC_CONVERT_LONGN(!(yexp < 1) && !(yexp > 53)) ? inty1 : inty;
- }
-
- expv *= (inty == 1) && !xpos ? -1.0 : 1.0;
-
- __CLC_LONGN ret = __CLC_AS_LONGN(expv);
-
- // Now all the edge cases
- __CLC_BIT_INTN x_is_ninf = ux == (__CLC_LONGN)NINFBITPATT_DP64;
- __CLC_BIT_INTN x_is_pinf = ux == (__CLC_LONGN)PINFBITPATT_DP64;
- __CLC_BIT_INTN y_is_ninf = uy == (__CLC_LONGN)NINFBITPATT_DP64;
- __CLC_BIT_INTN y_is_pinf = uy == (__CLC_LONGN)PINFBITPATT_DP64;
- ret = !xpos && (inty == 0) ? QNANBITPATT_DP64 : ret;
- ret = ax < 0x3ff0000000000000L && y_is_ninf ? PINFBITPATT_DP64 : ret;
- ret = ax > 0x3ff0000000000000L && y_is_ninf ? 0L : ret;
- ret = ax < 0x3ff0000000000000L && y_is_pinf ? 0L : ret;
- ret = ax > 0x3ff0000000000000L && y_is_pinf ? PINFBITPATT_DP64 : ret;
- __CLC_LONGN xinf =
- xpos ? (__CLC_LONGN)PINFBITPATT_DP64 : (__CLC_LONGN)NINFBITPATT_DP64;
- ret = ((ax == 0L) && !ypos && (inty == 1)) ? xinf : ret;
- ret = ((ax == 0L) && !ypos && (inty != 1)) ? PINFBITPATT_DP64 : ret;
- __CLC_LONGN xzero = xpos ? (__CLC_LONGN)0L : (__CLC_LONGN)0x8000000000000000L;
- ret = ((ax == 0L) && ypos && (inty == 1)) ? xzero : ret;
- ret = ((ax == 0L) && ypos && (inty != 1)) ? 0L : ret;
- ret = ((ax == 0L) && y_is_ninf) ? PINFBITPATT_DP64 : ret;
- ret = ((ux == (__CLC_LONGN)0xbff0000000000000L) && (ay == PINFBITPATT_DP64))
- ? 0x3ff0000000000000L
- : ret;
- ret = (x_is_ninf && !ypos && (inty == 1)) ? (__CLC_LONGN)0x8000000000000000L
- : ret;
- ret = (x_is_ninf && !ypos && (inty != 1)) ? 0L : ret;
- ret =
- (x_is_ninf && ypos && (inty == 1)) ? (__CLC_LONGN)NINFBITPATT_DP64 : ret;
- ret =
- (x_is_ninf && ypos && (inty != 1)) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret;
- ret = x_is_pinf && !ypos ? 0L : ret;
- ret = x_is_pinf && ypos ? PINFBITPATT_DP64 : ret;
- ret = ax > PINFBITPATT_DP64 ? ux : ret;
- ret = ay > PINFBITPATT_DP64 ? uy : ret;
- ret = ay == 0L ? 0x3ff0000000000000L : ret;
- ret = ux == 0x3ff0000000000000L ? 0x3ff0000000000000L : ret;
-
- return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
- __CLC_GENTYPE y) {
- return __CLC_CONVERT_GENTYPE(
- __clc_pow(__CLC_CONVERT_FLOATN(x), __CLC_CONVERT_FLOATN(y)));
-}
-
-#endif
diff --git a/libclc/clc/lib/generic/math/clc_pow_base.inc b/libclc/clc/lib/generic/math/clc_pow_base.inc
new file mode 100644
index 0000000000000..016a506c41487
--- /dev/null
+++ b/libclc/clc/lib/generic/math/clc_pow_base.inc
@@ -0,0 +1,542 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Computes pow using log and exp
+//
+// x^y = exp(y * log(x))
+//
+// We take care not to lose precision in the intermediate steps.
+//
+// When computing log, calculate it in splits:
+//
+// r = f * (p_inv_head + p_inv_tail)
+// r = rh + rt
+//
+// Calculate the log polynomial using r; in the final addition, do:
+//
+// poly = poly + ((rh-r) + rt)
+//
+// lth = -r
+// ltt = ((xexp * log2_t) - poly) + logT
+// lt = lth + ltt
+//
+// lh = (xexp * log2_h) + logH
+// l = lh + lt
+//
+// Calculate final log answer as gh and gt:
+//
+// gh = l & higher-half bits
+// gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
+//
+// yh = y & higher-half bits
+// yt = y - yh
+//
+// Before entering computation of exp:
+//
+// vs = ((yt*gt + yt*gh) + yh*gt)
+// v = vs + yh*gh
+// vt = ((yh*gh - v) + vs)
+//
+// In calculation of exp, add vt to r that is used for poly.
+//
+// At the end of exp, do:
+//
+// ((((expT * poly) + expT) + expH*poly) + expH)
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR
+
+#ifdef __CLC_COMPILING_POW
+
+_CLC_OVERLOAD _CLC_CONST static bool is_integer(__CLC_GENTYPE ay) {
+ return __clc_trunc(ay) == ay;
+}
+
+_CLC_OVERLOAD _CLC_CONST static bool is_even_integer(__CLC_GENTYPE ay) {
+ // Even integers are still integers after division by 2.
+ return is_integer(__CLC_FP_LIT(0.5) * ay);
+}
+
+_CLC_OVERLOAD _CLC_CONST static bool is_odd_integer(__CLC_GENTYPE ay) {
+ return is_integer(ay) && !is_even_integer(ay);
+}
+#endif
+
+#if __CLC_FPSIZE == 32
+
+_CLC_CONST
+static __CLC_GENTYPE fast_expylnx(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ return __clc_exp2(y * __clc_log2(ax));
+}
+
+#if defined(__CLC_COMPILING_POW) || defined(__CLC_COMPILING_POWR)
+
+_CLC_CONST
+static __CLC_GENTYPE compute_expylnx_float(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ return __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+}
+#endif
+
+#if defined(__CLC_COMPILING_POW)
+
+_CLC_CONST
+static __CLC_GENTYPE pow_fixup(__CLC_GENTYPE x, __CLC_GENTYPE y,
+ __CLC_GENTYPE expylnx) {
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ bool is_odd_y = is_odd_integer(y);
+
+ __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0f);
+
+ // Now all the edge cases
+ if (x < 0.0f && !is_integer(y))
+ ret = FLT_NAN;
+
+ __CLC_GENTYPE ay = __clc_fabs(y);
+ if (__clc_isinf(ay)) {
+ // FIXME: Missing backend optimization to save on
+ // materialization cost of mixed sign constant infinities.
+ bool y_is_neg_inf = y != ay;
+ ret = ax == 1.0f ? ax : ((ax < 1.0f) ^ y_is_neg_inf ? 0.0f : ay);
+ }
+
+ if (__clc_isinf(ax) || x == 0.0f)
+ ret = __clc_copysign((x == 0.0f) ^ (y < 0.0f) ? 0.0f : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0f);
+
+ if (__clc_isunordered(x, y))
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
+ __CLC_GENTYPE y) {
+ if (x == 1.0f)
+ y = 1.0f;
+ if (y == 0.0f)
+ x = 1.0f;
+
+ __CLC_GENTYPE expylnx = compute_expylnx_float(x, y);
+ return pow_fixup(x, y, expylnx);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_pow_fast(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+ if (x == 1.0f)
+ y = 1.0f;
+ if (y == 0.0f)
+ x = 1.0f;
+
+ __CLC_GENTYPE expylnx = fast_expylnx(x, y);
+ return pow_fixup(x, y, expylnx);
+}
+
+#elif defined(__CLC_COMPILING_POWR)
+
+_CLC_CONST
+static __CLC_GENTYPE powr_fixup(__CLC_GENTYPE x, __CLC_GENTYPE y,
+ __CLC_GENTYPE expylnx) {
+ __CLC_GENTYPE ret = expylnx;
+
+ // Now all the edge cases
+ __CLC_GENTYPE iz = y < 0.0f ? __CLC_GENTYPE_INF : 0.0f;
+ __CLC_GENTYPE zi = y < 0.0f ? 0.0f : __CLC_GENTYPE_INF;
+
+ if (x == 0.0f)
+ ret = y == 0.0f ? __CLC_GENTYPE_NAN : iz;
+
+ if (x == __CLC_GENTYPE_INF && y != 0.0f)
+ ret = zi;
+
+ if (__clc_isinf(y) && x != 1.0f)
+ ret = x < 1.0f ? iz : zi;
+
+ if (__clc_isunordered(x, y))
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
+ __CLC_GENTYPE y) {
+ if (x < 0.0f)
+ x = __CLC_GENTYPE_NAN;
+
+ __CLC_GENTYPE expylnx = compute_expylnx_float(x, y);
+ return powr_fixup(x, y, expylnx);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_powr_fast(__CLC_GENTYPE x, __CLC_GENTYPE y) {
+ if (x < 0.0f)
+ x = __CLC_GENTYPE_NAN;
+
+ __CLC_GENTYPE expylnx = fast_expylnx(x, y);
+ return powr_fixup(x, y, expylnx);
+}
+
+#elif defined(__CLC_COMPILING_POWN)
+
+_CLC_CONST
+static __CLC_GENTYPE compute_expylnx_int(__CLC_GENTYPE x, __CLC_INTN ny) {
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ __CLC_INTN nyh = ny & 0xffff0000;
+ __CLC_EP_PAIR y = __clc_ep_fast_add(__CLC_CONVERT_GENTYPE(nyh),
+ __CLC_CONVERT_GENTYPE(ny - nyh));
+ return __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+}
+
+_CLC_CONST
+static __CLC_GENTYPE pown_fixup(__CLC_GENTYPE x, __CLC_INTN ny,
+ __CLC_GENTYPE expylnx) {
+ bool is_odd_y = ny & 1;
+
+ __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0f);
+
+ // Now all the edge cases
+ if (__clc_isinf(x) || x == 0.0f)
+ ret = __clc_copysign((x == 0.0f) ^ (ny < 0) ? 0.0f : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0f);
+ return ret;
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
+ __CLC_INTN ny) {
+ if (ny == 0)
+ x = 1.0f;
+
+ __CLC_GENTYPE expylnx = compute_expylnx_int(x, ny);
+ return pown_fixup(x, ny, expylnx);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown_fast(__CLC_GENTYPE x,
+ __CLC_INTN ny) {
+ if (ny == 0)
+ x = 1.0f;
+
+ __CLC_GENTYPE expylnx = fast_expylnx(x, __CLC_CONVERT_GENTYPE(ny));
+ return pown_fixup(x, ny, expylnx);
+}
+
+#elif defined(__CLC_COMPILING_ROOTN)
+
+// rootn variant of compute_expylnx_int: uses the reciprocal of ny
+_CLC_CONST
+static __CLC_GENTYPE compute_exp_inverse_y_lnx_int(__CLC_GENTYPE x,
+ __CLC_INTN ny) {
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ __CLC_INTN nyh = ny & 0xffff0000;
+ __CLC_EP_PAIR y = __clc_ep_fast_add(__CLC_CONVERT_GENTYPE(nyh),
+ __CLC_CONVERT_GENTYPE(ny - nyh));
+ y = __clc_ep_recip(y);
+ return __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+}
+
+_CLC_CONST
+static __CLC_GENTYPE rootn_fixup(__CLC_GENTYPE x, __CLC_INTN ny,
+ __CLC_GENTYPE expylnx) {
+ bool is_odd_y = ny & 1;
+
+ __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0f);
+
+ // Now all the edge cases
+ if (__clc_isinf(x) || x == 0.0f)
+ ret = __clc_copysign((x == 0.0f) ^ (ny < 0) ? 0.0f : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0f);
+
+ if ((x < 0.0f && !is_odd_y) || ny == 0)
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+_CLC_CONST
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
+ __CLC_INTN ny) {
+ __CLC_GENTYPE expylnx = compute_exp_inverse_y_lnx_int(x, ny);
+ return rootn_fixup(x, ny, expylnx);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_rootn_fast(__CLC_GENTYPE x, __CLC_INTN ny) {
+ __CLC_GENTYPE y = __clc_recip_fast(__CLC_CONVERT_GENTYPE(ny));
+ __CLC_GENTYPE expylnx = fast_expylnx(x, y);
+ return rootn_fixup(x, ny, expylnx);
+}
+
+#else
+#error missing function macro
+#endif
+
+#elif __CLC_FPSIZE == 64
+
+#if defined(__CLC_COMPILING_POW)
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
+ __CLC_GENTYPE y) {
+ if (x == 1.0)
+ y = 1.0;
+ if (y == 0.0)
+ x = 1.0;
+
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ __CLC_GENTYPE expylnx =
+ __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+
+ bool is_odd_y = is_odd_integer(y);
+
+ __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0);
+
+ // Now all the edge cases
+ if (x < 0.0 && !is_integer(y))
+ ret = __CLC_GENTYPE_NAN;
+
+ __CLC_GENTYPE ay = __clc_fabs(y);
+ if (__clc_isinf(ay)) {
+ // FIXME: Missing backend optimization to save on
+ // materialization cost of mixed sign constant infinities.
+ bool y_is_neg_inf = y != ay;
+ ret = ax == 1.0 ? ax : ((ax < 1.0) ^ y_is_neg_inf ? 0.0 : ay);
+ }
+
+ if (__clc_isinf(ax) || x == 0.0)
+ ret = __clc_copysign((x == 0.0) ^ (y < 0.0) ? 0.0 : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0);
+
+ if (__clc_isunordered(x, y))
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+#elif defined(__CLC_COMPILING_POWR)
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
+ __CLC_GENTYPE y) {
+ if (x < 0.0)
+ x = __CLC_GENTYPE_NAN;
+
+ __CLC_GENTYPE ret = __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(x)));
+
+ // Now all the edge cases
+ __CLC_GENTYPE iz = y < 0.0 ? __CLC_GENTYPE_INF : 0.0;
+ __CLC_GENTYPE zi = y < 0.0 ? 0.0 : __CLC_GENTYPE_INF;
+
+ if (x == 0.0)
+ ret = y == 0.0 ? __CLC_GENTYPE_NAN : iz;
+
+ if (x == __CLC_GENTYPE_INF && y != 0.0)
+ ret = zi;
+
+ if (__clc_isinf(y) && x != 1.0)
+ ret = x < 1.0 ? iz : zi;
+
+ if (y == 0.0)
+ ret = x == 0.0 || __clc_isinf(x) ? __CLC_GENTYPE_NAN : 1.0;
+
+ if (__clc_isunordered(x, y))
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+#elif defined(__CLC_COMPILING_POWN)
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
+ __CLC_INTN ny) {
+ if (ny == 0)
+ x = 1.0;
+
+ __CLC_GENTYPE y = __CLC_CONVERT_GENTYPE(ny);
+
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ __CLC_GENTYPE expylnx =
+ __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+
+ bool is_odd_y = ny & 1;
+
+ __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0);
+
+ // Now all the edge cases
+ if (__clc_isinf(ax) || x == 0.0)
+ ret = __clc_copysign((x == 0.0) ^ (ny < 0) ? 0.0 : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0);
+
+ return ret;
+}
+
+#elif defined(__CLC_COMPILING_ROOTN)
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
+ __CLC_INTN ny) {
+ __CLC_EP_PAIR y = __clc_ep_recip(__CLC_CONVERT_GENTYPE(ny));
+
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ __CLC_GENTYPE expylnx =
+ __clc_ep_exp(__clc_ep_mul_overflow(y, __clc_ep_ln(ax)));
+
+ bool is_odd_y = ny & 1;
+
+ __CLC_GENTYPE ret = __clc_copysign(expylnx, is_odd_y ? x : 1.0);
+
+ // Now all the edge cases
+ if (__clc_isinf(ax) || x == 0.0)
+ ret = __clc_copysign((x == 0.0) ^ (ny < 0) ? 0.0 : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0);
+
+ if ((x < 0.0 && !is_odd_y) || ny == 0)
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+#else
+#error missing function macro
+#endif
+
+#elif __CLC_FPSIZE == 16
+
+#if defined(__CLC_COMPILING_POW) || defined(__CLC_COMPILING_POWR)
+
+_CLC_CONST
+static __CLC_GENTYPE compute_expylnx_f16(__CLC_GENTYPE ax, __CLC_GENTYPE y) {
+ __CLC_FLOATN x_float = __CLC_CONVERT_FLOATN(ax);
+ __CLC_FLOATN y_float = __CLC_CONVERT_FLOATN(y);
+ __CLC_FLOATN result = __clc_exp2_fast(y_float * __clc_log2_fast(x_float));
+ return __CLC_CONVERT_GENTYPE(result);
+}
+
+#endif // defined(__CLC_COMPILING_POW) || defined(__CLC_COMPILING_POWR)
+
+#if defined(__CLC_COMPILING_POW)
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pow(__CLC_GENTYPE x,
+ __CLC_GENTYPE y) {
+ if (x == 1.0h)
+ y = 1.0h;
+ if (y == 0.0h)
+ x = 1.0h;
+
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ __CLC_GENTYPE p = compute_expylnx_f16(ax, y);
+
+ bool is_odd_y = is_odd_integer(y);
+ __CLC_GENTYPE ret = __clc_copysign(p, is_odd_y ? x : 1.0h);
+
+ // Now all the edge cases
+ if (x < 0.0h && !is_integer(y))
+ ret = __CLC_GENTYPE_NAN;
+
+ __CLC_GENTYPE ay = __clc_fabs(y);
+ if (__clc_isinf(ay)) {
+ // FIXME: Missing backend optimization to save on
+ // materialization cost of mixed sign constant infinities.
+ bool y_is_neg_inf = y != ay;
+ ret = ax == 1.0h ? ax : ((ax < 1.0h) ^ y_is_neg_inf ? 0.0h : ay);
+ }
+
+ if (__clc_isinf(ax) || x == 0.0h) {
+ ret = __clc_copysign((x == 0.0h) ^ (y < 0.0h) ? 0.0h : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0h);
+ }
+
+ if (__clc_isunordered(x, y))
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+#elif defined(__CLC_COMPILING_POWR)
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
+ __CLC_GENTYPE y) {
+ if (x < 0.0h)
+ x = __CLC_GENTYPE_NAN;
+
+ __CLC_GENTYPE ret = compute_expylnx_f16(x, y);
+
+ // Now all the edge cases
+ __CLC_GENTYPE iz = y < 0.0h ? __CLC_GENTYPE_INF : 0.0h;
+ __CLC_GENTYPE zi = y < 0.0h ? 0.0h : __CLC_GENTYPE_INF;
+
+ if (x == 0.0h)
+ ret = y == 0.0h ? __CLC_GENTYPE_NAN : iz;
+
+ if (x == __CLC_GENTYPE_INF && y != 0.0h)
+ ret = zi;
+
+ if (__clc_isinf(y) && x != 1.0h)
+ ret = x < 1.0h ? iz : zi;
+
+ if (__clc_isunordered(x, y))
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+#elif defined(__CLC_COMPILING_POWN)
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
+ __CLC_INTN ny) {
+ if (ny == 0)
+ x = 1.0h;
+
+ __CLC_GENTYPE ax = __clc_fabs(x);
+ __CLC_FLOATN fy = __CLC_CONVERT_FLOATN(ny);
+ __CLC_FLOATN p =
+ __clc_exp2_fast(fy * __clc_log2_fast(__CLC_CONVERT_FLOATN(ax)));
+
+ bool is_odd_y = ny & 1;
+
+ __CLC_GENTYPE ret =
+ __clc_copysign(__CLC_CONVERT_GENTYPE(p), is_odd_y ? x : 1.0h);
+
+ // Now all the edge cases
+ if (__clc_isinf(ax) || x == 0.0h)
+ ret = __clc_copysign((x == 0.0h) ^ (ny < 0) ? 0.0h : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0h);
+
+ return ret;
+}
+
+#elif defined(__CLC_COMPILING_ROOTN)
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
+ __CLC_INTN ny) {
+ __CLC_GENTYPE ax = __clc_fabs(x);
+
+ __CLC_FLOATN fy = __clc_recip_fast(__CLC_CONVERT_FLOATN(ny));
+
+ __CLC_FLOATN p =
+ __clc_exp2_fast(fy * __clc_log2_fast(__CLC_CONVERT_FLOATN(ax)));
+
+ bool is_odd_y = ny & 1;
+
+ __CLC_GENTYPE ret =
+ __clc_copysign(__CLC_CONVERT_GENTYPE(p), is_odd_y ? x : 1.0h);
+
+ // Now all the edge cases
+ if (__clc_isinf(ax) || x == 0.0h)
+ ret = __clc_copysign((x == 0.0h) ^ (ny < 0) ? 0.0h : __CLC_GENTYPE_INF,
+ is_odd_y ? x : 0.0h);
+
+ if ((x < 0.0h && !is_odd_y) || ny == 0)
+ ret = __CLC_GENTYPE_NAN;
+
+ return ret;
+}
+
+#else
+#error missing function macro
+#endif
+
+#endif
+#endif // __CLC_SCALAR
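
Note: clc_pow_base.inc centralizes the sign and parity handling: the magnitude is computed as exp(y * ln|x|) and the sign is restored with copysign, negative only when x is negative and y is an odd integer; is_even_integer relies on an even integer staying integral after halving. A standalone plain-C rendering of those helpers and of the sign rule, checked against libm's pow; this is an illustration of the rule only, not the libclc code path:

#include <math.h>
#include <stdbool.h>
#include <stdio.h>

static bool is_integer(double ay) { return trunc(ay) == ay; }
/* Even integers are still integers after division by 2. */
static bool is_even_integer(double ay) { return is_integer(0.5 * ay); }
static bool is_odd_integer(double ay) {
  return is_integer(ay) && !is_even_integer(ay);
}

int main(void) {
  double x = -2.0, ys[] = {3.0, 4.0, 2.5};
  for (int i = 0; i < 3; ++i) {
    double y = ys[i];
    double mag = exp(y * log(fabs(x))); /* |x|^y */
    double r = !is_integer(y)
                   ? NAN /* negative base, non-integer power */
                   : copysign(mag, is_odd_integer(y) ? x : 1.0);
    printf("x=%g y=%g: sign-rule=%g libm=%g\n", x, y, r, pow(x, y));
  }
  return 0;
}
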
diff --git a/libclc/clc/lib/generic/math/clc_pown.cl b/libclc/clc/lib/generic/math/clc_pown.cl
index 5aa9560174b99..4fe22c306918f 100644
--- a/libclc/clc/lib/generic/math/clc_pown.cl
+++ b/libclc/clc/lib/generic/math/clc_pown.cl
@@ -6,16 +6,34 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/clc_convert.h>
-#include <clc/internal/clc.h>
-#include <clc/math/clc_fabs.h>
-#include <clc/math/clc_fma.h>
-#include <clc/math/clc_ldexp.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/clc_subnormal_config.h>
-#include <clc/math/math.h>
-#include <clc/math/tables.h>
-#include <clc/relational/clc_select.h>
+#include "clc/clc_convert.h"
+#include "clc/math/clc_copysign.h"
+#include "clc/math/clc_ep.h"
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+#include "clc/math/clc_fabs.h"
+#include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+#include "clc/math/clc_mad.h"
+#include "clc/math/clc_pown.h"
+#include "clc/math/clc_trunc.h"
+#include "clc/relational/clc_isinf.h"
-#define __CLC_BODY <clc_pown.inc>
-#include <clc/math/gentype.inc>
+#define __CLC_COMPILING_POWN
+#define __CLC_BODY "clc_pow_base.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
+#define __CLC_ARG2_SCALAR_TYPE int
+#define __CLC_FUNCTION __clc_pown
+#define __CLC_BODY "clc/shared/binary_def_scalarize_loop.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_ARG2_SCALAR_TYPE int
+#define __CLC_FUNCTION __clc_pown_fast
+#define __CLC_BODY "clc/shared/binary_def_scalarize_loop.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
diff --git a/libclc/clc/lib/generic/math/clc_pown.inc b/libclc/clc/lib/generic/math/clc_pown.inc
deleted file mode 100644
index 1a681b5e4b397..0000000000000
--- a/libclc/clc/lib/generic/math/clc_pown.inc
+++ /dev/null
@@ -1,402 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Computes pow using log and exp
-//
-// x^y = exp(y * log(x))
-//
-// We take care not to lose precision in the intermediate steps.
-//
-// When computing log, calculate it in splits:
-//
-// r = f * (p_invead + p_inv_tail)
-// r = rh + rt
-//
-// Calculate log polynomial using r, in end addition, do:
-//
-// poly = poly + ((rh-r) + rt)
-//
-// lth = -r
-// ltt = ((xexp * log2_t) - poly) + logT
-// lt = lth + ltt
-//
-// lh = (xexp * log2_h) + logH
-// l = lh + lt
-//
-// Calculate final log answer as gh and gt:
-//
-// gh = l & higher-half bits
-// gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
-//
-// yh = y & higher-half bits
-// yt = y - yh
-//
-// Before entering computation of exp:
-//
-// vs = ((yt*gt + yt*gh) + yh*gt)
-// v = vs + yh*gh
-// vt = ((yh*gh - v) + vs)
-//
-// In calculation of exp, add vt to r that is used for poly.
-//
-// At the end of exp, do:
-//
-// ((((expT * poly) + expT) + expH*poly) + expH)
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
- __CLC_INTN ny) {
- __CLC_GENTYPE y = __CLC_CONVERT_GENTYPE(ny);
-
- __CLC_GENTYPE absx = __clc_fabs(x);
- __CLC_INTN ix = __CLC_AS_INTN(x);
- __CLC_INTN ax = __CLC_AS_INTN(absx);
- __CLC_INTN xpos = ix == ax;
-
- __CLC_INTN iy = __CLC_AS_INTN(y);
- __CLC_INTN ay = __CLC_AS_INTN(__clc_fabs(y));
- __CLC_INTN ypos = iy == ay;
-
- // Extra precise log calculation
- // First handle case that x is close to 1
- __CLC_GENTYPE r = 1.0f - absx;
- __CLC_INTN near1 = __clc_fabs(r) < 0x1.0p-4f;
- __CLC_GENTYPE r2 = r * r;
-
- // Coefficients are just 1/3, 1/4, 1/5 and 1/6
- __CLC_GENTYPE poly = __clc_mad(
- r,
- __clc_mad(r,
- __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
- 0x1.99999ap-3f),
- 0x1.000000p-2f),
- 0x1.555556p-2f);
-
- poly *= r2 * r;
-
- __CLC_GENTYPE lth_near1 = -r2 * 0.5f;
- __CLC_GENTYPE ltt_near1 = -poly;
- __CLC_GENTYPE lt_near1 = lth_near1 + ltt_near1;
- __CLC_GENTYPE lh_near1 = -r;
- __CLC_GENTYPE l_near1 = lh_near1 + lt_near1;
-
- // Computations for x not near 1
- __CLC_INTN m = __CLC_CONVERT_INTN(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
- __CLC_GENTYPE mf = __CLC_CONVERT_GENTYPE(m);
- __CLC_INTN ixs = __CLC_AS_INTN(__CLC_AS_GENTYPE(ax | 0x3f800000) - 1.0f);
- __CLC_GENTYPE mfs = __CLC_CONVERT_GENTYPE((ixs >> EXPSHIFTBITS_SP32) - 253);
- __CLC_INTN c = m == -127;
- __CLC_INTN ixn = c ? ixs : ax;
- __CLC_GENTYPE mfn = c ? mfs : mf;
-
- __CLC_INTN indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
- // F - Y
- __CLC_GENTYPE f = __CLC_AS_GENTYPE(0x3f000000 | indx) -
- __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32));
-
- indx = indx >> 16;
- __CLC_GENTYPE rh = f * __CLC_USE_TABLE(log_inv_tbl_ep_head, indx);
- __CLC_GENTYPE rt = f * __CLC_USE_TABLE(log_inv_tbl_ep_tail, indx);
- r = rh + rt;
-
- poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
- (r * r);
- poly += (rh - r) + rt;
-
- const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
- const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
- __CLC_GENTYPE logel = __CLC_USE_TABLE(loge_tbl_lo, indx);
- __CLC_GENTYPE logeh = __CLC_USE_TABLE(loge_tbl_hi, indx);
- __CLC_GENTYPE lth = -r;
- __CLC_GENTYPE ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + logeh;
- __CLC_GENTYPE lt = lth + ltt;
- __CLC_GENTYPE lh = __clc_mad(mfn, LOG2_HEAD, logel);
- __CLC_GENTYPE l = lh + lt;
-
- // Select near 1 or not
- lth = near1 ? lth_near1 : lth;
- ltt = near1 ? ltt_near1 : ltt;
- lt = near1 ? lt_near1 : lt;
- lh = near1 ? lh_near1 : lh;
- l = near1 ? l_near1 : l;
-
- __CLC_GENTYPE gh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(l) & 0xfffff000);
- __CLC_GENTYPE gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
- __CLC_GENTYPE yh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(iy) & 0xfffff000);
-
- __CLC_GENTYPE yt = __CLC_CONVERT_GENTYPE(ny - __CLC_CONVERT_INTN(yh));
-
- __CLC_GENTYPE ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
- __CLC_GENTYPE ylogx = __clc_mad(yh, gh, ylogx_s);
- __CLC_GENTYPE ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
-
- // Extra precise exp of ylogx
- // 64/log2 : 92.332482616893657
- const __CLC_GENTYPE R_64_BY_LOG2 = 0x1.715476p+6f;
- __CLC_INTN n = __CLC_CONVERT_INTN(ylogx * R_64_BY_LOG2);
- __CLC_GENTYPE nf = __CLC_CONVERT_GENTYPE(n);
-
- __CLC_INTN j = n & 0x3f;
- m = n >> 6;
- __CLC_INTN m2 = m << EXPSHIFTBITS_SP32;
-
- // log2/64 lead: 0.0108032227
- const __CLC_GENTYPE R_LOG2_BY_64_LD = 0x1.620000p-7f;
- // log2/64 tail: 0.0000272020388
- const __CLC_GENTYPE R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
- r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
- ylogx_t;
-
- // Truncated Taylor series for e^r
- poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
- 0x1.000000p-1f),
- r * r, r);
-
- __CLC_GENTYPE exp_head = __CLC_USE_TABLE(exp_tbl_ep_head, j);
- __CLC_GENTYPE exp_tail = __CLC_USE_TABLE(exp_tbl_ep_tail, j);
-
- __CLC_GENTYPE expylogx =
- __clc_mad(exp_head, poly, __clc_mad(exp_tail, poly, exp_tail)) + exp_head;
- __CLC_GENTYPE sexpylogx =
- expylogx * __CLC_AS_GENTYPE((__CLC_INTN)0x1 << (m + 149));
- __CLC_GENTYPE texpylogx = __CLC_AS_GENTYPE(__CLC_AS_INTN(expylogx) + m2);
- expylogx = m < -125 ? sexpylogx : texpylogx;
-
- // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
- expylogx =
- __clc_select(expylogx, __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32),
- ylogx > 0x1.62e430p+6f ||
- (ylogx == 0x1.62e430p+6f && ylogx_t > -0x1.05c610p-22f));
-
- // Result is 0 if ylogx < -149*log2
- expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
- // Classify y:
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- __CLC_INTN inty = 2 - (ny & 1);
-
- __CLC_GENTYPE signval =
- __CLC_AS_GENTYPE((__CLC_AS_UINTN(expylogx) ^ SIGNBIT_SP32));
- expylogx = ((inty == 1) && !xpos) ? signval : expylogx;
- __CLC_INTN ret = __CLC_AS_INTN(expylogx);
-
- // Corner case handling
- __CLC_BIT_INTN x_is_ninf = ix == (__CLC_INTN)NINFBITPATT_SP32;
-
- __CLC_INTN xinf =
- xpos ? (__CLC_INTN)PINFBITPATT_SP32 : (__CLC_INTN)NINFBITPATT_SP32;
- ret = ((ax == 0) && !ypos && (inty == 1)) ? xinf : ret;
- ret = ((ax == 0) && !ypos && (inty == 2)) ? PINFBITPATT_SP32 : ret;
- ret = ((ax == 0) && ypos && (inty == 2)) ? 0 : ret;
- __CLC_INTN xzero = !xpos ? (__CLC_INTN)0x80000000 : (__CLC_INTN)0;
- ret = ((ax == 0) && ypos && (inty == 1)) ? xzero : ret;
- ret = (x_is_ninf && !ypos && (inty == 1)) ? (__CLC_INTN)0x80000000 : ret;
- ret = (x_is_ninf && !ypos && (inty != 1)) ? 0 : ret;
- ret = (x_is_ninf && ypos && (inty == 1)) ? (__CLC_INTN)NINFBITPATT_SP32 : ret;
- ret = (x_is_ninf && ypos && (inty != 1)) ? (__CLC_INTN)PINFBITPATT_SP32 : ret;
- ret = ((ix == PINFBITPATT_SP32) && !ypos) ? 0 : ret;
- ret = ((ix == PINFBITPATT_SP32) && ypos) ? (__CLC_INTN)PINFBITPATT_SP32 : ret;
- ret = ax > PINFBITPATT_SP32 ? ix : ret;
- ret = ny == 0 ? 0x3f800000 : ret;
-
- return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 64
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x,
- __CLC_INTN ny) {
- const __CLC_GENTYPE real_log2_tail = 5.76999904754328540596e-08;
- const __CLC_GENTYPE real_log2_lead = 6.93147122859954833984e-01;
-
- __CLC_GENTYPE y = __CLC_CONVERT_GENTYPE(ny);
-
- __CLC_LONGN ux = __CLC_AS_LONGN(x);
- __CLC_LONGN ax = __CLC_AS_LONGN(__clc_fabs(x));
- __CLC_BIT_INTN xpos = ax == ux;
-
- __CLC_LONGN uy = __CLC_AS_LONGN(y);
- __CLC_LONGN ay = __CLC_AS_LONGN(__clc_fabs(y));
- __CLC_BIT_INTN ypos = ay == uy;
-
- // Extended precision log
- __CLC_GENTYPE v, vt;
- {
- __CLC_INTN exp = __CLC_CONVERT_INTN(ax >> 52) - 1023;
- __CLC_INTN mask_exp_1023 = exp == -1023;
- __CLC_GENTYPE xexp = __CLC_CONVERT_GENTYPE(exp);
- __CLC_LONGN mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
- __CLC_LONGN temp_ux =
- __CLC_AS_LONGN(__CLC_AS_GENTYPE(0x3ff0000000000000L | mantissa) - 1.0);
- exp = __CLC_CONVERT_INTN((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
- __CLC_GENTYPE xexp1 = __CLC_CONVERT_GENTYPE(exp);
- __CLC_LONGN mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
- xexp = __CLC_CONVERT_LONGN(mask_exp_1023) ? xexp1 : xexp;
- mantissa = __CLC_CONVERT_LONGN(mask_exp_1023) ? mantissa1 : mantissa;
-
- __CLC_LONGN rax = (mantissa & 0x000ff00000000000) +
- ((mantissa & 0x0000080000000000) << 1);
- __CLC_INTN index = __CLC_CONVERT_INTN(rax >> 44);
-
- __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L);
- __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L);
- __CLC_GENTYPE f = F - Y;
- __CLC_GENTYPE log_h = __CLC_USE_TABLE(log_f_inv_tbl_head, index);
- __CLC_GENTYPE log_t = __CLC_USE_TABLE(log_f_inv_tbl_tail, index);
- __CLC_GENTYPE f_inv = (log_h + log_t) * f;
- __CLC_GENTYPE r1 =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L);
- __CLC_GENTYPE r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
- __CLC_GENTYPE r = r1 + r2;
-
- __CLC_GENTYPE poly = __clc_fma(
- r,
- __clc_fma(r,
- __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
- 1.0 / 4.0),
- 1.0 / 3.0);
- poly = poly * r * r * r;
-
- __CLC_GENTYPE hr1r1 = 0.5 * r1 * r1;
- __CLC_GENTYPE poly0h = r1 + hr1r1;
- __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1;
- poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;
-
- log_h = __CLC_USE_TABLE(powlog_tbl_head, index);
- log_t = __CLC_USE_TABLE(powlog_tbl_tail, index);
-
- __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
- __CLC_GENTYPE resT = resT_t - poly0h;
- __CLC_GENTYPE resH = __clc_fma(xexp, real_log2_lead, log_h);
- __CLC_GENTYPE resT_h = poly0h;
-
- __CLC_GENTYPE H = resT + resH;
- __CLC_GENTYPE H_h =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(H) & 0xfffffffff8000000L);
- __CLC_GENTYPE T =
- (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
- H = H_h;
-
- __CLC_GENTYPE y_head =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(uy) & 0xfffffffff8000000L);
- __CLC_GENTYPE y_tail = y - y_head;
-
- __CLC_BIT_INTN mask_2_24 = ay > 0x4170000000000000; // 2^24
- __CLC_INTN nyh = __CLC_CONVERT_INTN(y_head);
- __CLC_INTN nyt = ny - nyh;
- __CLC_GENTYPE y_tail1 = __CLC_CONVERT_GENTYPE(nyt);
- y_tail = mask_2_24 ? y_tail1 : y_tail;
-
- __CLC_GENTYPE temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
- v = __clc_fma(y_head, H, temp);
- vt = __clc_fma(y_head, H, -v) + temp;
- }
-
- // Now calculate exp of (v,vt)
-
- __CLC_GENTYPE expv;
- {
- const __CLC_GENTYPE max_exp_arg = 709.782712893384;
- const __CLC_GENTYPE min_exp_arg = -745.1332191019411;
- const __CLC_GENTYPE sixtyfour_by_lnof2 = 92.33248261689366;
- const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081;
- const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10;
-
- // If v is so large that we need to return INFINITY, or so small that we
- // need to return 0, set v to known values that will produce that result. Do
- // not try to continue the computation with the original v and patch it up
- // afterwards because v may be so large that temp is out of range of int, in
- // which case that conversion, and a value based on that conversion being
- // passed to __clc_ldexp, results in undefined behavior.
- v = v > max_exp_arg ? 1000.0 : v;
- v = v < min_exp_arg ? -1000.0 : v;
-
- __CLC_GENTYPE temp = v * sixtyfour_by_lnof2;
- __CLC_INTN n = __CLC_CONVERT_INTN(temp);
- __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n);
- __CLC_INTN j = n & 0x0000003f;
- __CLC_INTN m = n >> 6;
-
- __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j);
- __CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j);
- __CLC_GENTYPE f = f1 + f2;
-
- __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v);
- __CLC_GENTYPE r2 = dn * lnof2_by_64_tail;
- __CLC_GENTYPE r = (r1 + r2) + vt;
-
- __CLC_GENTYPE q =
- __clc_fma(r,
- __clc_fma(r,
- __clc_fma(r,
- __clc_fma(r, 1.38889490863777199667e-03,
- 8.33336798434219616221e-03),
- 4.16666666662260795726e-02),
- 1.66666666665260878863e-01),
- 5.00000000000000008883e-01);
- q = __clc_fma(r * r, q, r);
-
- expv = __clc_fma(f, q, f2) + f1;
- expv = __clc_ldexp(expv, m);
- }
-
- // See whether y is an integer.
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- __CLC_LONGN inty = __CLC_CONVERT_LONGN(2 - (ny & 1));
-
- expv *= ((inty == 1) && !xpos) ? -1.0 : 1.0;
-
- __CLC_LONGN ret = __CLC_AS_LONGN(expv);
-
- // Now all the edge cases
- __CLC_BIT_INTN x_is_ninf = ux == (__CLC_LONGN)NINFBITPATT_DP64;
- __CLC_BIT_INTN x_is_pinf = ux == (__CLC_LONGN)PINFBITPATT_DP64;
- __CLC_LONGN xinf =
- xpos ? (__CLC_LONGN)PINFBITPATT_DP64 : (__CLC_LONGN)NINFBITPATT_DP64;
-
- ret = ((ax == 0L) && !ypos && (inty == 1)) ? xinf : ret;
- ret = ((ax == 0L) && !ypos && (inty == 2)) ? (__CLC_LONGN)PINFBITPATT_DP64
- : ret;
- ret = ((ax == 0L) && ypos && (inty == 2)) ? 0L : ret;
- __CLC_LONGN xzero = !xpos ? (__CLC_LONGN)0x8000000000000000L : 0L;
- ret = ((ax == 0L) && ypos && (inty == 1)) ? xzero : ret;
- ret = (x_is_ninf && !ypos && (inty == 1)) ? (__CLC_LONGN)0x8000000000000000L
- : ret;
- ret = (x_is_ninf && !ypos && (inty != 1)) ? 0L : ret;
- ret =
- (x_is_ninf && ypos && (inty == 1)) ? (__CLC_LONGN)NINFBITPATT_DP64 : ret;
- ret =
- (x_is_ninf && ypos && (inty != 1)) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret;
- ret = (x_is_pinf && !ypos) ? 0L : ret;
- ret = (x_is_pinf && ypos) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret;
- ret = ax > (__CLC_LONGN)PINFBITPATT_DP64 ? ux : ret;
- ret = __CLC_CONVERT_LONGN(ny == 0) ? (__CLC_LONGN)0x3ff0000000000000L : ret;
-
- return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_pown(__CLC_GENTYPE x, __CLC_INTN y) {
- return __CLC_CONVERT_GENTYPE(__clc_pown(__CLC_CONVERT_FLOATN(x), y));
-}
-
-#endif
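
[Aside, not part of the patch] The removed clc_pown.inc (like clc_powr.inc and clc_rootn.inc below) follows the scheme spelled out in its header comment: compute x^y as exp(y * log(x)) while carrying log(x) and y as head/tail pairs so the product keeps more than working precision. As a rough illustration only, the split can be sketched in host-side C; split_head and pow_via_exp_log are made-up names, and the tails here are far cruder than the table-driven ones the deleted code used:

/*
 * Minimal sketch of the head/tail idea: represent log(x) as gh + gt and
 * y as yh + yt, where the "head" keeps only the upper mantissa bits, so
 * the partial products yh*gh, yh*gt, yt*gh, yt*gt can be accumulated
 * without immediately losing the low-order bits of y*log(x).
 */
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static float split_head(float v) {
  uint32_t bits;
  memcpy(&bits, &v, sizeof bits);
  bits &= 0xfffff000u;            /* keep only the top mantissa bits */
  memcpy(&v, &bits, sizeof bits);
  return v;
}

static float pow_via_exp_log(float x, float y) {
  float l = logf(x);
  float gh = split_head(l), gt = l - gh;   /* log(x) = gh + gt (gt exact) */
  float yh = split_head(y), yt = y - yh;   /* y      = yh + yt (yt exact) */
  /* accumulate the small cross terms first, then the dominant yh*gh term */
  float vs = yt * gt + yt * gh + yh * gt;
  float v = vs + yh * gh;
  return expf(v);
}

int main(void) {
  printf("%g vs %g\n", pow_via_exp_log(1.7f, 9.0f), powf(1.7f, 9.0f));
  return 0;
}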
diff --git a/libclc/clc/lib/generic/math/clc_powr.cl b/libclc/clc/lib/generic/math/clc_powr.cl
index 0556ec97d6f3c..fc153dfebf9a0 100644
--- a/libclc/clc/lib/generic/math/clc_powr.cl
+++ b/libclc/clc/lib/generic/math/clc_powr.cl
@@ -6,16 +6,36 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/clc_convert.h>
-#include <clc/internal/clc.h>
-#include <clc/math/clc_fabs.h>
-#include <clc/math/clc_fma.h>
-#include <clc/math/clc_ldexp.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/clc_subnormal_config.h>
-#include <clc/math/math.h>
-#include <clc/math/tables.h>
-#include <clc/relational/clc_select.h>
+#include "clc/clc_convert.h"
+#include "clc/float/definitions.h"
+#include "clc/internal/clc.h"
+#include "clc/math/clc_copysign.h"
+#include "clc/math/clc_ep.h"
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+#include "clc/math/clc_fabs.h"
+#include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_log.h"
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+#include "clc/math/clc_mad.h"
+#include "clc/math/clc_recip_fast.h"
+#include "clc/math/clc_trunc.h"
+#include "clc/math/math.h"
+#include "clc/relational/clc_isinf.h"
+#include "clc/relational/clc_isunordered.h"
-#define __CLC_BODY <clc_powr.inc>
-#include <clc/math/gentype.inc>
+#define __CLC_COMPILING_POWR
+#define __CLC_BODY <clc_pow_base.inc>
+#include "clc/math/gentype.inc"
+
+#define __CLC_FUNCTION __clc_powr
+#define __CLC_BODY "clc/shared/binary_def_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __clc_powr_fast
+#define __CLC_BODY "clc/shared/binary_def_scalarize.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
diff --git a/libclc/clc/lib/generic/math/clc_powr.inc b/libclc/clc/lib/generic/math/clc_powr.inc
deleted file mode 100644
index b94dbfdcbdeb7..0000000000000
--- a/libclc/clc/lib/generic/math/clc_powr.inc
+++ /dev/null
@@ -1,414 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Computes pow using log and exp
-//
-// x^y = exp(y * log(x))
-//
-// We take care not to lose precision in the intermediate steps
-//
-// When computing log, calculate it in splits:
-//
-// r = f * (p_invead + p_inv_tail)
-// r = rh + rt
-//
-// Calculate log polynomial using r, in end addition, do:
-//
-// poly = poly + ((rh-r) + rt)
-//
-// lth = -r
-// ltt = ((xexp * log2_t) - poly) + logT
-// lt = lth + ltt
-//
-// lh = (xexp * log2_h) + logH
-// l = lh + lt
-//
-// Calculate final log answer as gh and gt:
-//
-// gh = l & higher-half bits
-// gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
-//
-// yh = y & higher-half bits
-// yt = y - yh
-//
-// Before entering computation of exp:
-//
-// vs = ((yt*gt + yt*gh) + yh*gt)
-// v = vs + yh*gh
-// vt = ((yh*gh - v) + vs)
-//
-// In calculation of exp, add vt to r that is used for poly.
-//
-// At the end of exp, do
-//
-// ((((expT * poly) + expT) + expH*poly) + expH)
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
- __CLC_GENTYPE y) {
- __CLC_GENTYPE absx = __clc_fabs(x);
- __CLC_INTN ix = __CLC_AS_INTN(x);
- __CLC_INTN ax = __CLC_AS_INTN(absx);
- __CLC_INTN xpos = ix == ax;
-
- __CLC_INTN iy = __CLC_AS_INTN(y);
- __CLC_INTN ay = __CLC_AS_INTN(__clc_fabs(y));
- __CLC_INTN ypos = iy == ay;
-
- // Extra precise log calculation
- // First handle case that x is close to 1
- __CLC_GENTYPE r = 1.0f - absx;
- __CLC_INTN near1 = __clc_fabs(r) < 0x1.0p-4f;
- __CLC_GENTYPE r2 = r * r;
-
- // Coefficients are just 1/3, 1/4, 1/5 and 1/6
- __CLC_GENTYPE poly = __clc_mad(
- r,
- __clc_mad(r,
- __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
- 0x1.99999ap-3f),
- 0x1.000000p-2f),
- 0x1.555556p-2f);
-
- poly *= r2 * r;
-
- __CLC_GENTYPE lth_near1 = -r2 * 0.5f;
- __CLC_GENTYPE ltt_near1 = -poly;
- __CLC_GENTYPE lt_near1 = lth_near1 + ltt_near1;
- __CLC_GENTYPE lh_near1 = -r;
- __CLC_GENTYPE l_near1 = lh_near1 + lt_near1;
-
- // Computations for x not near 1
- __CLC_INTN m = __CLC_CONVERT_INTN(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
- __CLC_GENTYPE mf = __CLC_CONVERT_GENTYPE(m);
- __CLC_INTN ixs = __CLC_AS_INTN(__CLC_AS_GENTYPE(ax | 0x3f800000) - 1.0f);
- __CLC_GENTYPE mfs = __CLC_CONVERT_GENTYPE((ixs >> EXPSHIFTBITS_SP32) - 253);
- __CLC_INTN c = m == -127;
- __CLC_INTN ixn = c ? ixs : ax;
- __CLC_GENTYPE mfn = c ? mfs : mf;
-
- __CLC_INTN indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
- // F - Y
- __CLC_GENTYPE f = __CLC_AS_GENTYPE(0x3f000000 | indx) -
- __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32));
-
- indx = indx >> 16;
- __CLC_GENTYPE rh = f * __CLC_USE_TABLE(log_inv_tbl_ep_head, indx);
- __CLC_GENTYPE rt = f * __CLC_USE_TABLE(log_inv_tbl_ep_tail, indx);
- r = rh + rt;
-
- poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
- (r * r);
- poly += (rh - r) + rt;
-
- const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
- const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
- __CLC_GENTYPE logel = __CLC_USE_TABLE(loge_tbl_lo, indx);
- __CLC_GENTYPE logeh = __CLC_USE_TABLE(loge_tbl_hi, indx);
- __CLC_GENTYPE lth = -r;
- __CLC_GENTYPE ltt = __clc_mad(mfn, LOG2_TAIL, -poly) + logeh;
- __CLC_GENTYPE lt = lth + ltt;
- __CLC_GENTYPE lh = __clc_mad(mfn, LOG2_HEAD, logel);
- __CLC_GENTYPE l = lh + lt;
-
- // Select near 1 or not
- lth = near1 ? lth_near1 : lth;
- ltt = near1 ? ltt_near1 : ltt;
- lt = near1 ? lt_near1 : lt;
- lh = near1 ? lh_near1 : lh;
- l = near1 ? l_near1 : l;
-
- __CLC_GENTYPE gh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(l) & 0xfffff000);
- __CLC_GENTYPE gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
- __CLC_GENTYPE yh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(iy) & 0xfffff000);
-
- __CLC_GENTYPE yt = y - yh;
-
- __CLC_GENTYPE ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
- __CLC_GENTYPE ylogx = __clc_mad(yh, gh, ylogx_s);
- __CLC_GENTYPE ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
-
- // Extra precise exp of ylogx
- // 64/log2 : 92.332482616893657
- const __CLC_GENTYPE R_64_BY_LOG2 = 0x1.715476p+6f;
- __CLC_INTN n = __CLC_CONVERT_INTN(ylogx * R_64_BY_LOG2);
- __CLC_GENTYPE nf = __CLC_CONVERT_GENTYPE(n);
-
- __CLC_INTN j = n & 0x3f;
- m = n >> 6;
- __CLC_INTN m2 = m << EXPSHIFTBITS_SP32;
- // log2/64 lead: 0.0108032227
- const __CLC_GENTYPE R_LOG2_BY_64_LD = 0x1.620000p-7f;
- // log2/64 tail: 0.0000272020388
- const __CLC_GENTYPE R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
- r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
- ylogx_t;
-
- // Truncated Taylor series for e^r
- poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
- 0x1.000000p-1f),
- r * r, r);
-
- __CLC_GENTYPE exp_head = __CLC_USE_TABLE(exp_tbl_ep_head, j);
- __CLC_GENTYPE exp_tail = __CLC_USE_TABLE(exp_tbl_ep_tail, j);
-
- __CLC_GENTYPE expylogx =
- __clc_mad(exp_head, poly, __clc_mad(exp_tail, poly, exp_tail)) + exp_head;
- __CLC_GENTYPE sexpylogx =
- expylogx * __CLC_AS_GENTYPE((__CLC_INTN)0x1 << (m + 149));
- __CLC_GENTYPE texpylogx = __CLC_AS_GENTYPE(__CLC_AS_INTN(expylogx) + m2);
- expylogx = m < -125 ? sexpylogx : texpylogx;
-
- // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
- expylogx =
- __clc_select(expylogx, __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32),
- (ylogx > 0x1.62e430p+6f) ||
- (ylogx == 0x1.62e430p+6f && ylogx_t > -0x1.05c610p-22f));
-
- // Result is 0 if ylogx < -149*log2
- expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
- // Classify y:
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- __CLC_INTN yexp = (__CLC_INTN)(ay >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32 + 1;
- __CLC_INTN mask = ((__CLC_INTN)1 << (24 - yexp)) - 1;
- __CLC_INTN yodd = ((iy >> (24 - yexp)) & 0x1) != 0;
- __CLC_INTN inty = yodd ? 1 : 2;
- inty = (iy & mask) != 0 ? 0 : inty;
- inty = yexp < 1 ? 0 : inty;
- inty = yexp > 24 ? 2 : inty;
-
- __CLC_GENTYPE signval =
- __CLC_AS_GENTYPE((__CLC_AS_UINTN(expylogx) ^ SIGNBIT_SP32));
- expylogx = ((inty == 1) && !xpos) ? signval : expylogx;
- __CLC_INTN ret = __CLC_AS_INTN(expylogx);
-
- // Corner case handling
- __CLC_BIT_INTN y_is_ninf = iy == (__CLC_INTN)NINFBITPATT_SP32;
- __CLC_BIT_INTN y_is_pinf = iy == (__CLC_INTN)PINFBITPATT_SP32;
- __CLC_BIT_INTN x_is_inf = ax == (__CLC_INTN)PINFBITPATT_SP32;
-
- ret = ax < 0x3f800000 && y_is_ninf ? PINFBITPATT_SP32 : ret;
- ret = ax < 0x3f800000 && y_is_pinf ? 0 : ret;
- ret = ax == 0x3f800000 && ay < PINFBITPATT_SP32 ? 0x3f800000 : ret;
- ret = ax == 0x3f800000 && ay == PINFBITPATT_SP32 ? QNANBITPATT_SP32 : ret;
- ret = ax > 0x3f800000 && y_is_ninf ? 0 : ret;
- ret = ax > 0x3f800000 && y_is_pinf ? PINFBITPATT_SP32 : ret;
- ret = ((ix < PINFBITPATT_SP32) && (ay == 0)) ? 0x3f800000 : ret;
- ret = (x_is_inf && !ypos) ? 0 : ret;
- ret = (x_is_inf && ypos) ? PINFBITPATT_SP32 : ret;
- ret = (x_is_inf && y_is_pinf) ? PINFBITPATT_SP32 : ret;
- ret = (x_is_inf && (ay == 0)) ? QNANBITPATT_SP32 : ret;
- ret = ((ax == 0) && !ypos) ? PINFBITPATT_SP32 : ret;
- ret = ((ax == 0) && ypos) ? 0 : ret;
- ret = ((ax == 0) && (ay == 0)) ? QNANBITPATT_SP32 : ret;
- ret = ((ax != 0) && !xpos) ? QNANBITPATT_SP32 : ret;
- ret = ax > PINFBITPATT_SP32 ? ix : ret;
- ret = ay > PINFBITPATT_SP32 ? iy : ret;
-
- return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 64
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
- __CLC_GENTYPE y) {
- const __CLC_GENTYPE real_log2_tail = 5.76999904754328540596e-08;
- const __CLC_GENTYPE real_log2_lead = 6.93147122859954833984e-01;
-
- __CLC_LONGN ux = __CLC_AS_LONGN(x);
- __CLC_LONGN ax = __CLC_AS_LONGN(__clc_fabs(x));
- __CLC_BIT_INTN xpos = ax == ux;
-
- __CLC_LONGN uy = __CLC_AS_LONGN(y);
- __CLC_LONGN ay = __CLC_AS_LONGN(__clc_fabs(y));
- __CLC_BIT_INTN ypos = ay == uy;
-
- // Extended precision log
- __CLC_GENTYPE v, vt;
- {
- __CLC_INTN exp = __CLC_CONVERT_INTN(ax >> 52) - 1023;
- __CLC_INTN mask_exp_1023 = exp == (__CLC_INTN)-1023;
- __CLC_GENTYPE xexp = __CLC_CONVERT_GENTYPE(exp);
- __CLC_LONGN mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
- __CLC_LONGN temp_ux =
- __CLC_AS_LONGN(__CLC_AS_GENTYPE(0x3ff0000000000000L | mantissa) - 1.0);
- exp = __CLC_CONVERT_INTN((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
- __CLC_GENTYPE xexp1 = __CLC_CONVERT_GENTYPE(exp);
- __CLC_LONGN mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
- xexp = __CLC_CONVERT_LONGN(mask_exp_1023) ? xexp1 : xexp;
- mantissa = __CLC_CONVERT_LONGN(mask_exp_1023) ? mantissa1 : mantissa;
-
- __CLC_LONGN rax = (mantissa & 0x000ff00000000000) +
- ((mantissa & 0x0000080000000000) << 1);
- __CLC_INTN index = __CLC_CONVERT_INTN(rax >> 44);
-
- __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L);
- __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L);
- __CLC_GENTYPE f = F - Y;
- __CLC_GENTYPE log_h = __CLC_USE_TABLE(log_f_inv_tbl_head, index);
- __CLC_GENTYPE log_t = __CLC_USE_TABLE(log_f_inv_tbl_tail, index);
- __CLC_GENTYPE f_inv = (log_h + log_t) * f;
- __CLC_GENTYPE r1 =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L);
- __CLC_GENTYPE r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
- __CLC_GENTYPE r = r1 + r2;
-
- __CLC_GENTYPE poly = __clc_fma(
- r,
- __clc_fma(r,
- __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
- 1.0 / 4.0),
- 1.0 / 3.0);
- poly = poly * r * r * r;
-
- __CLC_GENTYPE hr1r1 = 0.5 * r1 * r1;
- __CLC_GENTYPE poly0h = r1 + hr1r1;
- __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1;
- poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;
-
- log_h = __CLC_USE_TABLE(powlog_tbl_head, index);
- log_t = __CLC_USE_TABLE(powlog_tbl_tail, index);
-
- __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
- __CLC_GENTYPE resT = resT_t - poly0h;
- __CLC_GENTYPE resH = __clc_fma(xexp, real_log2_lead, log_h);
- __CLC_GENTYPE resT_h = poly0h;
-
- __CLC_GENTYPE H = resT + resH;
- __CLC_GENTYPE H_h =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(H) & 0xfffffffff8000000L);
- __CLC_GENTYPE T =
- (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
- H = H_h;
-
- __CLC_GENTYPE y_head =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(uy) & 0xfffffffff8000000L);
- __CLC_GENTYPE y_tail = y - y_head;
-
- __CLC_GENTYPE temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
- v = __clc_fma(y_head, H, temp);
- vt = __clc_fma(y_head, H, -v) + temp;
- }
-
- // Now calculate exp of (v,vt)
-
- __CLC_GENTYPE expv;
- {
- const __CLC_GENTYPE max_exp_arg = 709.782712893384;
- const __CLC_GENTYPE min_exp_arg = -745.1332191019411;
- const __CLC_GENTYPE sixtyfour_by_lnof2 = 92.33248261689366;
- const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081;
- const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10;
-
- // If v is so large that we need to return INFINITY, or so small that we
- // need to return 0, set v to known values that will produce that result. Do
- // not try to continue the computation with the original v and patch it up
- // afterwards because v may be so large that temp is out of range of int, in
- // which case that conversion, and a value based on that conversion being
- // passed to __clc_ldexp, results in undefined behavior.
- v = v > max_exp_arg ? 1000.0 : v;
- v = v < min_exp_arg ? -1000.0 : v;
-
- __CLC_GENTYPE temp = v * sixtyfour_by_lnof2;
- __CLC_INTN n = __CLC_CONVERT_INTN(temp);
- __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n);
- __CLC_INTN j = n & 0x0000003f;
- __CLC_INTN m = n >> 6;
-
- __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j);
- __CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j);
- __CLC_GENTYPE f = f1 + f2;
-
- __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v);
- __CLC_GENTYPE r2 = dn * lnof2_by_64_tail;
- __CLC_GENTYPE r = (r1 + r2) + vt;
-
- __CLC_GENTYPE q =
- __clc_fma(r,
- __clc_fma(r,
- __clc_fma(r,
- __clc_fma(r, 1.38889490863777199667e-03,
- 8.33336798434219616221e-03),
- 4.16666666662260795726e-02),
- 1.66666666665260878863e-01),
- 5.00000000000000008883e-01);
- q = __clc_fma(r * r, q, r);
-
- expv = __clc_fma(f, q, f2) + f1;
- expv = __clc_ldexp(expv, m);
- }
-
- // See whether y is an integer.
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- __CLC_LONGN inty;
- {
- __CLC_INTN yexp =
- __CLC_CONVERT_INTN(ay >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 + 1;
- inty = __CLC_CONVERT_LONGN(yexp < 1 ? 0 : 2);
- inty = __CLC_CONVERT_LONGN(yexp > 53) ? 2 : inty;
- __CLC_LONGN mask = ((__CLC_LONGN)1L << (53 - yexp)) - 1L;
- __CLC_LONGN inty1 = (((ay & ~mask) >> (53 - yexp)) & 1L) == 1L ? 1L : 2L;
- inty1 = (ay & mask) != 0 ? 0 : inty1;
- inty = __CLC_CONVERT_LONGN(!(yexp < 1) && !(yexp > 53)) ? inty1 : inty;
- }
-
- expv *= ((inty == 1) && !xpos) ? -1.0 : 1.0;
-
- __CLC_LONGN ret = __CLC_AS_LONGN(expv);
-
- // Now all the edge cases
- __CLC_BIT_INTN y_is_ninf = uy == (__CLC_LONGN)NINFBITPATT_DP64;
- __CLC_BIT_INTN y_is_pinf = uy == (__CLC_LONGN)PINFBITPATT_DP64;
- __CLC_BIT_INTN x_is_inf = ax == (__CLC_LONGN)PINFBITPATT_DP64;
-
- ret = ax < 0x3ff0000000000000L && y_is_ninf ? PINFBITPATT_DP64 : ret;
- ret = ax < 0x3ff0000000000000L && y_is_pinf ? 0L : ret;
- ret = ax == 0x3ff0000000000000L && ay < PINFBITPATT_DP64 ? 0x3ff0000000000000L
- : ret;
- ret = ax == 0x3ff0000000000000L && ay == PINFBITPATT_DP64 ? QNANBITPATT_DP64
- : ret;
- ret = ax > 0x3ff0000000000000L && y_is_ninf ? 0L : ret;
- ret = ax > 0x3ff0000000000000L && y_is_pinf ? PINFBITPATT_DP64 : ret;
- ret = ux < PINFBITPATT_DP64 && ay == 0L ? 0x3ff0000000000000L : ret;
- ret = (x_is_inf && !ypos) ? 0L : ret;
- ret = (x_is_inf && ypos) ? PINFBITPATT_DP64 : ret;
- ret = (x_is_inf && y_is_pinf) ? PINFBITPATT_DP64 : ret;
- ret = ((ax == PINFBITPATT_DP64) && (ay == 0L)) ? QNANBITPATT_DP64 : ret;
- ret = ((ax == 0L) && !ypos) ? PINFBITPATT_DP64 : ret;
- ret = ((ax == 0L) && ypos) ? 0L : ret;
- ret = ((ax == 0L) && (ay == 0L)) ? QNANBITPATT_DP64 : ret;
- ret = ((ax != 0L) && !xpos) ? QNANBITPATT_DP64 : ret;
- ret = ax > PINFBITPATT_DP64 ? ux : ret;
- ret = ay > PINFBITPATT_DP64 ? uy : ret;
-
- return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_powr(__CLC_GENTYPE x,
- __CLC_GENTYPE y) {
- return __CLC_CONVERT_GENTYPE(
- __clc_powr(__CLC_CONVERT_FLOATN(x), __CLC_CONVERT_FLOATN(y)));
-}
-
-#endif
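
[Aside, not part of the patch] The deleted powr body classifies y as a non-integer, odd integer, or even integer straight from its IEEE-754 bits (the yexp / mask / yodd block above), since the sign of the result for negative x depends on that classification. A host-side C sketch of the same test; classify_int is a made-up name and the CLC version operates on vectors with precomputed ax/ay:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 0 = not an integer, 1 = odd integer, 2 = even integer */
static int classify_int(float y) {
  uint32_t iy;
  memcpy(&iy, &y, sizeof iy);
  uint32_t ay = iy & 0x7fffffffu;
  int yexp = (int)(ay >> 23) - 127 + 1;      /* number of integer bits */
  if (yexp < 1)
    return 0;                                /* 0 < |y| < 1: not an integer */
  if (yexp > 24)
    return 2;                                /* no fraction bits left, always even */
  uint32_t mask = (1u << (24 - yexp)) - 1u;  /* fractional mantissa bits */
  if (iy & mask)
    return 0;                                /* fractional part present */
  return ((iy >> (24 - yexp)) & 1u) ? 1 : 2; /* parity of the lowest integer bit */
}

int main(void) {
  printf("%d %d %d %d\n",
         classify_int(2.5f),   /* 0: fractional part present */
         classify_int(3.0f),   /* 1: odd integer */
         classify_int(8.0f),   /* 2: even integer */
         classify_int(0.25f)); /* 0: |y| < 1 */
  return 0;
}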
diff --git a/libclc/clc/lib/generic/math/clc_rootn.cl b/libclc/clc/lib/generic/math/clc_rootn.cl
index da397cf66da62..8dac54f239cd5 100644
--- a/libclc/clc/lib/generic/math/clc_rootn.cl
+++ b/libclc/clc/lib/generic/math/clc_rootn.cl
@@ -6,16 +6,35 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/clc_convert.h>
-#include <clc/float/definitions.h>
-#include <clc/internal/clc.h>
-#include <clc/math/clc_fabs.h>
-#include <clc/math/clc_fma.h>
-#include <clc/math/clc_ldexp.h>
-#include <clc/math/clc_mad.h>
-#include <clc/math/clc_subnormal_config.h>
-#include <clc/math/math.h>
-#include <clc/math/tables.h>
+#include "clc/clc_convert.h"
+#include "clc/float/definitions.h"
+#include "clc/math/clc_copysign.h"
+#include "clc/math/clc_ep.h"
+#include "clc/math/clc_exp2.h"
+#include "clc/math/clc_exp2_fast.h"
+#include "clc/math/clc_fabs.h"
+#include "clc/math/clc_ldexp.h"
+#include "clc/math/clc_log2.h"
+#include "clc/math/clc_log2_fast.h"
+#include "clc/math/clc_mad.h"
+#include "clc/math/clc_recip_fast.h"
+#include "clc/math/clc_rootn.h"
+#include "clc/math/clc_trunc.h"
+#include "clc/relational/clc_isinf.h"
+#include "clc/relational/clc_isunordered.h"
-#define __CLC_BODY <clc_rootn.inc>
+#define __CLC_COMPILING_ROOTN
+#define __CLC_BODY <clc_pow_base.inc>
#include <clc/math/gentype.inc>
+
+#define __CLC_FUNCTION __clc_rootn
+#define __CLC_ARG2_SCALAR_TYPE int
+#define __CLC_BODY "clc/shared/binary_def_scalarize_loop.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_ARG2_SCALAR_TYPE int
+#define __CLC_FUNCTION __clc_rootn_fast
+#define __CLC_BODY "clc/shared/binary_def_scalarize_loop.inc"
+#include "clc/math/gentype.inc"
diff --git a/libclc/clc/lib/generic/math/clc_rootn.inc b/libclc/clc/lib/generic/math/clc_rootn.inc
deleted file mode 100644
index fd3d0becb0dff..0000000000000
--- a/libclc/clc/lib/generic/math/clc_rootn.inc
+++ /dev/null
@@ -1,405 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Computes pow using log and exp
-//
-// x^y = exp(y * log(x))
-//
-// We take care not to lose precision in the intermediate steps.
-//
-// When computing log, calculate it in splits:
-//
-// r = f * (p_invead + p_inv_tail)
-// r = rh + rt
-//
-// Calculate log polynomial using r, in end addition, do:
-//
-// poly = poly + ((rh-r) + rt)
-//
-// lth = -r
-// ltt = ((xexp * log2_t) - poly) + logT
-// lt = lth + ltt
-//
-// lh = (xexp * log2_h) + logH
-// l = lh + lt
-//
-// Calculate final log answer as gh and gt:
-//
-// gh = l & higher-half bits
-// gt = (((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh))
-//
-// yh = y & higher-half bits
-// yt = y - yh
-//
-// Before entering computation of exp:
-//
-// vs = ((yt*gt + yt*gh) + yh*gt)
-// v = vs + yh*gh
-// vt = ((yh*gh - v) + vs)
-//
-// In calculation of exp, add vt to r that is used for poly.
-//
-// At the end of exp, do:
-//
-// ((((expT * poly) + expT) + expH*poly) + expH)
-//
-//===----------------------------------------------------------------------===//
-
-#if __CLC_FPSIZE == 32
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
- __CLC_INTN ny) {
- __CLC_GENTYPE y = MATH_RECIP(__CLC_CONVERT_GENTYPE(ny));
-
- __CLC_INTN ix = __CLC_AS_INTN(x);
- __CLC_INTN ax = ix & EXSIGNBIT_SP32;
- __CLC_INTN xpos = ix == ax;
-
- __CLC_INTN iy = __CLC_AS_INTN(y);
- __CLC_INTN ay = iy & EXSIGNBIT_SP32;
- __CLC_INTN ypos = iy == ay;
-
- // Extra precise log calculation
- // First handle case that x is close to 1
- __CLC_GENTYPE r = 1.0f - __CLC_AS_GENTYPE(ax);
- __CLC_INTN near1 = __clc_fabs(r) < 0x1.0p-4f;
- __CLC_GENTYPE r2 = r * r;
-
- // Coefficients are just 1/3, 1/4, 1/5 and 1/6
- __CLC_GENTYPE poly = __clc_mad(
- r,
- __clc_mad(r,
- __clc_mad(r, __clc_mad(r, 0x1.24924ap-3f, 0x1.555556p-3f),
- 0x1.99999ap-3f),
- 0x1.000000p-2f),
- 0x1.555556p-2f);
-
- poly *= r2 * r;
-
- __CLC_GENTYPE lth_near1 = -r2 * 0.5f;
- __CLC_GENTYPE ltt_near1 = -poly;
- __CLC_GENTYPE lt_near1 = lth_near1 + ltt_near1;
- __CLC_GENTYPE lh_near1 = -r;
- __CLC_GENTYPE l_near1 = lh_near1 + lt_near1;
-
- // Computations for x not near 1
- __CLC_INTN m = __CLC_CONVERT_INTN(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
- __CLC_GENTYPE mf = __CLC_CONVERT_GENTYPE(m);
- __CLC_INTN ixs = __CLC_AS_INTN(__CLC_AS_GENTYPE(ax | 0x3f800000) - 1.0f);
- __CLC_GENTYPE mfs = __CLC_CONVERT_GENTYPE((ixs >> EXPSHIFTBITS_SP32) - 253);
- __CLC_INTN c = m == -127;
- __CLC_INTN ixn = c ? ixs : ax;
- __CLC_GENTYPE mfn = c ? mfs : mf;
-
- __CLC_INTN indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1);
-
- // F - Y
- __CLC_GENTYPE f = __CLC_AS_GENTYPE(0x3f000000 | indx) -
- __CLC_AS_GENTYPE(0x3f000000 | (ixn & MANTBITS_SP32));
-
- indx = indx >> 16;
- __CLC_GENTYPE rh = f * __CLC_USE_TABLE(log_inv_tbl_ep_head, indx);
- __CLC_GENTYPE rt = f * __CLC_USE_TABLE(log_inv_tbl_ep_tail, indx);
- ;
- r = rh + rt;
-
- poly = __clc_mad(r, __clc_mad(r, 0x1.0p-2f, 0x1.555556p-2f), 0x1.0p-1f) *
- (r * r);
- poly += (rh - r) + rt;
-
- const __CLC_GENTYPE LOG2_HEAD = 0x1.62e000p-1f; // 0.693115234
- const __CLC_GENTYPE LOG2_TAIL = 0x1.0bfbe8p-15f; // 0.0000319461833
- __CLC_GENTYPE lth = -r;
- __CLC_GENTYPE ltt =
- __clc_mad(mfn, LOG2_TAIL, -poly) + __CLC_USE_TABLE(loge_tbl_hi, indx);
- __CLC_GENTYPE lt = lth + ltt;
- __CLC_GENTYPE lh =
- __clc_mad(mfn, LOG2_HEAD, __CLC_USE_TABLE(loge_tbl_lo, indx));
- __CLC_GENTYPE l = lh + lt;
-
- // Select near 1 or not
- lth = near1 ? lth_near1 : lth;
- ltt = near1 ? ltt_near1 : ltt;
- lt = near1 ? lt_near1 : lt;
- lh = near1 ? lh_near1 : lh;
- l = near1 ? l_near1 : l;
-
- __CLC_GENTYPE gh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(l) & 0xfffff000);
- __CLC_GENTYPE gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh);
-
- __CLC_GENTYPE yh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(iy) & 0xfffff000);
-
- __CLC_GENTYPE fny = __CLC_CONVERT_GENTYPE(ny);
- __CLC_GENTYPE fnyh = __CLC_AS_GENTYPE(__CLC_AS_UINTN(fny) & 0xfffff000);
- __CLC_GENTYPE fnyt = __CLC_CONVERT_GENTYPE(ny - __CLC_CONVERT_INTN(fnyh));
- __CLC_GENTYPE yt =
- MATH_DIVIDE(__clc_mad(-fnyt, yh, __clc_mad(-fnyh, yh, 1.0f)), fny);
-
- __CLC_GENTYPE ylogx_s = __clc_mad(gt, yh, __clc_mad(gh, yt, yt * gt));
- __CLC_GENTYPE ylogx = __clc_mad(yh, gh, ylogx_s);
- __CLC_GENTYPE ylogx_t = __clc_mad(yh, gh, -ylogx) + ylogx_s;
-
- // Extra precise exp of ylogx
- const __CLC_GENTYPE R_64_BY_LOG2 =
- 0x1.715476p+6f; // 64/log2 : 92.332482616893657
- __CLC_INTN n = __CLC_CONVERT_INTN(ylogx * R_64_BY_LOG2);
- __CLC_GENTYPE nf = __CLC_CONVERT_GENTYPE(n);
-
- __CLC_INTN j = n & 0x3f;
- m = n >> 6;
- __CLC_INTN m2 = m << EXPSHIFTBITS_SP32;
-
- // log2/64 lead: 0.0108032227
- const __CLC_GENTYPE R_LOG2_BY_64_LD = 0x1.620000p-7f;
- // log2/64 tail: 0.0000272020388
- const __CLC_GENTYPE R_LOG2_BY_64_TL = 0x1.c85fdep-16f;
- r = __clc_mad(nf, -R_LOG2_BY_64_TL, __clc_mad(nf, -R_LOG2_BY_64_LD, ylogx)) +
- ylogx_t;
-
- // Truncated Taylor series for e^r
- poly = __clc_mad(__clc_mad(__clc_mad(r, 0x1.555556p-5f, 0x1.555556p-3f), r,
- 0x1.000000p-1f),
- r * r, r);
-
- __CLC_GENTYPE exph = __CLC_USE_TABLE(exp_tbl_ep_head, j);
- __CLC_GENTYPE expt = __CLC_USE_TABLE(exp_tbl_ep_tail, j);
-
- __CLC_GENTYPE expylogx =
- __clc_mad(exph, poly, __clc_mad(expt, poly, expt)) + exph;
- __CLC_GENTYPE sexpylogx =
- __clc_fp32_subnormals_supported()
- ? expylogx * __CLC_AS_GENTYPE((__CLC_INTN)0x1 << (m + 149))
- : 0.0f;
-
- __CLC_GENTYPE texpylogx = __CLC_AS_GENTYPE(__CLC_AS_INTN(expylogx) + m2);
- expylogx = m < -125 ? sexpylogx : texpylogx;
-
- // Result is +-Inf if (ylogx + ylogx_t) > 128*log2
- expylogx = ((ylogx > 0x1.62e430p+6f) |
- (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f))
- ? __CLC_AS_GENTYPE((__CLC_UINTN)PINFBITPATT_SP32)
- : expylogx;
-
- // Result is 0 if ylogx < -149*log2
- expylogx = ylogx < -0x1.9d1da0p+6f ? 0.0f : expylogx;
-
- // Classify y:
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- __CLC_INTN inty = 2 - (ny & 1);
-
- __CLC_GENTYPE signval =
- __CLC_AS_GENTYPE((__CLC_AS_UINTN(expylogx) ^ SIGNBIT_SP32));
- expylogx = ((inty == 1) & !xpos) ? signval : expylogx;
- __CLC_INTN ret = __CLC_AS_INTN(expylogx);
-
- // Corner case handling
- __CLC_BIT_INTN x_is_ninf = ix == (__CLC_INTN)NINFBITPATT_SP32;
- __CLC_BIT_INTN x_is_pinf = ix == (__CLC_INTN)PINFBITPATT_SP32;
-
- ret = (!xpos & (inty == 2)) ? __CLC_AS_INTN(__CLC_GENTYPE_NAN) : ret;
- __CLC_INTN xinf =
- xpos ? (__CLC_INTN)PINFBITPATT_SP32 : (__CLC_INTN)NINFBITPATT_SP32;
- ret = ((ax == 0) & !ypos & (inty == 1)) ? xinf : ret;
- ret = ((ax == 0) & !ypos & (inty == 2)) ? PINFBITPATT_SP32 : ret;
- ret = ((ax == 0) & ypos & (inty == 2)) ? 0 : ret;
- __CLC_INTN xzero = xpos ? 0 : (__CLC_INTN)0x80000000;
- ret = ((ax == 0) & ypos & (inty == 1)) ? xzero : ret;
- ret = (x_is_ninf & ypos & (inty == 1)) ? (__CLC_INTN)NINFBITPATT_SP32 : ret;
- ret = (x_is_ninf & !ypos & (inty == 1)) ? (__CLC_INTN)0x80000000 : ret;
- ret = (x_is_pinf & !ypos) ? 0 : ret;
- ret = (x_is_pinf & ypos) ? PINFBITPATT_SP32 : ret;
- ret = ax > PINFBITPATT_SP32 ? ix : ret;
- ret = ny == 0 ? __CLC_AS_INTN(__CLC_GENTYPE_NAN) : ret;
-
- return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 64
-
-_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
- __CLC_INTN ny) {
- const __CLC_GENTYPE real_log2_tail = 5.76999904754328540596e-08;
- const __CLC_GENTYPE real_log2_lead = 6.93147122859954833984e-01;
-
- __CLC_GENTYPE dny = __CLC_CONVERT_GENTYPE(ny);
- __CLC_GENTYPE y = 1.0 / dny;
-
- __CLC_LONGN ux = __CLC_AS_LONGN(x);
- __CLC_LONGN ax = __CLC_AS_LONGN(__clc_fabs(x));
- __CLC_BIT_INTN xpos = ax == ux;
-
- __CLC_LONGN uy = __CLC_AS_LONGN(y);
- __CLC_LONGN ay = __CLC_AS_LONGN(__clc_fabs(y));
- __CLC_BIT_INTN ypos = ay == uy;
-
- // Extended precision log
- __CLC_GENTYPE v, vt;
- {
- __CLC_INTN exp = __CLC_CONVERT_INTN(ax >> 52) - 1023;
- __CLC_INTN mask_exp_1023 = exp == -1023;
- __CLC_GENTYPE xexp = __CLC_CONVERT_GENTYPE(exp);
- __CLC_LONGN mantissa = ax & 0x000FFFFFFFFFFFFFL;
-
- __CLC_LONGN temp_ux =
- __CLC_AS_LONGN(__CLC_AS_GENTYPE(0x3ff0000000000000L | mantissa) - 1.0);
- exp = __CLC_CONVERT_INTN((temp_ux & 0x7FF0000000000000L) >> 52) - 2045;
- __CLC_GENTYPE xexp1 = __CLC_CONVERT_GENTYPE(exp);
- __CLC_LONGN mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL;
-
- xexp = __CLC_CONVERT_LONGN(mask_exp_1023) ? xexp1 : xexp;
- mantissa = __CLC_CONVERT_LONGN(mask_exp_1023) ? mantissa1 : mantissa;
-
- __CLC_LONGN rax = (mantissa & 0x000ff00000000000) +
- ((mantissa & 0x0000080000000000) << 1);
- __CLC_INTN index = __CLC_CONVERT_INTN(rax >> 44);
-
- __CLC_GENTYPE F = __CLC_AS_GENTYPE(rax | 0x3FE0000000000000L);
- __CLC_GENTYPE Y = __CLC_AS_GENTYPE(mantissa | 0x3FE0000000000000L);
- __CLC_GENTYPE f = F - Y;
- __CLC_GENTYPE log_h = __CLC_USE_TABLE(log_f_inv_tbl_head, index);
- __CLC_GENTYPE log_t = __CLC_USE_TABLE(log_f_inv_tbl_tail, index);
- __CLC_GENTYPE f_inv = (log_h + log_t) * f;
- __CLC_GENTYPE r1 =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(f_inv) & 0xfffffffff8000000L);
- __CLC_GENTYPE r2 = __clc_fma(-F, r1, f) * (log_h + log_t);
- __CLC_GENTYPE r = r1 + r2;
-
- __CLC_GENTYPE poly = __clc_fma(
- r,
- __clc_fma(r,
- __clc_fma(r, __clc_fma(r, 1.0 / 7.0, 1.0 / 6.0), 1.0 / 5.0),
- 1.0 / 4.0),
- 1.0 / 3.0);
- poly = poly * r * r * r;
-
- __CLC_GENTYPE hr1r1 = 0.5 * r1 * r1;
- __CLC_GENTYPE poly0h = r1 + hr1r1;
- __CLC_GENTYPE poly0t = r1 - poly0h + hr1r1;
- poly = __clc_fma(r1, r2, __clc_fma(0.5 * r2, r2, poly)) + r2 + poly0t;
-
- log_h = __CLC_USE_TABLE(powlog_tbl_head, index);
- log_t = __CLC_USE_TABLE(powlog_tbl_tail, index);
-
- __CLC_GENTYPE resT_t = __clc_fma(xexp, real_log2_tail, +log_t) - poly;
- __CLC_GENTYPE resT = resT_t - poly0h;
- __CLC_GENTYPE resH = __clc_fma(xexp, real_log2_lead, log_h);
- __CLC_GENTYPE resT_h = poly0h;
-
- __CLC_GENTYPE H = resT + resH;
- __CLC_GENTYPE H_h =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(H) & 0xfffffffff8000000L);
- __CLC_GENTYPE T =
- (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h);
- H = H_h;
-
- __CLC_GENTYPE y_head =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(uy) & 0xfffffffff8000000L);
- __CLC_GENTYPE y_tail = y - y_head;
-
- __CLC_GENTYPE fnyh =
- __CLC_AS_GENTYPE(__CLC_AS_ULONGN(dny) & 0xfffffffffff00000);
- __CLC_GENTYPE fnyt = __CLC_CONVERT_GENTYPE(ny - __CLC_CONVERT_INTN(fnyh));
- y_tail = __clc_fma(-fnyt, y_head, __clc_fma(-fnyh, y_head, 1.0)) / dny;
-
- __CLC_GENTYPE temp = __clc_fma(y_tail, H, __clc_fma(y_head, T, y_tail * T));
- v = __clc_fma(y_head, H, temp);
- vt = __clc_fma(y_head, H, -v) + temp;
- }
-
- // Now calculate exp of (v,vt)
-
- __CLC_GENTYPE expv;
- {
- const __CLC_GENTYPE max_exp_arg = 709.782712893384;
- const __CLC_GENTYPE min_exp_arg = -745.1332191019411;
- const __CLC_GENTYPE sixtyfour_by_lnof2 = 92.33248261689366;
- const __CLC_GENTYPE lnof2_by_64_head = 0.010830424260348081;
- const __CLC_GENTYPE lnof2_by_64_tail = -4.359010638708991e-10;
-
- // If v is so large that we need to return INFINITY, or so small that we
- // need to return 0, set v to known values that will produce that result. Do
- // not try to continue the computation with the original v and patch it up
- // afterwards because v may be so large that temp is out of range of int, in
- // which case that conversion, and a value based on that conversion being
- // passed to __clc_ldexp, results in undefined behavior.
- v = v > max_exp_arg ? 1000.0 : v;
- v = v < min_exp_arg ? -1000.0 : v;
-
- __CLC_GENTYPE temp = v * sixtyfour_by_lnof2;
- __CLC_INTN n = __CLC_CONVERT_INTN(temp);
- __CLC_GENTYPE dn = __CLC_CONVERT_GENTYPE(n);
- __CLC_INTN j = n & 0x0000003f;
- __CLC_INTN m = n >> 6;
-
- __CLC_GENTYPE f1 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_head, j);
- __CLC_GENTYPE f2 = __CLC_USE_TABLE(two_to_jby64_ep_tbl_tail, j);
- __CLC_GENTYPE f = f1 + f2;
-
- __CLC_GENTYPE r1 = __clc_fma(dn, -lnof2_by_64_head, v);
- __CLC_GENTYPE r2 = dn * lnof2_by_64_tail;
- __CLC_GENTYPE r = (r1 + r2) + vt;
-
- __CLC_GENTYPE q =
- __clc_fma(r,
- __clc_fma(r,
- __clc_fma(r,
- __clc_fma(r, 1.38889490863777199667e-03,
- 8.33336798434219616221e-03),
- 4.16666666662260795726e-02),
- 1.66666666665260878863e-01),
- 5.00000000000000008883e-01);
- q = __clc_fma(r * r, q, r);
-
- expv = __clc_fma(f, q, f2) + f1;
- expv = __clc_ldexp(expv, m);
- }
-
- // See whether y is an integer.
- // inty = 0 means not an integer.
- // inty = 1 means odd integer.
- // inty = 2 means even integer.
-
- __CLC_LONGN inty = __CLC_CONVERT_LONGN(2 - (ny & 1));
-
- expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0;
-
- __CLC_LONGN ret = __CLC_AS_LONGN(expv);
-
- // Now all the edge cases
- __CLC_BIT_INTN x_is_ninf = ux == (__CLC_LONGN)NINFBITPATT_DP64;
- __CLC_BIT_INTN x_is_pinf = ux == (__CLC_LONGN)PINFBITPATT_DP64;
- ret = (!xpos & (inty == 2)) ? __CLC_AS_LONGN(__CLC_GENTYPE_NAN) : ret;
- __CLC_LONGN xinf =
- xpos ? (__CLC_LONGN)PINFBITPATT_DP64 : (__CLC_LONGN)NINFBITPATT_DP64;
- ret = ((ax == 0L) & !ypos & (inty == 1)) ? xinf : ret;
- ret =
- ((ax == 0L) & !ypos & (inty == 2)) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret;
- ret = ((ax == 0L) & ypos & (inty == 2)) ? 0L : ret;
- __CLC_LONGN xzero = xpos ? 0L : (__CLC_LONGN)0x8000000000000000L;
- ret = ((ax == 0L) & ypos & (inty == 1)) ? xzero : ret;
- ret = (x_is_ninf & ypos & (inty == 1)) ? (__CLC_LONGN)NINFBITPATT_DP64 : ret;
- ret = (x_is_ninf & !ypos & (inty == 1)) ? (__CLC_LONGN)0x8000000000000000L
- : ret;
- ret = (x_is_pinf & !ypos) ? 0L : ret;
- ret = (x_is_pinf & ypos) ? (__CLC_LONGN)PINFBITPATT_DP64 : ret;
- ret = ax > (__CLC_LONGN)PINFBITPATT_DP64 ? ux : ret;
- ret = __CLC_CONVERT_LONGN(ny == 0) ? __CLC_AS_LONGN(__CLC_GENTYPE_NAN) : ret;
- return __CLC_AS_GENTYPE(ret);
-}
-
-#elif __CLC_FPSIZE == 16
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_rootn(__CLC_GENTYPE x,
- __CLC_INTN y) {
- return __CLC_CONVERT_GENTYPE(__clc_rootn(__CLC_CONVERT_FLOATN(x), y));
-}
-
-#endif
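
[Aside, not part of the patch] The exp stage of each deleted file carries the same guard, explained in its comment: clamp v before the float-to-int conversion, because converting a value outside the range of int is undefined behavior, and the substituted values still drive the result to +inf or 0. A host-side C sketch of that guard; safe_exp is a made-up name, and 709.78 / -745.13 are the usual double exp overflow/underflow cutoffs:

#include <math.h>
#include <stdio.h>

static double safe_exp(double v) {
  const double max_exp_arg = 709.782712893384;
  const double min_exp_arg = -745.1332191019411;
  /* Any value past the cutoffs gives the same result, so substitute ones
   * that keep the intermediate int conversion in range. */
  if (v > max_exp_arg)
    v = 1000.0;
  if (v < min_exp_arg)
    v = -1000.0;
  int n = (int)(v * 92.33248261689366); /* 64/ln(2), now guaranteed in range */
  (void)n; /* the table lookup and polynomial would follow; use libm here */
  return exp(v);
}

int main(void) {
  printf("%g %g\n", safe_exp(1e300), safe_exp(-1e300)); /* inf 0 */
  return 0;
}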
diff --git a/libclc/clc/lib/generic/math/clc_sin.inc b/libclc/clc/lib/generic/math/clc_sin.inc
index b4f72eb625eb0..973833ba0c2fc 100644
--- a/libclc/clc/lib/generic/math/clc_sin.inc
+++ b/libclc/clc/lib/generic/math/clc_sin.inc
@@ -9,6 +9,8 @@
#if __CLC_FPSIZE == 32
_CLC_OVERLOAD _CLC_DEF __CLC_FLOATN __clc_sin(__CLC_FLOATN x) {
+ x = __clc_isinf(x) ? __CLC_GENTYPE_NAN : x;
+
__CLC_FLOATN absx = __clc_fabs(x);
__CLC_FLOATN r0, r1;
@@ -18,15 +20,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_FLOATN __clc_sin(__CLC_FLOATN x) {
__CLC_FLOATN cc = __clc_cosf_piby4(r0, r1);
__CLC_FLOATN s = (regn & 1) != 0 ? cc : ss;
- s = __CLC_AS_FLOATN(__CLC_AS_INTN(s) ^ ((regn > 1) << 31) ^
- (__CLC_AS_INTN(x) ^ __CLC_AS_INTN(absx)));
-
- s = __clc_select(s, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x));
-
- // Subnormals
- s = x == 0.0f ? x : s;
-
- return s;
+ return __CLC_AS_FLOATN(__CLC_AS_INTN(s) ^ ((regn > 1) << 31) ^
+ (__CLC_AS_INTN(x) ^ __CLC_AS_INTN(absx)));
}
#elif __CLC_FPSIZE == 16
@@ -38,6 +33,8 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sin(__CLC_GENTYPE x) {
#elif __CLC_FPSIZE == 64
_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sin(__CLC_GENTYPE x) {
+ x = __clc_isinf(x) ? __CLC_GENTYPE_NAN : x;
+
__CLC_GENTYPE absx = __clc_fabs(x);
__CLC_BIT_INTN is_medium = absx < 0x1.0p+47;
@@ -61,8 +58,7 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_sin(__CLC_GENTYPE x) {
s ^= (__CLC_CONVERT_BIT_INTN(regn > 1) << 63) ^
(__CLC_CONVERT_BIT_INTN(x < 0.0) << 63);
- return __clc_isinf(x) | __clc_isnan(x) ? __CLC_GENTYPE_NAN
- : __CLC_AS_GENTYPE(s);
+ return __CLC_AS_GENTYPE(s);
}
#endif
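
[Aside, not part of the patch] The sin (and tan) change folds infinity into NaN before argument reduction instead of patching the result with a select at the end. The reason the single return path then suffices: once x is NaN, the reduced argument and the polynomial result stay NaN, and XOR-ing the sign bit of a NaN bit pattern still yields a NaN. A tiny host-side C illustration, with libm sinf standing in for the piby4 kernels:

#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void) {
  float x = INFINITY;
  if (isinf(x))
    x = NAN;             /* early normalization, as in the patch */
  float s = sinf(x);     /* NaN in, NaN out */
  uint32_t bits;
  memcpy(&bits, &s, sizeof bits);
  bits ^= 0x80000000u;   /* the sign fix-up the kernel applies via XOR */
  memcpy(&s, &bits, sizeof bits);
  printf("%d\n", isnan(s)); /* 1: still NaN after the sign xor */
  return 0;
}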
diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
index 19705c42f6f07..675f48059514c 100644
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.cl
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#include "clc/math/clc_frexp.h"
+#include "clc/math/clc_ldexp.h"
#include <clc/clc_convert.h>
#include <clc/integer/clc_clz.h>
#include <clc/integer/clc_mul_hi.h>
@@ -16,16 +18,7 @@
#include <clc/math/clc_trunc.h>
#include <clc/math/math.h>
-#define bitalign(hi, lo, shift) ((hi) << (32 - (shift))) | ((lo) >> (shift));
-
-#define __CLC_FULL_MUL(A, B, HI, LO) \
- LO = A * B; \
- HI = __clc_mul_hi(A, B)
-
-#define __CLC_FULL_MAD(A, B, C, HI, LO) \
- LO = ((A) * (B) + (C)); \
- HI = __clc_mul_hi(A, B); \
- HI += LO < C ? 1U : 0U;
+#define bitalign(hi, lo, shift) __builtin_elementwise_fshr(hi, lo, shift)
#define __CLC_FLOAT_ONLY
#define __CLC_BODY <clc_sincos_helpers.inc>
diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers.inc b/libclc/clc/lib/generic/math/clc_sincos_helpers.inc
index 2a71b5626ccc5..d484766f61ffe 100644
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers.inc
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers.inc
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#pragma OPENCL FP_CONTRACT OFF
+
_CLC_DEF _CLC_OVERLOAD __CLC_FLOATN __clc_sinf_piby4(__CLC_FLOATN x,
__CLC_FLOATN y) {
// Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
@@ -197,8 +199,11 @@ _CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionSmallS(
_CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionLargeS(
private __CLC_FLOATN *r, private __CLC_FLOATN *rr, __CLC_FLOATN x) {
- __CLC_INTN xe = __CLC_AS_INTN((__CLC_AS_UINTN(x) >> 23) - 127);
- __CLC_UINTN xm = 0x00800000U | (__CLC_AS_UINTN(x) & 0x7fffffU);
+ __CLC_INTN xe;
+ __CLC_FLOATN m = __clc_frexp(x, &xe);
+ --xe;
+
+ __CLC_UINTN xm = __CLC_CONVERT_UINTN(__clc_ldexp(m, 24));
// 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041
// FE5163AB
@@ -210,15 +215,38 @@ _CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionLargeS(
const __CLC_UINTN b1 = 0x3C439041U;
const __CLC_UINTN b0 = 0xFE5163ABU;
- __CLC_UINTN p0, p1, p2, p3, p4, p5, p6, p7, c0, c1;
+ __CLC_UINTN p0, p1, p2, p3, p4, p5, p6, p7;
+ __CLC_ULONGN a;
+
+ __CLC_ULONGN xm_u64 = __CLC_CONVERT_ULONGN(xm);
+
+ a = xm_u64 * __CLC_CONVERT_ULONGN(b0);
+ p0 = __CLC_CONVERT_UINTN(a);
+ a >>= 32;
+
+ a = xm_u64 * __CLC_CONVERT_ULONGN(b1) + a;
+ p1 = __CLC_CONVERT_UINTN(a);
+ a >>= 32;
+
+ a = xm_u64 * __CLC_CONVERT_ULONGN(b2) + a;
+ p2 = __CLC_CONVERT_UINTN(a);
+ a >>= 32;
- __CLC_FULL_MUL(xm, b0, c0, p0);
- __CLC_FULL_MAD(xm, b1, c0, c1, p1);
- __CLC_FULL_MAD(xm, b2, c1, c0, p2);
- __CLC_FULL_MAD(xm, b3, c0, c1, p3);
- __CLC_FULL_MAD(xm, b4, c1, c0, p4);
- __CLC_FULL_MAD(xm, b5, c0, c1, p5);
- __CLC_FULL_MAD(xm, b6, c1, p7, p6);
+ a = xm_u64 * __CLC_CONVERT_ULONGN(b3) + a;
+ p3 = __CLC_CONVERT_UINTN(a);
+ a >>= 32;
+
+ a = xm_u64 * __CLC_CONVERT_ULONGN(b4) + a;
+ p4 = __CLC_CONVERT_UINTN(a);
+ a >>= 32;
+
+ a = xm_u64 * __CLC_CONVERT_ULONGN(b5) + a;
+ p5 = __CLC_CONVERT_UINTN(a);
+ a >>= 32;
+
+ a = xm_u64 * __CLC_CONVERT_ULONGN(b6) + a;
+ p6 = __CLC_CONVERT_UINTN(a);
+ p7 = __CLC_CONVERT_UINTN(a >> 32);
__CLC_UINTN fbits = (__CLC_UINTN)224 + (__CLC_UINTN)23 - __CLC_AS_UINTN(xe);
@@ -227,24 +255,14 @@ _CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionLargeS(
__CLC_UINTN shift = 256U - 2 - fbits;
// Shift by up to 134/32 = 4 words
- __CLC_INTN c = shift > 31;
- p7 = c ? p6 : p7;
- p6 = c ? p5 : p6;
- p5 = c ? p4 : p5;
- p4 = c ? p3 : p4;
- p3 = c ? p2 : p3;
- p2 = c ? p1 : p2;
- p1 = c ? p0 : p1;
- shift -= (c ? 32U : 0U);
-
- c = shift > 31;
- p7 = c ? p6 : p7;
- p6 = c ? p5 : p6;
- p5 = c ? p4 : p5;
- p4 = c ? p3 : p4;
- p3 = c ? p2 : p3;
- p2 = c ? p1 : p2;
- shift -= (c ? 32U : 0U);
+ __CLC_INTN c = shift > 63;
+ p7 = c ? p5 : p7;
+ p6 = c ? p4 : p6;
+ p5 = c ? p3 : p5;
+ p4 = c ? p2 : p4;
+ p3 = c ? p1 : p3;
+ p2 = c ? p0 : p2;
+ shift -= __CLC_CONVERT_UINTN((-c) & (__CLC_INTN)64);
c = shift > 31;
p7 = c ? p6 : p7;
@@ -252,14 +270,14 @@ _CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionLargeS(
p5 = c ? p4 : p5;
p4 = c ? p3 : p4;
p3 = c ? p2 : p3;
- shift -= (c ? 32U : 0U);
+ shift -= __CLC_CONVERT_UINTN((-c) & (__CLC_INTN)32);
c = shift > 31;
p7 = c ? p6 : p7;
p6 = c ? p5 : p6;
p5 = c ? p4 : p5;
p4 = c ? p3 : p4;
- shift -= (c ? 32U : 0U);
+ shift -= __CLC_CONVERT_UINTN((-c) & (__CLC_INTN)32);
// bitalign cannot handle a shift of 32
c = shift > 0;
@@ -275,9 +293,9 @@ _CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionLargeS(
__CLC_INTN i = __CLC_AS_INTN(p7 >> 29U);
// Scoot up 2 more bits so only fraction remains
- p7 = bitalign(p7, p6, 30);
- p6 = bitalign(p6, p5, 30);
- p5 = bitalign(p5, p4, 30);
+ p7 = bitalign(p7, p6, (__CLC_UINTN)30u);
+ p6 = bitalign(p6, p5, (__CLC_UINTN)30u);
+ p5 = bitalign(p5, p4, (__CLC_UINTN)30u);
// Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5
__CLC_UINTN flip = (i & 1) != 0 ? 0xFFFFFFFFU : 0U;
@@ -297,12 +315,12 @@ _CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionLargeS(
__CLC_AS_FLOATN(sign | ((127U - __CLC_AS_UINTN(xe)) << 23U) | p7 >> 9);
// Shift out bits we captured on q1
- p7 = bitalign(p7, p6, 32 - 23);
+ p7 = bitalign(p7, p6, (__CLC_UINTN)(32u - 23u));
// Get 24 more bits of fraction in another float, there are not long strings
// of zeroes here
__CLC_INTN xxe = __CLC_AS_INTN(__clc_clz(p7)) + 1;
- p7 = bitalign(p7, p6, 32 - xxe);
+ p7 = bitalign(p7, p6, __CLC_CONVERT_UINTN(32 - xxe));
__CLC_FLOATN q0 = __CLC_AS_FLOATN(
sign | ((127U - __CLC_AS_UINTN(xe + 23 + xxe)) << 23U) | p7 >> 9);
@@ -343,18 +361,17 @@ _CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionLargeS(
_CLC_DEF _CLC_OVERLOAD __CLC_INTN __clc_argReductionS(private __CLC_FLOATN *r,
private __CLC_FLOATN *rr,
__CLC_FLOATN x) {
- __CLC_INTN is_small = x < (__CLC_FLOATN)0x1.0p+23f;
+ __CLC_INTN is_large = x >= (__CLC_FLOATN)0x1.0p+23f;
#ifdef __CLC_SCALAR
- if (is_small)
- return __clc_argReductionSmallS(r, rr, x);
- else
+ if (is_large)
return __clc_argReductionLargeS(r, rr, x);
+ return __clc_argReductionSmallS(r, rr, x);
#else
__CLC_FLOATN r1, rr1, r2, rr2;
__CLC_INTN ret1 = __clc_argReductionSmallS(&r1, &rr1, x);
__CLC_INTN ret2 = __clc_argReductionLargeS(&r2, &rr2, x);
- *r = is_small ? r1 : r2;
- *rr = is_small ? rr1 : rr2;
- return is_small ? ret1 : ret2;
+ *r = is_large ? r2 : r1;
+ *rr = is_large ? rr2 : rr1;
+ return is_large ? ret2 : ret1;
#endif
}
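
[Aside, not part of the patch] The large-argument reduction hunk above replaces the old __CLC_FULL_MUL / __CLC_FULL_MAD macros with an explicit carry-propagating 32x32->64 multiply-accumulate chain: each step multiplies the 24-bit mantissa by one 32-bit word of 2/pi, adds the carry from the previous step, keeps the low 32 bits as a product word, and carries the high 32 bits forward. A scalar host-side C sketch of that chain (the xm value below is just an example mantissa):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* words of 2/pi, least significant first (same constants as the source) */
  const uint32_t b[7] = {0xFE5163ABu, 0x3C439041u, 0xDB629599u, 0xF534DDC0u,
                         0xFC2757D1u, 0x4E441529u, 0xA2F9836Eu};
  uint32_t xm = 0x00C90FDBu; /* example 24-bit mantissa */
  uint32_t p[8];
  uint64_t acc = 0;
  for (int i = 0; i < 7; ++i) {
    acc = (uint64_t)xm * b[i] + acc; /* 32x32->64 plus incoming carry */
    p[i] = (uint32_t)acc;            /* low word of the running product */
    acc >>= 32;                      /* carry for the next word */
  }
  p[7] = (uint32_t)acc;
  for (int i = 7; i >= 0; --i)
    printf("%08x ", p[i]); /* 256-bit product, most significant word first */
  printf("\n");
  return 0;
}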
diff --git a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
index f8c089edc8838..ae97b7963f7b3 100644
--- a/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
+++ b/libclc/clc/lib/generic/math/clc_sincos_helpers_fp64.inc
@@ -6,6 +6,8 @@
//
//===----------------------------------------------------------------------===//
+#pragma OPENCL FP_CONTRACT OFF
+
_CLC_DEF _CLC_OVERLOAD void __clc_sincos_piby4(__CLC_DOUBLEN x,
__CLC_DOUBLEN xx,
private __CLC_DOUBLEN *sinval,
diff --git a/libclc/clc/lib/generic/math/clc_tan.inc b/libclc/clc/lib/generic/math/clc_tan.inc
index 8a318a53a34ba..f1c3c0de11bc3 100644
--- a/libclc/clc/lib/generic/math/clc_tan.inc
+++ b/libclc/clc/lib/generic/math/clc_tan.inc
@@ -9,6 +9,8 @@
#if __CLC_FPSIZE == 32
_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_tan(__CLC_GENTYPE x) {
+ x = __clc_isinf(x) ? __CLC_GENTYPE_NAN : x;
+
__CLC_GENTYPE absx = __clc_fabs(x);
__CLC_UINTN x_signbit = __CLC_AS_UINTN(x) & SIGNBIT_SP32;
@@ -16,17 +18,14 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_tan(__CLC_GENTYPE x) {
__CLC_INTN regn = __clc_argReductionS(&r0, &r1, absx);
__CLC_GENTYPE t = __clc_tanf_piby4(r0 + r1, regn);
- t = __CLC_AS_GENTYPE(__CLC_AS_UINTN(t) ^ x_signbit);
-
- t = __clc_select(t, __CLC_GENTYPE_NAN, __clc_isnan(x) || __clc_isinf(x));
- // Take care of subnormals
- t = (x == 0.0f) ? x : t;
- return t;
+ return __CLC_AS_GENTYPE(__CLC_AS_UINTN(t) ^ x_signbit);
}
#elif __CLC_FPSIZE == 64
_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_tan(__CLC_GENTYPE x) {
+ x = __clc_isinf(x) ? __CLC_GENTYPE_NAN : x;
+
__CLC_GENTYPE y = __clc_fabs(x);
__CLC_BIT_INTN is_medium = y < 0x1.0p+30;
@@ -48,8 +47,7 @@ _CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __clc_tan(__CLC_GENTYPE x) {
__CLC_AS_LONGN(__CLC_CONVERT_BIT_INTN((regn & 1) != 0) ? tail : lead);
t ^= __CLC_CONVERT_BIT_INTN(x < 0.0) << 63;
- return __clc_isnan(x) || __clc_isinf(x) ? __CLC_GENTYPE_NAN
- : __CLC_AS_GENTYPE(t);
+ return __CLC_AS_GENTYPE(t);
}
#elif __CLC_FPSIZE == 16
diff --git a/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl b/libclc/clc/lib/generic/mem_fence/clc_mem_fence.cl
similarity index 83%
rename from libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl
rename to libclc/clc/lib/generic/mem_fence/clc_mem_fence.cl
index afc72e831cd99..5c5185292184c 100644
--- a/libclc/clc/lib/ptx-nvidiacl/mem_fence/clc_mem_fence.cl
+++ b/libclc/clc/lib/generic/mem_fence/clc_mem_fence.cl
@@ -11,8 +11,6 @@
_CLC_OVERLOAD _CLC_DEF void
__clc_mem_fence(int memory_scope, int memory_order,
__CLC_MemorySemantics memory_semantics) {
- (void)memory_order;
(void)memory_semantics;
- if (memory_scope & (__MEMORY_SCOPE_DEVICE | __MEMORY_SCOPE_WRKGRP))
- __nvvm_membar_cta();
+ __scoped_atomic_thread_fence(memory_scope, memory_order);
}
diff --git a/libclc/clc/lib/ptx-nvidiacl/CMakeLists.txt b/libclc/clc/lib/ptx-nvidiacl/CMakeLists.txt
index f345007e852e2..6eb0baab1c0bb 100644
--- a/libclc/clc/lib/ptx-nvidiacl/CMakeLists.txt
+++ b/libclc/clc/lib/ptx-nvidiacl/CMakeLists.txt
@@ -4,7 +4,6 @@ libclc_configure_source_list(CLC_PTX_NVIDIACL_SOURCES
math/clc_rsqrt.cl
math/clc_sinpi.cl
math/clc_sqrt.cl
- mem_fence/clc_mem_fence.cl
relational/clc_isinf.cl
synchronization/clc_work_group_barrier.cl
workitem/clc_get_global_id.cl
diff --git a/libclc/cmake/modules/CMakeCLCInformation.cmake b/libclc/cmake/modules/CMakeCLCInformation.cmake
index f92592221f034..3fb67d91dd1e6 100644
--- a/libclc/cmake/modules/CMakeCLCInformation.cmake
+++ b/libclc/cmake/modules/CMakeCLCInformation.cmake
@@ -11,14 +11,32 @@ if(NOT CMAKE_CLC_COMPILE_OBJECT)
"<CMAKE_CLC_COMPILER> -x cl <DEFINES> <INCLUDES> <FLAGS> -c -o <OBJECT> <SOURCE>")
endif()
+# Finds a required LLVM tool by searching the CLC compiler directory first.
+function(find_llvm_tool name out_var)
+ cmake_path(GET CMAKE_CLC_COMPILER PARENT_PATH llvm_bin_dir)
+ find_program(${out_var}
+ NAMES ${name}
+ HINTS "${llvm_bin_dir}"
+ DOC "libclc: path to the ${name} tool"
+ )
+ if(NOT ${out_var})
+ message(FATAL_ERROR "${name} not found for libclc build.")
+ endif()
+endfunction()
+
+find_llvm_tool(llvm-ar CLC_AR)
+find_llvm_tool(llvm-ranlib CLC_RANLIB)
+
if(NOT DEFINED CMAKE_CLC_ARCHIVE_CREATE)
- set(CMAKE_CLC_ARCHIVE_CREATE "<CMAKE_AR> qc <TARGET> <LINK_FLAGS> <OBJECTS>")
+ set(CMAKE_CLC_ARCHIVE_CREATE "${CLC_AR} qc <TARGET> <OBJECTS>")
endif()
+
if(NOT DEFINED CMAKE_CLC_ARCHIVE_APPEND)
- set(CMAKE_CLC_ARCHIVE_APPEND "<CMAKE_AR> q <TARGET> <LINK_FLAGS> <OBJECTS>")
+ set(CMAKE_CLC_ARCHIVE_APPEND "${CLC_AR} q <TARGET> <OBJECTS>")
endif()
+
if(NOT DEFINED CMAKE_CLC_ARCHIVE_FINISH)
- set(CMAKE_CLC_ARCHIVE_FINISH "<CMAKE_RANLIB> <TARGET>")
+ set(CMAKE_CLC_ARCHIVE_FINISH "${CLC_RANLIB} <TARGET>")
endif()
set(CMAKE_CLC_USE_LINKER_INFORMATION FALSE)
diff --git a/libclc/cmake/modules/CMakeDetermineCLCCompiler.cmake b/libclc/cmake/modules/CMakeDetermineCLCCompiler.cmake
index 2138ad85d0059..0ff60eb1b1144 100644
--- a/libclc/cmake/modules/CMakeDetermineCLCCompiler.cmake
+++ b/libclc/cmake/modules/CMakeDetermineCLCCompiler.cmake
@@ -4,7 +4,22 @@ if(NOT CMAKE_CLC_COMPILER)
"The CLC language requires the C compiler (CMAKE_C_COMPILER) to be "
"Clang, but CMAKE_C_COMPILER_ID is '${CMAKE_C_COMPILER_ID}'.")
endif()
- set(CMAKE_CLC_COMPILER "${CMAKE_C_COMPILER}" CACHE FILEPATH "CLC compiler")
+
+ # Use the regular clang driver if the C compiler is clang-cl.
+ if(CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_C_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC")
+ cmake_path(GET CMAKE_C_COMPILER PARENT_PATH llvm_bin_dir)
+ find_program(clang_exe clang
+ HINTS "${llvm_bin_dir}"
+ NO_DEFAULT_PATH
+ )
+ if(NOT clang_exe)
+ message(FATAL_ERROR "clang-cl detected, but clang not found in ${llvm_bin_dir}")
+ endif()
+ set(clc_compiler "${clang_exe}")
+ else()
+ set(clc_compiler "${CMAKE_C_COMPILER}")
+ endif()
+ set(CMAKE_CLC_COMPILER "${clc_compiler}" CACHE FILEPATH "libclc: CLC compiler")
endif()
mark_as_advanced(CMAKE_CLC_COMPILER)
diff --git a/libclc/cmake/modules/CMakeTestCLCCompiler.cmake b/libclc/cmake/modules/CMakeTestCLCCompiler.cmake
index d18a818f93bfb..b6b6a33aad19d 100644
--- a/libclc/cmake/modules/CMakeTestCLCCompiler.cmake
+++ b/libclc/cmake/modules/CMakeTestCLCCompiler.cmake
@@ -15,8 +15,8 @@ file(MAKE_DIRECTORY "${_test_dir}")
message(STATUS "Check for working CLC compiler: ${CMAKE_CLC_COMPILER}")
execute_process(
- COMMAND "${CMAKE_CLC_COMPILER}" -x cl -c -flto
- -o "${_test_out}" "${_test_file}"
+ COMMAND "${CMAKE_CLC_COMPILER}" --target=spirv64-unknown-unknown -x cl -c -flto
+ -disable-llvm-passes -o "${_test_out}" "${_test_file}"
RESULT_VARIABLE _clc_result
ERROR_VARIABLE _clc_error
)
diff --git a/libclc/opencl/lib/generic/math/pow.cl b/libclc/opencl/lib/generic/math/pow.cl
index 2fd3a50733ed5..07aa50abe48d6 100644
--- a/libclc/opencl/lib/generic/math/pow.cl
+++ b/libclc/opencl/lib/generic/math/pow.cl
@@ -6,8 +6,16 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/math/clc_pow.h>
+#include "clc/math/clc_pow.h"
#define __CLC_FUNCTION pow
-#define __CLC_BODY <clc/shared/binary_def.inc>
-#include <clc/math/gentype.inc>
+#define __CLC_BODY "clc/shared/binary_def.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+#undef __CLC_IMPL_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __pow_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_pow_fast
+#define __CLC_BODY "clc/shared/binary_def.inc"
+#include "clc/math/gentype.inc"
diff --git a/libclc/opencl/lib/generic/math/pown.cl b/libclc/opencl/lib/generic/math/pown.cl
index e48bc10a636ab..7f51666530b1a 100644
--- a/libclc/opencl/lib/generic/math/pown.cl
+++ b/libclc/opencl/lib/generic/math/pown.cl
@@ -6,8 +6,16 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/math/clc_pown.h>
+#include "clc/math/clc_pown.h"
#define __CLC_FUNCTION pown
#define __CLC_BODY <clc/shared/binary_def_with_int_second_arg.inc>
-#include <clc/math/gentype.inc>
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+#undef __CLC_IMPL_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __pown_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_pown_fast
+#define __CLC_BODY <clc/shared/binary_def_with_int_second_arg.inc>
+#include "clc/math/gentype.inc"
diff --git a/libclc/opencl/lib/generic/math/powr.cl b/libclc/opencl/lib/generic/math/powr.cl
index 168e30aa57b08..e74297589a2b9 100644
--- a/libclc/opencl/lib/generic/math/powr.cl
+++ b/libclc/opencl/lib/generic/math/powr.cl
@@ -6,8 +6,16 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/math/clc_powr.h>
+#include "clc/math/clc_powr.h"
#define __CLC_FUNCTION powr
#define __CLC_BODY <clc/shared/binary_def.inc>
-#include <clc/math/gentype.inc>
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+#undef __CLC_IMPL_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __powr_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_powr_fast
+#define __CLC_BODY <clc/shared/binary_def.inc>
+#include "clc/math/gentype.inc"
diff --git a/libclc/opencl/lib/generic/math/rootn.cl b/libclc/opencl/lib/generic/math/rootn.cl
index 8c9c7f4cc72f5..1c329e49dedaf 100644
--- a/libclc/opencl/lib/generic/math/rootn.cl
+++ b/libclc/opencl/lib/generic/math/rootn.cl
@@ -6,8 +6,16 @@
//
//===----------------------------------------------------------------------===//
-#include <clc/math/clc_rootn.h>
+#include "clc/math/clc_rootn.h"
#define __CLC_FUNCTION rootn
#define __CLC_BODY <clc/shared/binary_def_with_int_second_arg.inc>
-#include <clc/math/gentype.inc>
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+#undef __CLC_IMPL_FUNCTION
+
+#define __CLC_FLOAT_ONLY
+#define __CLC_FUNCTION __rootn_fast
+#define __CLC_IMPL_FUNCTION(x) __clc_rootn_fast
+#define __CLC_BODY <clc/shared/binary_def_with_int_second_arg.inc>
+#include "clc/math/gentype.inc"
diff --git a/libclc/test/CMakeLists.txt b/libclc/test/CMakeLists.txt
index cd0253f73d43a..3b16b5ab3f651 100644
--- a/libclc/test/CMakeLists.txt
+++ b/libclc/test/CMakeLists.txt
@@ -2,7 +2,7 @@ set(LIBCLC_LIBRARY_DIR ${LIBCLC_OUTPUT_LIBRARY_DIR})
set(LLVM_TOOLS_DIR ${LLVM_TOOLS_BINARY_DIR})
set(LIBCLC_TEST_DEPS
- llvm-dis
+ ${llvm-dis_target}
)
umbrella_lit_testsuite_begin(check-libclc)
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 0ec6d196e9772..e1620f91a453c 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -4,6 +4,12 @@
cmake_minimum_required(VERSION 3.20.0)
set(LLVM_SUBPROJECT_TITLE "libc++")
+# Unset the C++ Standard: other projects sometimes override this global property, which
+# is incompatible with libc++ (since we set our own standard via other means).
+# TODO: Figure out which other part of the build is setting this and resolve the issue
+# at the root.
+unset(CMAKE_CXX_STANDARD CACHE)
+
set(LLVM_COMMON_CMAKE_UTILS "${CMAKE_CURRENT_SOURCE_DIR}/../cmake")
# Add path for custom modules
@@ -502,11 +508,13 @@ remove_flags(-Wno-pedantic -pedantic-errors -pedantic)
# Required flags ==============================================================
function(cxx_add_basic_build_flags target)
- # Use C++23 for all targets.
- set_target_properties(${target} PROPERTIES
- CXX_STANDARD 23
- CXX_STANDARD_REQUIRED OFF # TODO: Make this REQUIRED once we don't need to accommodate the LLVM documentation builders using an ancient CMake
- CXX_EXTENSIONS NO)
+ # Use C++26 for all targets. Note that we don't use CXX_STANDARD or cxx_std_foo
+ # since that requires a newer CMake than is available.
+ if (LIBCXX_TARGETING_CLANG_CL)
+ target_compile_options(${target} PRIVATE /std:c++latest)
+ else()
+ target_compile_options(${target} PRIVATE -std=c++26)
+ endif()
# When building the dylib, don't warn for unavailable aligned allocation
# functions based on the deployment target -- they are always available
diff --git a/libcxx/include/__format/formatter_floating_point.h b/libcxx/include/__format/formatter_floating_point.h
index b200bc58baa21..f4de8b927651a 100644
--- a/libcxx/include/__format/formatter_floating_point.h
+++ b/libcxx/include/__format/formatter_floating_point.h
@@ -32,12 +32,12 @@
#include <__format/formatter_output.h>
#include <__format/parser_std_format_spec.h>
#include <__iterator/concepts.h>
+#include <__math/traits.h>
#include <__memory/allocator.h>
#include <__system_error/errc.h>
#include <__type_traits/conditional.h>
#include <__utility/move.h>
#include <__utility/unreachable.h>
-#include <cmath>
#if _LIBCPP_HAS_LOCALIZATION
# include <__locale>
@@ -637,10 +637,10 @@ _LIBCPP_HIDE_FROM_ABI auto __write_using_trailing_zeros(
template <floating_point _Tp, class _CharT, class _FormatContext>
_LIBCPP_HIDE_FROM_ABI typename _FormatContext::iterator
__format_floating_point(_Tp __value, _FormatContext& __ctx, __format_spec::__parsed_specifications<_CharT> __specs) {
- bool __negative = std::signbit(__value);
+ bool __negative = __math::signbit(__value);
- if (!std::isfinite(__value)) [[unlikely]]
- return __formatter::__format_floating_point_non_finite(__ctx.out(), __specs, __negative, std::isnan(__value));
+ if (!__math::isfinite(__value)) [[unlikely]]
+ return __formatter::__format_floating_point_non_finite(__ctx.out(), __specs, __negative, __math::isnan(__value));
// Depending on the std-format-spec string the sign and the value
// might not be outputted together:
diff --git a/libcxx/include/__functional/operations.h b/libcxx/include/__functional/operations.h
index c0e719bb581b6..7a80dce648fa1 100644
--- a/libcxx/include/__functional/operations.h
+++ b/libcxx/include/__functional/operations.h
@@ -379,8 +379,8 @@ struct less<void> {
typedef void is_transparent;
};
-template <class _Tp>
-struct __make_transparent<_Tp, less<_Tp> > {
+template <class _ArgumentType>
+struct __make_transparent<_ArgumentType, less<_ArgumentType> > {
using type _LIBCPP_NODEBUG = less<>;
};
@@ -477,8 +477,8 @@ struct greater<void> {
template <class _Tp, class _Up>
inline const bool __desugars_to_v<__greater_tag, greater<>, _Tp, _Up> = true;
-template <class _Tp>
-struct __make_transparent<_Tp, greater<_Tp>> {
+template <class _ArgumentType>
+struct __make_transparent<_ArgumentType, greater<_ArgumentType>> {
using type _LIBCPP_NODEBUG = greater<>;
};
diff --git a/libcxx/include/__type_traits/make_transparent.h b/libcxx/include/__type_traits/make_transparent.h
index c2edf126d4990..d3bca66841072 100644
--- a/libcxx/include/__type_traits/make_transparent.h
+++ b/libcxx/include/__type_traits/make_transparent.h
@@ -22,29 +22,30 @@ _LIBCPP_BEGIN_NAMESPACE_STD
// __make_transparent tries to create a transparent comparator from its non-transparent counterpart, e.g. obtain
// `less<>` from `less<T>`. This is useful in cases where conversions can be avoided (e.g. a string literal to a
-// std::string).
+// std::string). This depends on the argument type provided to the comparator, because a comparator might be
+// transparent for some argument types but not for others.
-template <class _Tp, class _Comparator>
+template <class _ArgumentType, class _Comparator>
struct __make_transparent {
using type _LIBCPP_NODEBUG = _Comparator;
};
-template <class _Tp, class _Comparator>
-using __make_transparent_t _LIBCPP_NODEBUG = typename __make_transparent<_Tp, _Comparator>::type;
+template <class _ArgumentType, class _Comparator>
+using __make_transparent_t _LIBCPP_NODEBUG = typename __make_transparent<_ArgumentType, _Comparator>::type;
-template <class _Tp,
+template <class _ArgumentType,
class _Comparator,
- __enable_if_t<is_same<_Comparator, __make_transparent_t<_Tp, _Comparator> >::value, int> = 0>
+ __enable_if_t<is_same<_Comparator, __make_transparent_t<_ArgumentType, _Comparator> >::value, int> = 0>
_LIBCPP_HIDE_FROM_ABI _Comparator& __as_transparent(_Comparator& __comp) {
return __comp;
}
-template <class _Tp,
+template <class _ArgumentType,
class _Comparator,
- __enable_if_t<!is_same<_Comparator, __make_transparent_t<_Tp, _Comparator> >::value, int> = 0>
-_LIBCPP_HIDE_FROM_ABI __make_transparent_t<_Tp, _Comparator> __as_transparent(_Comparator&) {
+ __enable_if_t<!is_same<_Comparator, __make_transparent_t<_ArgumentType, _Comparator> >::value, int> = 0>
+_LIBCPP_HIDE_FROM_ABI __make_transparent_t<_ArgumentType, _Comparator> __as_transparent(_Comparator&) {
static_assert(is_empty<_Comparator>::value);
- return __make_transparent_t<_Tp, _Comparator>();
+ return __make_transparent_t<_ArgumentType, _Comparator>();
}
_LIBCPP_END_NAMESPACE_STD
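For readers skimming the libc++ change above: the point of mapping `less<T>` to the transparent `less<>` is that heterogeneous lookups can then compare a string literal against a stored `std::string` without first materializing a temporary `std::string`. A minimal standalone sketch follows; the `make_transparent` name is a simplified stand-in for the internal `__make_transparent` trait, not the actual libc++ code:

```cpp
#include <cassert>
#include <functional>
#include <string>

// Simplified stand-in for __make_transparent: map less<T> to the transparent
// less<> so heterogeneous comparisons avoid building a temporary std::string.
template <class ArgumentType, class Comparator>
struct make_transparent { using type = Comparator; };

template <class ArgumentType>
struct make_transparent<ArgumentType, std::less<ArgumentType>> {
  using type = std::less<>; // transparent comparator
};

int main() {
  std::string key = "hello";
  make_transparent<std::string, std::less<std::string>>::type cmp;
  // No std::string is constructed from the literals here; std::less<> forwards
  // both operands to operator< on mixed std::string / const char* arguments.
  assert(!cmp(key, "hello"));
  assert(cmp("abc", key));
  return 0;
}
```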
diff --git a/libcxx/include/format b/libcxx/include/format
index f8f54df8352b8..2246f062040ae 100644
--- a/libcxx/include/format
+++ b/libcxx/include/format
@@ -233,12 +233,15 @@ namespace std {
# pragma GCC system_header
# endif
+# if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 23
+# include <cmath>
+# endif
+
# if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20
# include <array>
# include <cctype>
# include <cerrno>
# include <clocale>
-# include <cmath>
# include <cstddef>
# include <cstdint>
# include <cstdlib>
diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv
index 12716c7f496b1..253cf64703076 100644
--- a/libcxx/test/libcxx/transitive_includes/cxx26.csv
+++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv
@@ -354,7 +354,6 @@ format cctype
format cerrno
format climits
format clocale
-format cmath
format compare
format cstddef
format cstdint
@@ -508,7 +507,6 @@ iostream cctype
iostream cerrno
iostream climits
iostream clocale
-iostream cmath
iostream compare
iostream cstddef
iostream cstdint
@@ -694,7 +692,6 @@ ostream cctype
ostream cerrno
ostream climits
ostream clocale
-ostream cmath
ostream compare
ostream cstddef
ostream cstdint
@@ -724,7 +721,6 @@ print cctype
print cerrno
print climits
print clocale
-print cmath
print compare
print cstddef
print cstdint
@@ -995,7 +991,6 @@ syncstream cctype
syncstream cerrno
syncstream climits
syncstream clocale
-syncstream cmath
syncstream compare
syncstream cstddef
syncstream cstdint
diff --git a/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp b/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp
index 58cf2e0fe338b..62bad17a21600 100644
--- a/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp
+++ b/libcxx/test/std/atomics/atomics.ref/compare_exchange_weak.pass.cpp
@@ -33,11 +33,11 @@ struct TestCompareExchangeWeak {
std::atomic_ref<T> const a(x);
T t(T(1));
- std::same_as<bool> decltype(auto) y = a.compare_exchange_weak(t, T(2));
- assert(y == true);
+ while (!a.compare_exchange_weak(t, T(2))) {
+ }
assert(a == T(2));
assert(t == T(1));
- y = a.compare_exchange_weak(t, T(3));
+ std::same_as<bool> decltype(auto) y = a.compare_exchange_weak(t, T(3));
assert(y == false);
assert(a == T(2));
assert(t == T(2));
@@ -49,11 +49,11 @@ struct TestCompareExchangeWeak {
std::atomic_ref<T> const a(x);
T t(T(1));
- std::same_as<bool> decltype(auto) y = a.compare_exchange_weak(t, T(2), std::memory_order_seq_cst);
- assert(y == true);
+ while (!a.compare_exchange_weak(t, T(2), std::memory_order_seq_cst)) {
+ }
assert(a == T(2));
assert(t == T(1));
- y = a.compare_exchange_weak(t, T(3), std::memory_order_seq_cst);
+ std::same_as<bool> decltype(auto) y = a.compare_exchange_weak(t, T(3), std::memory_order_seq_cst);
assert(y == false);
assert(a == T(2));
assert(t == T(2));
@@ -65,12 +65,12 @@ struct TestCompareExchangeWeak {
std::atomic_ref<T> const a(x);
T t(T(1));
- std::same_as<bool> decltype(auto) y =
- a.compare_exchange_weak(t, T(2), std::memory_order_release, std::memory_order_relaxed);
- assert(y == true);
+ while (!a.compare_exchange_weak(t, T(2), std::memory_order_release, std::memory_order_relaxed)) {
+ }
assert(a == T(2));
assert(t == T(1));
- y = a.compare_exchange_weak(t, T(3), std::memory_order_release, std::memory_order_relaxed);
+ std::same_as<bool> decltype(auto) y =
+ a.compare_exchange_weak(t, T(3), std::memory_order_release, std::memory_order_relaxed);
assert(y == false);
assert(a == T(2));
assert(t == T(2));
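Background for the test change above: compare_exchange_weak is permitted to fail spuriously even when the stored value equals the expected value, so a single call cannot be asserted to succeed; the portable pattern is to retry in a loop. A failure against a genuinely stale expected value is still guaranteed to write the observed value back into `expected`. A small self-contained sketch of that contract:

```cpp
#include <atomic>
#include <cassert>

int main() {
  std::atomic<int> value(1);

  // Weak CAS may fail spuriously, so retry until it succeeds. On each failure,
  // `expected` is reloaded with the value actually observed (still 1 here).
  int expected = 1;
  while (!value.compare_exchange_weak(expected, 2)) {
  }
  assert(value.load() == 2);

  // A CAS with a stale expected value must fail and report the current value.
  int stale = 1;
  bool ok = value.compare_exchange_weak(stale, 3);
  assert(!ok && stale == 2);
  return 0;
}
```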
diff --git a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
index 521a60c0fc498..3663575e33e55 100644
--- a/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
+++ b/libcxx/test/tools/clang_tidy_checks/CMakeLists.txt
@@ -25,8 +25,6 @@ endif()
message(STATUS "Found system-installed LLVM ${LLVM_PACKAGE_VERSION} with headers in ${LLVM_INCLUDE_DIRS}")
-set(CMAKE_CXX_STANDARD 20)
-
# Link only against clangTidy itself, not anything that clangTidy uses; otherwise we run setup code multiple times
# which results in clang-tidy crashing
set_target_properties(clangTidy PROPERTIES INTERFACE_LINK_LIBRARIES "")
diff --git a/libcxx/utils/ci/docker/docker-compose.yml b/libcxx/utils/ci/docker/docker-compose.yml
index 6dd0881ac7b14..f4cb0b935a261 100644
--- a/libcxx/utils/ci/docker/docker-compose.yml
+++ b/libcxx/utils/ci/docker/docker-compose.yml
@@ -33,6 +33,6 @@ services:
dockerfile: libcxx/utils/ci/docker/android-builder.dockerfile
args:
BASE_IMAGE_VERSION: e9437d01e4a95c0752937b9a35121457b5835afa
- ANDROID_CLANG_VERSION: r563880
- ANDROID_CLANG_PREBUILTS_COMMIT: 6ae4184bb8706f9731569b9a0a82be3fcdcb951c
+ ANDROID_CLANG_VERSION: r584948b
+ ANDROID_CLANG_PREBUILTS_COMMIT: 2b062008b0a7be59ad85f012cfeee60f052808f1
ANDROID_SYSROOT_COMMIT: f8b85cc5262c6e5cbc9a92c1bab2b18b32a4c63f
diff --git a/libcxx/utils/ci/lnt/runners/README.md b/libcxx/utils/ci/lnt/runners/README.md
new file mode 100644
index 0000000000000..13668947b27ea
--- /dev/null
+++ b/libcxx/utils/ci/lnt/runners/README.md
@@ -0,0 +1,7 @@
+## Libc++ LNT runners
+
+This directory defines LNT runners for tracking libc++ performance. A runner can be started with
+
+```
+bash <(curl -Ls https://raw.githubusercontent.com/llvm/llvm-project/main/libcxx/utils/ci/lnt/runners/RUNNER) <ARGS...>
+```
diff --git a/libcxx/utils/ci/lnt/runners/apple-m5-clang21 b/libcxx/utils/ci/lnt/runners/apple-m5-clang21
new file mode 100755
index 0000000000000..a901bffda6d97
--- /dev/null
+++ b/libcxx/utils/ci/lnt/runners/apple-m5-clang21
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+
+#
+# Run on macOS against Homebrew Clang
+#
+
+if [ -z "${1}" ] || [ ! -d "${1}" ]; then
+ echo "usage: ${0} <path-to-llvm-monorepo>"
+ echo "error: Please provide a valid path to the LLVM monorepo."
+ exit 1
+fi
+
+MONOREPO_DIR=$(cd "${1}" && pwd)
+
+export SDKROOT=$(xcrun --show-sdk-path)
+
+while true; do
+ ${MONOREPO_DIR}/libcxx/utils/ci/lnt/commit-watch --git-repo ${MONOREPO_DIR} \
+ --lnt-url http://lnt.llvm.org --test-suite libcxx2 --machine apple-m5-clang21 | \
+ while read commit; do \
+ ${MONOREPO_DIR}/libcxx/utils/ci/lnt/run-benchmarks \
+ --test-suite-commit 0eefb2682bf8c04954c46e91916b5164d8424702 \
+ --git-repo ${MONOREPO_DIR} \
+ --lnt-url http://lnt.llvm.org \
+ --test-suite libcxx2 \
+ --machine apple-m5-clang21 \
+ --compiler $(brew --prefix)/opt/llvm/bin/clang++ \
+ --benchmark-commit ${commit}; \
+ done
+ sleep 60 # To avoid busy looping in case something goes really wrong
+done
diff --git a/libcxx/utils/ci/lnt/runners/apple-m5-xcode26 b/libcxx/utils/ci/lnt/runners/apple-m5-xcode26
new file mode 100755
index 0000000000000..98c664e0c9b43
--- /dev/null
+++ b/libcxx/utils/ci/lnt/runners/apple-m5-xcode26
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+#
+# Run on macOS against Xcode-provided Clang
+#
+
+if [ -z "${1}" ] || [ ! -d "${1}" ]; then
+ echo "usage: ${0} <path-to-llvm-monorepo>"
+ echo "error: Please provide a valid path to the LLVM monorepo."
+ exit 1
+fi
+
+MONOREPO_DIR=$(cd "${1}" && pwd)
+
+while true; do
+ ${MONOREPO_DIR}/libcxx/utils/ci/lnt/commit-watch --git-repo ${MONOREPO_DIR} \
+ --lnt-url http://lnt.llvm.org --test-suite libcxx2 --machine apple-m5-xcode26 | \
+ while read commit; do \
+ ${MONOREPO_DIR}/libcxx/utils/ci/lnt/run-benchmarks \
+ --test-suite-commit 0eefb2682bf8c04954c46e91916b5164d8424702 \
+ --git-repo ${MONOREPO_DIR} \
+ --lnt-url http://lnt.llvm.org \
+ --test-suite libcxx2 \
+ --machine apple-m5-xcode26 \
+ --compiler clang++ \
+ --benchmark-commit ${commit}; \
+ done
+ sleep 60 # To avoid busy looping in case something goes really wrong
+done
diff --git a/lld/ELF/Arch/LoongArch.cpp b/lld/ELF/Arch/LoongArch.cpp
index 90641de736c74..4c9ea3deece2d 100644
--- a/lld/ELF/Arch/LoongArch.cpp
+++ b/lld/ELF/Arch/LoongArch.cpp
@@ -1686,7 +1686,8 @@ bool LoongArch::relaxOnce(int pass) const {
if (!(osec->flags & SHF_EXECINSTR))
continue;
for (InputSection *sec : getInputSections(*osec, storage))
- changed |= relax(ctx, *sec);
+ if (sec->relaxAux)
+ changed |= relax(ctx, *sec);
}
return changed;
}
@@ -1698,6 +1699,8 @@ void LoongArch::finalizeRelax(int passes) const {
if (!(osec->flags & SHF_EXECINSTR))
continue;
for (InputSection *sec : getInputSections(*osec, storage)) {
+ if (!sec->relaxAux)
+ continue;
RelaxAux &aux = *sec->relaxAux;
if (!aux.relocDeltas)
continue;
diff --git a/lld/ELF/Arch/RISCV.cpp b/lld/ELF/Arch/RISCV.cpp
index 19baa6119f23c..0ded3bb859a37 100644
--- a/lld/ELF/Arch/RISCV.cpp
+++ b/lld/ELF/Arch/RISCV.cpp
@@ -834,6 +834,8 @@ void elf::initSymbolAnchors(Ctx &ctx) {
if (!(osec->flags & SHF_EXECINSTR))
continue;
for (InputSection *sec : getInputSections(*osec, storage)) {
+ if (isa<SyntheticSection>(sec))
+ continue;
sec->relaxAux = make<RelaxAux>();
if (sec->relocs().size()) {
sec->relaxAux->relocDeltas =
@@ -878,6 +880,8 @@ void elf::initSymbolAnchors(Ctx &ctx) {
if (!(osec->flags & SHF_EXECINSTR))
continue;
for (InputSection *sec : getInputSections(*osec, storage)) {
+ if (!sec->relaxAux)
+ continue;
llvm::sort(sec->relaxAux->anchors, [](auto &a, auto &b) {
return std::make_pair(a.offset, a.end) <
std::make_pair(b.offset, b.end);
@@ -1108,7 +1112,8 @@ bool RISCV::relaxOnce(int pass) const {
if (!(osec->flags & SHF_EXECINSTR))
continue;
for (InputSection *sec : getInputSections(*osec, storage))
- changed |= relax(ctx, pass, *sec);
+ if (sec->relaxAux)
+ changed |= relax(ctx, pass, *sec);
}
return changed;
}
@@ -1232,6 +1237,8 @@ void RISCV::finalizeRelax(int passes) const {
if (!(osec->flags & SHF_EXECINSTR))
continue;
for (InputSection *sec : getInputSections(*osec, storage)) {
+ if (!sec->relaxAux)
+ continue;
RelaxAux &aux = *sec->relaxAux;
if (!aux.relocDeltas)
continue;
diff --git a/lld/test/ELF/loongarch-relax-synthetic-in-text.s b/lld/test/ELF/loongarch-relax-synthetic-in-text.s
new file mode 100644
index 0000000000000..67c355e5a7595
--- /dev/null
+++ b/lld/test/ELF/loongarch-relax-synthetic-in-text.s
@@ -0,0 +1,31 @@
+# REQUIRES: loongarch
+# RUN: rm -rf %t && split-file %s %t
+# RUN: llvm-mc --filetype=obj -triple=loongarch64 -mattr=+relax %t/a.s -o %t/a.o
+
+## Do not crash when we encounter a synthetic section (like .got) that has
+## been placed inside an executable output section via a linker script.
+## Synthetic sections do not have relaxAux data structures initialized.
+
+# RUN: ld.lld -T %t/a.ld %t/a.o -o %t/a.out
+# RUN: llvm-objdump -s %t/a.out | FileCheck %s
+
+# CHECK: Contents of section .text:
+# CHECK-NEXT: 0400001a 8440c002 10000000 00000000
+
+#--- a.s
+.global _start
+_start:
+ pcalau12i $a0, %got_pc_hi20(sym)
+ ld.d $a0, $a0, %got_pc_lo12(sym)
+
+.data
+sym:
+ .word 0
+
+#--- a.ld
+SECTIONS {
+ .text : {
+ *(.text)
+ *(.got)
+ }
+}
diff --git a/lld/test/ELF/riscv-relax-synthetic-in-text.s b/lld/test/ELF/riscv-relax-synthetic-in-text.s
new file mode 100644
index 0000000000000..d892da3fdd3a4
--- /dev/null
+++ b/lld/test/ELF/riscv-relax-synthetic-in-text.s
@@ -0,0 +1,33 @@
+# REQUIRES: riscv
+# RUN: rm -rf %t && split-file %s %t
+# RUN: llvm-mc --filetype=obj -triple=riscv64 -mattr=+relax %t/a.s -o %t/a.o
+
+## Do not crash when we encounter a synthetic section (like .got) that has
+## been placed inside an executable output section via a linker script.
+## Synthetic sections do not have relaxAux data structures initialized.
+
+# RUN: ld.lld -T %t/a.ld %t/a.o -o %t/a.out
+# RUN: llvm-objdump -s %t/a.out | FileCheck %s
+
+# CHECK: Contents of section .text:
+# CHECK-NEXT: 17050000 03350501 00000000 00000000
+# CHECK-NEXT: 18000000 00000000
+
+#--- a.s
+.global _start
+_start:
+1:
+ auipc a0, %got_pcrel_hi(sym)
+ ld a0, %pcrel_lo(1b)(a0)
+
+.data
+sym:
+ .word 0
+
+#--- a.ld
+SECTIONS {
+ .text : {
+ *(.text)
+ *(.got)
+ }
+}
diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake
index 01738b29c7d8b..3f75bffab0078 100644
--- a/lldb/cmake/modules/LLDBConfig.cmake
+++ b/lldb/cmake/modules/LLDBConfig.cmake
@@ -166,15 +166,19 @@ endif()
if (APPLE)
set(default_enable_mte OFF)
- execute_process(
- COMMAND sysctl -n hw.optional.arm.FEAT_MTE4
- OUTPUT_VARIABLE SYSCTL_OUTPUT
- ERROR_QUIET
- RESULT_VARIABLE SYSCTL_RESULT
- OUTPUT_STRIP_TRAILING_WHITESPACE
- )
- if(SYSCTL_RESULT EQUAL 0 AND SYSCTL_OUTPUT STREQUAL "1")
- set(default_enable_mte ON)
+ # The MTE launcher complicates injecting the sanitizer runtime libraries.
+ # Default to OFF when any sanitizer is enabled.
+ if (NOT LLVM_USE_SANITIZER)
+ execute_process(
+ COMMAND sysctl -n hw.optional.arm.FEAT_MTE4
+ OUTPUT_VARIABLE SYSCTL_OUTPUT
+ ERROR_QUIET
+ RESULT_VARIABLE SYSCTL_RESULT
+ OUTPUT_STRIP_TRAILING_WHITESPACE
+ )
+ if(SYSCTL_RESULT EQUAL 0 AND SYSCTL_OUTPUT STREQUAL "1")
+ set(default_enable_mte ON)
+ endif()
endif()
option(LLDB_ENABLE_MTE "Run the LLDB test suite with MTE enabled." ${default_enable_mte})
@@ -182,6 +186,8 @@ if (APPLE)
if (LLDB_ENABLE_MTE)
message(STATUS "Running the LLDB test suite with MTE")
endif()
+else()
+ set(LLDB_ENABLE_MTE OFF)
endif()
if (LLDB_ENABLE_PYTHON)
diff --git a/lldb/examples/darwin/heap_find/heap.py b/lldb/examples/darwin/heap_find/heap.py
index e575be56b29c0..3ba349703c21d 100644
--- a/lldb/examples/darwin/heap_find/heap.py
+++ b/lldb/examples/darwin/heap_find/heap.py
@@ -32,6 +32,7 @@ def get_iterate_memory_expr(options, process, user_init_code, user_return_code):
typedef int kern_return_t;
#define KERN_SUCCESS 0
typedef void (*range_callback_t)(task_t, void *, unsigned, uintptr_t, uintptr_t);
+task_t task = 0;
"""
if options.search_vm_regions:
expr += """
@@ -130,7 +131,7 @@ def get_iterate_memory_expr(options, process, user_init_code, user_return_code):
return KERN_SUCCESS;
};
vm_address_t *zones = 0;
-unsigned int num_zones = 0;task_t task = 0;
+unsigned int num_zones = 0;
kern_return_t err = (kern_return_t)malloc_get_all_zones (task, task_peek, &zones, &num_zones);
if (KERN_SUCCESS == err)
{
@@ -1305,7 +1306,24 @@ def get_sections_ranges_struct(process):
base = section.GetLoadAddress(target)
size = section.GetByteSize()
if base != lldb.LLDB_INVALID_ADDRESS and size > 0:
- segment_dicts.append({"base": base, "size": size})
+ # Walk VM regions across the section and only include
+ # readable portions, since runtime permissions may
+ # differ from the Mach-O section permissions.
+ addr = base
+ end = base + size
+ while addr < end:
+ region_info = lldb.SBMemoryRegionInfo()
+ if not process.GetMemoryRegionInfo(addr, region_info).Success():
+ break
+ region_end = region_info.GetRegionEnd()
+ if region_end <= addr:
+ break
+ chunk_end = min(region_end, end)
+ if region_info.IsReadable():
+ segment_dicts.append(
+ {"base": addr, "size": chunk_end - addr}
+ )
+ addr = chunk_end
segment_dicts_len = len(segment_dicts)
if segment_dicts_len > 0:
result = """
diff --git a/lldb/include/lldb/Core/Module.h b/lldb/include/lldb/Core/Module.h
index f46b3a13f4f18..ea875ac68cde2 100644
--- a/lldb/include/lldb/Core/Module.h
+++ b/lldb/include/lldb/Core/Module.h
@@ -510,8 +510,7 @@ class Module : public std::enable_shared_from_this<Module>,
/// \b true if it is, \b false otherwise.
bool IsLoadedInTarget(Target *target);
- bool LoadScriptingResourceInTarget(Target *target, Status &error,
- Stream &feedback_stream);
+ bool LoadScriptingResourceInTarget(Target *target, Status &error);
/// Get the number of compile units for this module.
///
diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h
index 740b986444a93..e147eeac61952 100644
--- a/lldb/include/lldb/Core/ModuleList.h
+++ b/lldb/include/lldb/Core/ModuleList.h
@@ -488,7 +488,6 @@ class ModuleList {
bool IsEmpty() const { return !GetSize(); }
bool LoadScriptingResourcesInTarget(Target *target, std::list<Status> &errors,
- Stream &feedback_stream,
bool continue_on_error = true);
static ModuleListProperties &GetGlobalModuleListProperties();
diff --git a/lldb/include/lldb/Host/Config.h.cmake b/lldb/include/lldb/Host/Config.h.cmake
index f165d99830d8d..06ab9b9a7775c 100644
--- a/lldb/include/lldb/Host/Config.h.cmake
+++ b/lldb/include/lldb/Host/Config.h.cmake
@@ -51,6 +51,8 @@
#cmakedefine01 LLDB_ENABLE_TREESITTER
+#cmakedefine01 LLDB_ENABLE_MTE
+
#cmakedefine LLDB_PYTHON_HOME R"(${LLDB_PYTHON_HOME})"
#define LLDB_INSTALL_LIBDIR_BASENAME "${LLDB_INSTALL_LIBDIR_BASENAME}"
diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h
index 4f5b022765f9e..7907ed2f1a5f6 100644
--- a/lldb/include/lldb/Target/Target.h
+++ b/lldb/include/lldb/Target/Target.h
@@ -1115,10 +1115,9 @@ class Target : public std::enable_shared_from_this<Target>,
LoadDependentFiles load_dependent_files = eLoadDependentsDefault);
bool LoadScriptingResources(std::list<Status> &errors,
- Stream &feedback_stream,
bool continue_on_error = true) {
- return m_images.LoadScriptingResourcesInTarget(
- this, errors, feedback_stream, continue_on_error);
+ return m_images.LoadScriptingResourcesInTarget(this, errors,
+ continue_on_error);
}
/// Get accessor for the images for this process.
diff --git a/lldb/include/lldb/ValueObject/DILEval.h b/lldb/include/lldb/ValueObject/DILEval.h
index a6223c4d998ab..da4479036c175 100644
--- a/lldb/include/lldb/ValueObject/DILEval.h
+++ b/lldb/include/lldb/ValueObject/DILEval.h
@@ -89,6 +89,13 @@ class Interpreter : Visitor {
llvm::Expected<CompilerType> ArithmeticConversion(lldb::ValueObjectSP &lhs,
lldb::ValueObjectSP &rhs,
uint32_t location);
+  /// Add the offset to or subtract it from the pointer, scaled by the
+  /// pointee type's byte size.
+  /// \returns A new `ValueObject` with a new pointer value.
+ llvm::Expected<lldb::ValueObjectSP> PointerOffset(lldb::ValueObjectSP ptr,
+ lldb::ValueObjectSP offset,
+ BinaryOpKind operation,
+ uint32_t location);
llvm::Expected<lldb::ValueObjectSP> EvaluateScalarOp(BinaryOpKind kind,
lldb::ValueObjectSP lhs,
lldb::ValueObjectSP rhs,
diff --git a/lldb/packages/Python/lldbsuite/test/configuration.py b/lldb/packages/Python/lldbsuite/test/configuration.py
index f96fd31b17a37..002d775594ff5 100644
--- a/lldb/packages/Python/lldbsuite/test/configuration.py
+++ b/lldb/packages/Python/lldbsuite/test/configuration.py
@@ -147,6 +147,9 @@
# A plugin whose tests will be enabled, like intel-pt.
enabled_plugins = []
+# Whether MTE (Memory Tagging Extension) is enabled.
+mte_enabled = False
+
# the build type of lldb
# Typical values include Debug, Release, RelWithDebInfo and MinSizeRel
cmake_build_type = None
diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py
index 2c62ba03f39bc..cb3aa1d6af115 100644
--- a/lldb/packages/Python/lldbsuite/test/decorators.py
+++ b/lldb/packages/Python/lldbsuite/test/decorators.py
@@ -1145,6 +1145,17 @@ def is_running_under_asan():
return None
+def is_running_under_mte():
+ if configuration.mte_enabled:
+ return "MTE unsupported"
+ return None
+
+
+def skipIfMTE(func):
+ """Skip this test when running with MTE (Memory Tagging Extension) enabled."""
+ return skipTestIfFn(is_running_under_mte)(func)
+
+
def skipUnlessAddressSanitizer(func):
"""Decorate the item to skip test unless Clang -fsanitize=thread is supported."""
@@ -1293,3 +1304,33 @@ def skipIfBuildType(types: list[str]):
and configuration.cmake_build_type.lower() in types,
"skip on {} build type(s)".format(", ".join(types)),
)
+
+
+def skipUnlessArm64eSupported(func):
+ """Decorate the item to skip test unless Clang can target arm64e."""
+
+ def can_build_and_run_arm64e():
+ arch = lldbplatformutil.getArchitecture()
+
+ # We need to be running the test suite for arm64 or arm64e. If we're
+ # running the whole test suite as arm64e, we don't need any additional
+ # checks.
+ if arch == "arm64e":
+ return None
+ elif arch != "arm64":
+ return "Not targeting arm64"
+
+ # Need at least macOS Tahoe (26) to run arm64e binaries.
+ if platform.mac_ver()[0] == "" or _check_expected_version(
+ "<", "26.0", platform.mac_ver()[0]
+ ):
+ return "Host cannot run arm64e binaries"
+
+ # Need a compiler that can target arm64e.
+ compiler_path = lldbplatformutil.getCompiler()
+ if not _compiler_supports(compiler_path, "-arch arm64e"):
+ return "Compiler cannot target arm64e"
+
+ return None
+
+ return skipTestIfFn(can_build_and_run_arm64e)(func)
diff --git a/lldb/packages/Python/lldbsuite/test/dotest.py b/lldb/packages/Python/lldbsuite/test/dotest.py
index 533be0a065e3a..2bfef9c83aed8 100644
--- a/lldb/packages/Python/lldbsuite/test/dotest.py
+++ b/lldb/packages/Python/lldbsuite/test/dotest.py
@@ -469,6 +469,9 @@ def parseOptionsAndInitTestdirs():
if args.enabled_plugins:
configuration.enabled_plugins = args.enabled_plugins
+ if args.enable_mte:
+ configuration.mte_enabled = True
+
# Gather all the dirs passed on the command line.
if len(args.args) > 0:
configuration.testdirs = [
diff --git a/lldb/packages/Python/lldbsuite/test/dotest_args.py b/lldb/packages/Python/lldbsuite/test/dotest_args.py
index fce9e41cb5385..8f4b623c4b1c9 100644
--- a/lldb/packages/Python/lldbsuite/test/dotest_args.py
+++ b/lldb/packages/Python/lldbsuite/test/dotest_args.py
@@ -279,6 +279,12 @@ def create_parser():
metavar="A plugin whose tests will be enabled",
help="A plugin whose tests will be enabled. The only currently supported plugin is intel-pt.",
)
+ group.add_argument(
+ "--enable-mte",
+ dest="enable_mte",
+ action="store_true",
+ help="Indicate that the test suite is running with MTE (Memory Tagging Extension) enabled.",
+ )
# Configuration options
group = parser.add_argument_group("Remote platform options")
diff --git a/lldb/packages/Python/lldbsuite/test/lldbplatformutil.py b/lldb/packages/Python/lldbsuite/test/lldbplatformutil.py
index 9061327df8ad2..0d609dde6fd2a 100644
--- a/lldb/packages/Python/lldbsuite/test/lldbplatformutil.py
+++ b/lldb/packages/Python/lldbsuite/test/lldbplatformutil.py
@@ -166,11 +166,27 @@ def platformIsDarwin():
return getPlatform() in getDarwinOSTriples()
+def getDarwinEmbeddedKernelVersion():
+ """Returns the major kernel version of the remote device via 'uname -r'."""
+ shell_cmd = lldb.SBPlatformShellCommand("uname -r")
+ err = lldb.selected_platform.Run(shell_cmd)
+ if err.Success():
+ output = shell_cmd.GetOutput()
+ if output:
+ try:
+ return int(output.strip().split(".")[0])
+ except ValueError:
+ pass
+ return 0
+
+
def findMainThreadCheckerDylib():
if not platformIsDarwin():
return ""
if getPlatform() in lldbplatform.translate(lldbplatform.darwin_embedded):
+ if getDarwinEmbeddedKernelVersion() >= 26:
+ return "/usr/lib/libMainThreadChecker.dylib"
return "/Developer/usr/lib/libMainThreadChecker.dylib"
with os.popen("xcode-select -p") as output:
diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp
index d4b0b29390f92..e4f0679a7d884 100644
--- a/lldb/source/Commands/CommandObjectTarget.cpp
+++ b/lldb/source/Commands/CommandObjectTarget.cpp
@@ -4402,9 +4402,7 @@ class CommandObjectTargetSymbolsAdd : public CommandObjectParsed {
// Make sure we load any scripting resources that may be embedded
// in the debug info files in case the platform supports that.
Status error;
- StreamString feedback_stream;
- module_sp->LoadScriptingResourceInTarget(target, error,
- feedback_stream);
+ module_sp->LoadScriptingResourceInTarget(target, error);
if (error.Fail() && error.AsCString())
result.AppendWarningWithFormat(
"unable to load scripting data for module %s - error "
@@ -4413,8 +4411,6 @@ class CommandObjectTargetSymbolsAdd : public CommandObjectParsed {
.GetFileNameStrippingExtension()
.GetCString(),
error.AsCString());
- else if (feedback_stream.GetSize())
- result.AppendWarning(feedback_stream.GetData());
flush = true;
result.SetStatus(eReturnStatusSuccessFinishResult);
diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp
index d9cde68b7d5f0..fc099891443df 100644
--- a/lldb/source/Core/Debugger.cpp
+++ b/lldb/source/Core/Debugger.cpp
@@ -307,13 +307,10 @@ Status Debugger::SetPropertyValue(const ExecutionContext *exe_ctx,
if (target_sp->TargetProperties::GetLoadScriptFromSymbolFile() ==
eLoadScriptFromSymFileTrue) {
std::list<Status> errors;
- StreamString feedback_stream;
- if (!target_sp->LoadScriptingResources(errors, feedback_stream)) {
+ if (!target_sp->LoadScriptingResources(errors)) {
lldb::StreamUP s = GetAsyncErrorStream();
for (auto &error : errors)
s->Printf("%s\n", error.AsCString());
- if (feedback_stream.GetSize())
- s->PutCString(feedback_stream.GetString());
}
}
}
diff --git a/lldb/source/Core/Module.cpp b/lldb/source/Core/Module.cpp
index 23921925a987b..79618ab91e27a 100644
--- a/lldb/source/Core/Module.cpp
+++ b/lldb/source/Core/Module.cpp
@@ -1423,8 +1423,7 @@ bool Module::IsLoadedInTarget(Target *target) {
return false;
}
-bool Module::LoadScriptingResourceInTarget(Target *target, Status &error,
- Stream &feedback_stream) {
+bool Module::LoadScriptingResourceInTarget(Target *target, Status &error) {
if (!target) {
error = Status::FromErrorString("invalid destination Target");
return false;
@@ -1448,9 +1447,13 @@ bool Module::LoadScriptingResourceInTarget(Target *target, Status &error,
return false;
}
+ StreamString feedback_stream;
FileSpecList file_specs = platform_sp->LocateExecutableScriptingResources(
target, *this, feedback_stream);
+ if (!feedback_stream.Empty())
+ debugger.ReportWarning(feedback_stream.GetString().str(), debugger.GetID());
+
const uint32_t num_specs = file_specs.GetSize();
if (num_specs == 0)
return true;
@@ -1467,8 +1470,10 @@ bool Module::LoadScriptingResourceInTarget(Target *target, Status &error,
continue;
if (should_load == eLoadScriptFromSymFileWarn) {
- feedback_stream.Format(R"(
-warning: '{0}' contains a debug script. To run this script in this debug session:
+ // clang-format off
+ debugger.ReportWarning(
+ llvm::formatv(
+R"('{0}' contains a debug script. To run this script in this debug session:
command script import "{1}"
@@ -1476,8 +1481,10 @@ To run all discovered debug scripts in this session:
settings set target.load-script-from-symbol-file true
)",
- GetFileSpec().GetFileNameStrippingExtension(),
- scripting_fspec.GetPath());
+ GetFileSpec().GetFileNameStrippingExtension(),
+ scripting_fspec.GetPath()),
+ debugger.GetID());
+ // clang-format on
return false;
}
diff --git a/lldb/source/Core/ModuleList.cpp b/lldb/source/Core/ModuleList.cpp
index 898b0a2efb6a4..608d13c2a9fe3 100644
--- a/lldb/source/Core/ModuleList.cpp
+++ b/lldb/source/Core/ModuleList.cpp
@@ -1335,7 +1335,6 @@ bool ModuleList::RemoveSharedModuleIfOrphaned(const ModuleWP module_wp) {
bool ModuleList::LoadScriptingResourcesInTarget(Target *target,
std::list<Status> &errors,
- Stream &feedback_stream,
bool continue_on_error) {
if (!target)
return false;
@@ -1349,8 +1348,7 @@ bool ModuleList::LoadScriptingResourcesInTarget(Target *target,
for (auto module : tmp_module_list.ModulesNoLocking()) {
if (module) {
Status error;
- if (!module->LoadScriptingResourceInTarget(target, error,
- feedback_stream)) {
+ if (!module->LoadScriptingResourceInTarget(target, error)) {
if (error.Fail() && error.AsCString()) {
error = Status::FromErrorStringWithFormat(
"unable to load scripting data for "
diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
index 3a17b4c46a788..cec47d96b33d2 100644
--- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
+++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp
@@ -1108,6 +1108,27 @@ std::optional<FileSpec> ObjectFilePECOFF::GetDebugLink() {
return std::nullopt;
}
+std::optional<FileSpec> ObjectFilePECOFF::GetPDBPath() {
+ llvm::StringRef pdb_file;
+ const llvm::codeview::DebugInfo *pdb_info = nullptr;
+ if (llvm::Error Err = m_binary->getDebugPDBInfo(pdb_info, pdb_file)) {
+ // DebugInfo section is corrupt.
+ Log *log = GetLog(LLDBLog::Object);
+ llvm::StringRef file = m_binary->getFileName();
+ LLDB_LOG_ERROR(
+ log, std::move(Err),
+ "Failed to read Codeview record for PDB debug info file ({1}): {0}",
+ file);
+ return std::nullopt;
+ }
+ if (pdb_file.empty()) {
+ // No DebugInfo section present.
+ return std::nullopt;
+ }
+ return FileSpec(pdb_file, FileSpec::GuessPathStyle(pdb_file).value_or(
+ FileSpec::Style::native));
+}
+
uint32_t ObjectFilePECOFF::ParseDependentModules() {
ModuleSP module_sp(GetModule());
if (!module_sp)
diff --git a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h
index 8002e70e604bb..30bd672dc68f8 100644
--- a/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h
+++ b/lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.h
@@ -130,6 +130,8 @@ class ObjectFilePECOFF : public lldb_private::ObjectFile {
/// contains it.
std::optional<lldb_private::FileSpec> GetDebugLink();
+ std::optional<lldb_private::FileSpec> GetPDBPath();
+
uint32_t GetDependentModules(lldb_private::FileSpecList &files) override;
lldb_private::Address GetEntryPointAddress() override;
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
index 36c9a8a26825e..2e5b458ffe297 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp
@@ -106,14 +106,14 @@ class SanitizedScriptingModuleName {
if (script_interpreter &&
script_interpreter->IsReservedWord(m_sanitized_name.c_str())) {
+ m_conflicting_keyword = m_sanitized_name;
m_sanitized_name.insert(m_sanitized_name.begin(), '_');
- m_name_is_keyword = true;
}
}
/// Returns \c true if this name is a keyword in the associated scripting
/// language.
- bool IsKeyword() const { return m_name_is_keyword; }
+ bool IsKeyword() const { return !m_conflicting_keyword.empty(); }
/// Returns \c true if the original name has been sanitized (i.e., required
/// changes).
@@ -123,14 +123,52 @@ class SanitizedScriptingModuleName {
llvm::StringRef GetSanitizedName() const { return m_sanitized_name; }
llvm::StringRef GetOriginalName() const { return m_original_name; }
+ llvm::StringRef GetConflictingKeyword() const {
+ return m_conflicting_keyword;
+ }
+
+ /// If we did some replacements of reserved characters, and a
+ /// file with the untampered name exists, then warn the user
+ /// that the file as-is shall not be loaded.
+ void WarnIfInvalidUnsanitizedScriptExists(Stream &os,
+ const FileSpec &original_fspec,
+ const FileSpec &fspec) const {
+ if (!RequiredSanitization())
+ return;
+
+ // Path to unsanitized script name doesn't exist. Nothing to warn about.
+ if (!FileSystem::Instance().Exists(original_fspec))
+ return;
+
+ std::string reason_for_complaint =
+ IsKeyword() ? llvm::formatv("conflicts with the keyword '{0}'",
+ GetConflictingKeyword())
+ .str()
+ : "contains reserved characters";
+
+ if (FileSystem::Instance().Exists(fspec))
+ os.Format(
+ "debug script '{0}' cannot be loaded because '{1}' {2}. "
+ "Ignoring '{1}' and loading '{3}' instead.\n",
+ original_fspec.GetPath(), original_fspec.GetFilename(),
+ std::move(reason_for_complaint), fspec.GetFilename());
+ else
+ os.Format(
+ "debug script '{0}' cannot be loaded because '{1}' {2}. "
+ "If you intend to have this script loaded, please rename it to "
+ "'{3}' and retry.\n",
+ original_fspec.GetPath(), original_fspec.GetFilename(),
+ std::move(reason_for_complaint), fspec.GetFilename());
+ }
private:
llvm::StringRef m_original_name;
std::string m_sanitized_name;
- /// \c true if m_sanitized_name is a keyword for the ScriptInterpreter
- /// language associated with this SanitizedScriptingModuleName.
- bool m_name_is_keyword = false;
+  /// If m_sanitized_name conflicts with a keyword for the ScriptInterpreter
+  /// language associated with this SanitizedScriptingModuleName, this is set
+  /// to the conflicting keyword. Empty otherwise.
+ std::string m_conflicting_keyword;
};
} // namespace
@@ -277,34 +315,8 @@ FileSpecList PlatformDarwin::LocateExecutableScriptingResourcesFromDSYM(
FileSpec orig_script_fspec(original_path_string.GetString());
FileSystem::Instance().Resolve(orig_script_fspec);
- // if we did some replacements of reserved characters, and a
- // file with the untampered name exists, then warn the user
- // that the file as-is shall not be loaded
- if (sanitized_name.RequiredSanitization() &&
- FileSystem::Instance().Exists(orig_script_fspec)) {
- const char *reason_for_complaint = sanitized_name.IsKeyword()
- ? "conflicts with a keyword"
- : "contains reserved characters";
- if (FileSystem::Instance().Exists(script_fspec))
- feedback_stream.Format(
- "warning: the symbol file '{0}' contains a debug "
- "script. However, its name"
- " '{1}' {2} and as such cannot be loaded. LLDB will"
- " load '{3}' instead. Consider removing the file with "
- "the malformed name to"
- " eliminate this warning.\n",
- symfile_spec.GetPath(), original_path_string.GetString(),
- reason_for_complaint, path_string.GetString());
- else
- feedback_stream.Format(
- "warning: the symbol file '{0}' contains a debug "
- "script. However, its name"
- " {1} and as such cannot be loaded. If you intend"
- " to have this script loaded, please rename '{2}' to "
- "'{3}' and retry.\n",
- symfile_spec.GetPath(), reason_for_complaint,
- original_path_string.GetString(), path_string.GetString());
- }
+ sanitized_name.WarnIfInvalidUnsanitizedScriptExists(
+ feedback_stream, orig_script_fspec, script_fspec);
if (FileSystem::Instance().Exists(script_fspec)) {
file_list.Append(script_fspec);
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
index de8be43aa35fd..3de30713ad760 100644
--- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
+++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp
@@ -25,6 +25,7 @@
#include "lldb/Core/PluginManager.h"
#include "lldb/Core/ThreadedCommunication.h"
#include "lldb/DataFormatters/TypeSummary.h"
+#include "lldb/Host/Config.h"
#include "lldb/Host/FileSystem.h"
#include "lldb/Host/HostInfo.h"
#include "lldb/Host/Pipe.h"
@@ -49,6 +50,7 @@
#include <cstdlib>
#include <memory>
#include <optional>
+#include <stdlib.h>
#include <string>
using namespace lldb;
@@ -289,6 +291,13 @@ llvm::StringRef ScriptInterpreterPython::GetPluginDescriptionStatic() {
}
void ScriptInterpreterPython::Initialize() {
+#if LLDB_ENABLE_MTE
+ // Python's allocator (pymalloc) is not aware of Memory Tagging Extension
+ // (MTE) and crashes.
+ // https://bugs.python.org/issue43593
+ setenv("PYTHONMALLOC", "malloc", /*overwrite=*/true);
+#endif
+
HostInfo::SetSharedLibraryDirectoryHelper(
ScriptInterpreterPython::SharedLibraryDirectoryHelper);
PluginManager::RegisterPlugin(
diff --git a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp
index 22c4f19242c4d..53fcfeccc1270 100644
--- a/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp
+++ b/lldb/source/Plugins/SymbolFile/NativePDB/SymbolFileNativePDB.cpp
@@ -169,8 +169,6 @@ loadMatchingPDBFile(std::string exe_path, llvm::BumpPtrAllocator &allocator) {
if (expected_info->getGuid() != guid)
return nullptr;
- LLDB_LOG(GetLog(LLDBLog::Symbols), "Loading {0} for {1}", pdb->getFilePath(),
- exe_path);
return pdb;
}
@@ -398,6 +396,11 @@ uint32_t SymbolFileNativePDB::CalculateAbilities() {
if (!pdb_file)
return 0;
+ LLDB_LOG(
+ GetLog(LLDBLog::Symbols), "Loading {0} for {1}",
+ pdb_file->getFilePath(),
+ m_objfile_sp->GetModule()->GetObjectFile()->GetFileSpec().GetPath());
+
auto expected_index = PdbIndex::create(pdb_file);
if (!expected_index) {
llvm::consumeError(expected_index.takeError());
diff --git a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp
index e35195fec2efc..26b89eaefd37a 100644
--- a/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp
+++ b/lldb/source/Plugins/SymbolFile/PDB/SymbolFilePDB.cpp
@@ -239,7 +239,8 @@ uint32_t SymbolFilePDB::CalculateAbilities() {
if (!m_session_up) {
// Lazily load and match the PDB file, but only do this once.
- std::string exePath = m_objfile_sp->GetFileSpec().GetPath();
+ std::string exePath =
+ m_objfile_sp->GetModule()->GetObjectFile()->GetFileSpec().GetPath();
auto error = loadDataForEXE(PDB_ReaderType::DIA, llvm::StringRef(exePath),
m_session_up);
if (error) {
diff --git a/lldb/source/Plugins/SymbolLocator/CMakeLists.txt b/lldb/source/Plugins/SymbolLocator/CMakeLists.txt
index 3b466f71dca58..9b9ec470b86a9 100644
--- a/lldb/source/Plugins/SymbolLocator/CMakeLists.txt
+++ b/lldb/source/Plugins/SymbolLocator/CMakeLists.txt
@@ -6,6 +6,7 @@ set_property(DIRECTORY PROPERTY LLDB_PLUGIN_KIND SymbolLocator)
# prevents an unstripped binary from being requested from the Debuginfod
# provider.
add_subdirectory(Debuginfod)
+add_subdirectory(SymStore)
add_subdirectory(Default)
if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
add_subdirectory(DebugSymbols)
diff --git a/lldb/source/Plugins/SymbolLocator/SymStore/CMakeLists.txt b/lldb/source/Plugins/SymbolLocator/SymStore/CMakeLists.txt
new file mode 100644
index 0000000000000..b0da27f26c6a8
--- /dev/null
+++ b/lldb/source/Plugins/SymbolLocator/SymStore/CMakeLists.txt
@@ -0,0 +1,20 @@
+lldb_tablegen(SymbolLocatorSymStoreProperties.inc -gen-lldb-property-defs
+ SOURCE SymbolLocatorSymStoreProperties.td
+ TARGET LLDBPluginSymbolLocatorSymStorePropertiesGen)
+
+lldb_tablegen(SymbolLocatorSymStorePropertiesEnum.inc -gen-lldb-property-enum-defs
+ SOURCE SymbolLocatorSymStoreProperties.td
+ TARGET LLDBPluginSymbolLocatorSymStorePropertiesEnumGen)
+
+add_lldb_library(lldbPluginSymbolLocatorSymStore PLUGIN
+ SymbolLocatorSymStore.cpp
+
+ LINK_LIBS
+ lldbCore
+ lldbHost
+ lldbSymbol
+ )
+
+add_dependencies(lldbPluginSymbolLocatorSymStore
+ LLDBPluginSymbolLocatorSymStorePropertiesGen
+ LLDBPluginSymbolLocatorSymStorePropertiesEnumGen)
diff --git a/lldb/source/Plugins/SymbolLocator/SymStore/SymbolLocatorSymStore.cpp b/lldb/source/Plugins/SymbolLocator/SymStore/SymbolLocatorSymStore.cpp
new file mode 100644
index 0000000000000..d008a7d3e8e9a
--- /dev/null
+++ b/lldb/source/Plugins/SymbolLocator/SymStore/SymbolLocatorSymStore.cpp
@@ -0,0 +1,147 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SymbolLocatorSymStore.h"
+
+#include "lldb/Core/ModuleList.h"
+#include "lldb/Core/PluginManager.h"
+#include "lldb/Host/FileSystem.h"
+#include "lldb/Interpreter/OptionValueString.h"
+#include "lldb/Utility/Args.h"
+#include "lldb/Utility/LLDBLog.h"
+#include "lldb/Utility/Log.h"
+#include "lldb/Utility/UUID.h"
+
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+
+using namespace lldb;
+using namespace lldb_private;
+
+LLDB_PLUGIN_DEFINE(SymbolLocatorSymStore)
+
+namespace {
+
+#define LLDB_PROPERTIES_symbollocatorsymstore
+#include "SymbolLocatorSymStoreProperties.inc"
+
+enum {
+#define LLDB_PROPERTIES_symbollocatorsymstore
+#include "SymbolLocatorSymStorePropertiesEnum.inc"
+};
+
+class PluginProperties : public Properties {
+public:
+ static llvm::StringRef GetSettingName() {
+ return SymbolLocatorSymStore::GetPluginNameStatic();
+ }
+
+ PluginProperties() {
+ m_collection_sp = std::make_shared<OptionValueProperties>(GetSettingName());
+ m_collection_sp->Initialize(g_symbollocatorsymstore_properties_def);
+ }
+
+ Args GetURLs() const {
+ Args urls;
+ m_collection_sp->GetPropertyAtIndexAsArgs(ePropertySymStoreURLs, urls);
+ return urls;
+ }
+};
+
+} // namespace
+
+static PluginProperties &GetGlobalPluginProperties() {
+ static PluginProperties g_settings;
+ return g_settings;
+}
+
+SymbolLocatorSymStore::SymbolLocatorSymStore() : SymbolLocator() {}
+
+void SymbolLocatorSymStore::Initialize() {
+ // First version can only locate PDB in local SymStore (no download yet).
+ PluginManager::RegisterPlugin(
+ GetPluginNameStatic(), GetPluginDescriptionStatic(), CreateInstance,
+ nullptr, LocateExecutableSymbolFile, nullptr, nullptr,
+ SymbolLocatorSymStore::DebuggerInitialize);
+}
+
+void SymbolLocatorSymStore::DebuggerInitialize(Debugger &debugger) {
+ if (!PluginManager::GetSettingForSymbolLocatorPlugin(
+ debugger, PluginProperties::GetSettingName())) {
+ constexpr bool is_global_setting = true;
+ PluginManager::CreateSettingForSymbolLocatorPlugin(
+ debugger, GetGlobalPluginProperties().GetValueProperties(),
+ "Properties for the SymStore Symbol Locator plug-in.",
+ is_global_setting);
+ }
+}
+
+void SymbolLocatorSymStore::Terminate() {
+ PluginManager::UnregisterPlugin(CreateInstance);
+}
+
+llvm::StringRef SymbolLocatorSymStore::GetPluginDescriptionStatic() {
+ return "Symbol locator for PDB in SymStore";
+}
+
+SymbolLocator *SymbolLocatorSymStore::CreateInstance() {
+ return new SymbolLocatorSymStore();
+}
+
+// RSDS entries store identity as a 20-byte UUID composed of 16-byte GUID and
+// 4-byte age:
+// 12345678-1234-5678-9ABC-DEF012345678-00000001
+//
+// SymStore key is a string with no separators and age as decimal:
+// 12345678123456789ABCDEF0123456781
+//
+static std::string formatSymStoreKey(const UUID &uuid) {
+ llvm::ArrayRef<uint8_t> bytes = uuid.GetBytes();
+ uint32_t age = llvm::support::endian::read32be(bytes.data() + 16);
+ constexpr bool LowerCase = false;
+ return llvm::toHex(bytes.slice(0, 16), LowerCase) + std::to_string(age);
+}
+
+std::optional<FileSpec> SymbolLocatorSymStore::LocateExecutableSymbolFile(
+ const ModuleSpec &module_spec, const FileSpecList &default_search_paths) {
+ const UUID &uuid = module_spec.GetUUID();
+ if (!uuid.IsValid() ||
+ !ModuleList::GetGlobalModuleListProperties().GetEnableExternalLookup())
+ return {};
+
+ Log *log = GetLog(LLDBLog::Symbols);
+ std::string pdb_name =
+ module_spec.GetSymbolFileSpec().GetFilename().GetStringRef().str();
+ if (pdb_name.empty()) {
+ LLDB_LOGV(log, "Failed to resolve symbol PDB module: PDB name empty");
+ return {};
+ }
+
+ LLDB_LOGV(log, "LocateExecutableSymbolFile {0} with UUID {1}", pdb_name,
+ uuid.GetAsString());
+ if (uuid.GetBytes().size() != 20) {
+ LLDB_LOGV(log, "Failed to resolve symbol PDB module: UUID invalid");
+ return {};
+ }
+
+ std::string key = formatSymStoreKey(uuid);
+ Args sym_store_urls = GetGlobalPluginProperties().GetURLs();
+ for (const Args::ArgEntry &url : sym_store_urls) {
+ llvm::SmallString<256> path;
+ llvm::sys::path::append(path, url.ref(), pdb_name, key, pdb_name);
+ FileSpec spec(path);
+ if (FileSystem::Instance().Exists(spec)) {
+ LLDB_LOGV(log, "Found {0} in SymStore {1}", pdb_name, url.ref());
+ return spec;
+ }
+ }
+
+ return {};
+}
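To make the key layout documented next to formatSymStoreKey above concrete, here is a small standalone sketch using the example identity from that comment. The makeSymStoreKey helper is hypothetical and only mirrors the documented format; it is not the plugin code:

```cpp
#include <cstdint>
#include <iostream>
#include <string>

// Build a SymStore lookup key from an already hex-encoded 16-byte GUID and a
// 32-bit age: <32 uppercase hex chars><age in decimal>, with no separators.
static std::string makeSymStoreKey(const std::string &guid_hex, uint32_t age) {
  return guid_hex + std::to_string(age);
}

int main() {
  // Example identity: GUID 12345678-1234-5678-9ABC-DEF012345678, age 1.
  std::string key = makeSymStoreKey("12345678123456789ABCDEF012345678", 1);
  std::cout << key << "\n"; // 12345678123456789ABCDEF0123456781
  // A local SymStore lookup then probes <url>/<pdb name>/<key>/<pdb name>.
  return 0;
}
```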
diff --git a/lldb/source/Plugins/SymbolLocator/SymStore/SymbolLocatorSymStore.h b/lldb/source/Plugins/SymbolLocator/SymStore/SymbolLocatorSymStore.h
new file mode 100644
index 0000000000000..52ec04cae387b
--- /dev/null
+++ b/lldb/source/Plugins/SymbolLocator/SymStore/SymbolLocatorSymStore.h
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_SOURCE_PLUGINS_SYMBOLLOCATOR_SYMSTORE_SYMBOLLOCATORSYMSTORE_H
+#define LLDB_SOURCE_PLUGINS_SYMBOLLOCATOR_SYMSTORE_SYMBOLLOCATORSYMSTORE_H
+
+#include "lldb/Core/Debugger.h"
+#include "lldb/Symbol/SymbolLocator.h"
+#include "lldb/lldb-private.h"
+
+namespace lldb_private {
+
+/// This plugin implements lookup in Microsoft SymStore instances. This can work
+/// cross-platform and for arbitrary debug info formats, but the focus is on PDB
+/// with PE/COFF binaries on Windows.
+class SymbolLocatorSymStore : public SymbolLocator {
+public:
+ SymbolLocatorSymStore();
+
+ static void Initialize();
+ static void Terminate();
+ static void DebuggerInitialize(Debugger &debugger);
+
+ static llvm::StringRef GetPluginNameStatic() { return "symstore"; }
+ static llvm::StringRef GetPluginDescriptionStatic();
+
+ static lldb_private::SymbolLocator *CreateInstance();
+
+ /// PluginInterface protocol.
+ /// \{
+ llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
+ /// \}
+
+ // Locate the symbol file given a module specification.
+ //
+ // Locating the file should happen only on the local computer or using the
+ // current computers global settings.
+ static std::optional<FileSpec>
+ LocateExecutableSymbolFile(const ModuleSpec &module_spec,
+ const FileSpecList &default_search_paths);
+};
+
+} // namespace lldb_private
+
+#endif // LLDB_SOURCE_PLUGINS_SYMBOLLOCATOR_SYMSTORE_SYMBOLLOCATORSYMSTORE_H
diff --git a/lldb/source/Plugins/SymbolLocator/SymStore/SymbolLocatorSymStoreProperties.td b/lldb/source/Plugins/SymbolLocator/SymStore/SymbolLocatorSymStoreProperties.td
new file mode 100644
index 0000000000000..0cd631a80b90b
--- /dev/null
+++ b/lldb/source/Plugins/SymbolLocator/SymStore/SymbolLocatorSymStoreProperties.td
@@ -0,0 +1,7 @@
+include "../../../../include/lldb/Core/PropertiesBase.td"
+
+let Definition = "symbollocatorsymstore", Path = "plugin.symbol-locator.symstore" in {
+ def SymStoreURLs : Property<"urls", "Array">,
+ ElementType<"String">,
+ Desc<"List of local symstore directories to query for symbols">;
+}
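
Once the plugin is enabled, this property points the locator at one or more
local store directories. The path below is hypothetical; the API test added
later in this patch exercises the same setting:

  (lldb) settings set plugin.symbol-locator.symstore.urls /path/to/local/symstore
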
diff --git a/lldb/source/Plugins/SymbolVendor/PECOFF/SymbolVendorPECOFF.cpp b/lldb/source/Plugins/SymbolVendor/PECOFF/SymbolVendorPECOFF.cpp
index 20ccfa54a106c..87436da443d91 100644
--- a/lldb/source/Plugins/SymbolVendor/PECOFF/SymbolVendorPECOFF.cpp
+++ b/lldb/source/Plugins/SymbolVendor/PECOFF/SymbolVendorPECOFF.cpp
@@ -69,8 +69,20 @@ SymbolVendorPECOFF::CreateInstance(const lldb::ModuleSP &module_sp,
lldb::eSectionTypeDWARFDebugInfo, true))
return nullptr;
+ // Otherwise, we try to locate it.
+ FileSpecList search_paths = Target::GetDefaultDebugFileSearchPaths();
+
// If the module specified a filespec, use that.
FileSpec fspec = module_sp->GetSymbolFileFileSpec();
+ // Otherwise, if this is CodeView, use the PDB path and set the module
+ // directory as the first fallback lookup location.
+ if (!fspec) {
+ if (auto pdb_spec = obj_file->GetPDBPath()) {
+ fspec = *pdb_spec;
+ if (ConstString dir = obj_file->GetFileSpec().GetDirectory())
+ search_paths.Insert(0, FileSpec(dir));
+ }
+ }
// Otherwise, try gnu_debuglink, if one exists.
if (!fspec)
fspec = obj_file->GetDebugLink().value_or(FileSpec());
@@ -84,7 +96,6 @@ SymbolVendorPECOFF::CreateInstance(const lldb::ModuleSP &module_sp,
FileSystem::Instance().Resolve(module_spec.GetFileSpec());
module_spec.GetSymbolFileSpec() = fspec;
module_spec.GetUUID() = uuid;
- FileSpecList search_paths = Target::GetDefaultDebugFileSearchPaths();
FileSpec dsym_fspec = PluginManager::LocateExecutableSymbolFile(
module_spec, search_paths, module_sp->GetSymbolLocatorStatistics());
if (!dsym_fspec)
@@ -101,31 +112,30 @@ SymbolVendorPECOFF::CreateInstance(const lldb::ModuleSP &module_sp,
// This objfile is for debugging purposes.
dsym_objfile_sp->SetType(ObjectFile::eTypeDebugInfo);
- // Get the module unified section list and add our debug sections to
- // that.
+ // For DWARF get the module unified section list and add our debug sections
+ // to that.
SectionList *module_section_list = module_sp->GetSectionList();
SectionList *objfile_section_list = dsym_objfile_sp->GetSectionList();
- if (!objfile_section_list || !module_section_list)
- return nullptr;
-
- static const SectionType g_sections[] = {
- eSectionTypeDWARFDebugAbbrev, eSectionTypeDWARFDebugAranges,
- eSectionTypeDWARFDebugFrame, eSectionTypeDWARFDebugInfo,
- eSectionTypeDWARFDebugLine, eSectionTypeDWARFDebugLoc,
- eSectionTypeDWARFDebugLocLists, eSectionTypeDWARFDebugMacInfo,
- eSectionTypeDWARFDebugNames, eSectionTypeDWARFDebugPubNames,
- eSectionTypeDWARFDebugPubTypes, eSectionTypeDWARFDebugRanges,
- eSectionTypeDWARFDebugStr, eSectionTypeDWARFDebugTypes,
- };
- for (SectionType section_type : g_sections) {
- if (SectionSP section_sp =
- objfile_section_list->FindSectionByType(section_type, true)) {
- if (SectionSP module_section_sp =
- module_section_list->FindSectionByType(section_type, true))
- module_section_list->ReplaceSection(module_section_sp->GetID(),
- section_sp);
- else
- module_section_list->AddSection(section_sp);
+ if (objfile_section_list && module_section_list) {
+ static const SectionType g_sections[] = {
+ eSectionTypeDWARFDebugAbbrev, eSectionTypeDWARFDebugAranges,
+ eSectionTypeDWARFDebugFrame, eSectionTypeDWARFDebugInfo,
+ eSectionTypeDWARFDebugLine, eSectionTypeDWARFDebugLoc,
+ eSectionTypeDWARFDebugLocLists, eSectionTypeDWARFDebugMacInfo,
+ eSectionTypeDWARFDebugNames, eSectionTypeDWARFDebugPubNames,
+ eSectionTypeDWARFDebugPubTypes, eSectionTypeDWARFDebugRanges,
+ eSectionTypeDWARFDebugStr, eSectionTypeDWARFDebugTypes,
+ };
+ for (SectionType section_type : g_sections) {
+ if (SectionSP section_sp =
+ objfile_section_list->FindSectionByType(section_type, true)) {
+ if (SectionSP module_section_sp =
+ module_section_list->FindSectionByType(section_type, true))
+ module_section_list->ReplaceSection(module_section_sp->GetID(),
+ section_sp);
+ else
+ module_section_list->AddSection(section_sp);
+ }
}
}
diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp
index 9c8124a15333b..296a18f89dba2 100644
--- a/lldb/source/Target/Target.cpp
+++ b/lldb/source/Target/Target.cpp
@@ -1545,9 +1545,7 @@ Module *Target::GetExecutableModulePointer() {
static void LoadScriptingResourceForModule(const ModuleSP &module_sp,
Target *target) {
Status error;
- StreamString feedback_stream;
- if (module_sp && !module_sp->LoadScriptingResourceInTarget(target, error,
- feedback_stream)) {
+ if (module_sp && !module_sp->LoadScriptingResourceInTarget(target, error)) {
if (error.AsCString())
target->GetDebugger().GetAsyncErrorStream()->Printf(
"unable to load scripting data for module %s - error reported was "
@@ -1555,9 +1553,6 @@ static void LoadScriptingResourceForModule(const ModuleSP &module_sp,
module_sp->GetFileSpec().GetFileNameStrippingExtension().GetCString(),
error.AsCString());
}
- if (feedback_stream.GetSize())
- target->GetDebugger().GetAsyncErrorStream()->Printf(
- "%s\n", feedback_stream.GetData());
}
void Target::ClearModules(bool delete_locations) {
diff --git a/lldb/source/ValueObject/DILEval.cpp b/lldb/source/ValueObject/DILEval.cpp
index ec9ea8b9618fc..e1736e1dc008d 100644
--- a/lldb/source/ValueObject/DILEval.cpp
+++ b/lldb/source/ValueObject/DILEval.cpp
@@ -538,6 +538,43 @@ Interpreter::Visit(const UnaryOpNode &node) {
node.GetLocation());
}
+llvm::Expected<lldb::ValueObjectSP>
+Interpreter::PointerOffset(lldb::ValueObjectSP ptr, lldb::ValueObjectSP offset,
+ BinaryOpKind operation, uint32_t location) {
+ assert(operation == BinaryOpKind::Add || operation == BinaryOpKind::Sub);
+ if (ptr->GetCompilerType().IsPointerToVoid())
+ return llvm::make_error<DILDiagnosticError>(
+ m_expr, "arithmetic on a pointer to void", location);
+ if (ptr->GetValueAsUnsigned(0) == 0 && offset != 0)
+ return llvm::make_error<DILDiagnosticError>(
+ m_expr, "arithmetic on a nullptr is undefined", location);
+
+ bool success;
+ int64_t offset_int = offset->GetValueAsSigned(0, &success);
+ if (!success) {
+ std::string errMsg = llvm::formatv("could not get the offset: {0}",
+ offset->GetError().AsCString());
+ return llvm::make_error<DILDiagnosticError>(m_expr, std::move(errMsg),
+ location);
+ }
+
+ llvm::Expected<uint64_t> byte_size =
+ ptr->GetCompilerType().GetPointeeType().GetByteSize(
+ m_exe_ctx_scope.get());
+ if (!byte_size)
+ return byte_size.takeError();
+ uint64_t ptr_addr = ptr->GetValueAsUnsigned(0);
+ if (operation == BinaryOpKind::Sub)
+ ptr_addr -= offset_int * (*byte_size);
+ else
+ ptr_addr += offset_int * (*byte_size);
+
+ ExecutionContext exe_ctx(m_target.get(), false);
+ Scalar scalar(ptr_addr);
+ return ValueObject::CreateValueObjectFromScalar(
+ m_exe_ctx_scope, scalar, ptr->GetCompilerType(), "result");
+}
+
llvm::Expected<lldb::ValueObjectSP>
Interpreter::EvaluateScalarOp(BinaryOpKind kind, lldb::ValueObjectSP lhs,
lldb::ValueObjectSP rhs, CompilerType result_type,
@@ -569,7 +606,8 @@ llvm::Expected<lldb::ValueObjectSP> Interpreter::EvaluateBinaryAddition(
lldb::ValueObjectSP lhs, lldb::ValueObjectSP rhs, uint32_t location) {
// Operation '+' works for:
// {scalar,unscoped_enum} <-> {scalar,unscoped_enum}
- // TODO: Pointer arithmetics
+ // {integer,unscoped_enum} <-> pointer
+ // pointer <-> {integer,unscoped_enum}
auto orig_lhs_type = lhs->GetCompilerType();
auto orig_rhs_type = rhs->GetCompilerType();
auto type_or_err = ArithmeticConversion(lhs, rhs, location);
@@ -580,18 +618,34 @@ llvm::Expected<lldb::ValueObjectSP> Interpreter::EvaluateBinaryAddition(
if (result_type.IsScalarType())
return EvaluateScalarOp(BinaryOpKind::Add, lhs, rhs, result_type, location);
- std::string errMsg =
- llvm::formatv("invalid operands to binary expression ('{0}' and '{1}')",
- orig_lhs_type.GetTypeName(), orig_rhs_type.GetTypeName());
- return llvm::make_error<DILDiagnosticError>(m_expr, std::move(errMsg),
- location);
+ // Check for pointer arithmetic.
+ // One of the operands must be a pointer and the other an integer.
+ lldb::ValueObjectSP ptr, offset;
+ if (lhs->GetCompilerType().IsPointerType()) {
+ ptr = lhs;
+ offset = rhs;
+ } else if (rhs->GetCompilerType().IsPointerType()) {
+ ptr = rhs;
+ offset = lhs;
+ }
+
+ if (!ptr || !offset->GetCompilerType().IsInteger()) {
+ std::string errMsg =
+ llvm::formatv("invalid operands to binary expression ('{0}' and '{1}')",
+ orig_lhs_type.GetTypeName(), orig_rhs_type.GetTypeName());
+ return llvm::make_error<DILDiagnosticError>(m_expr, std::move(errMsg),
+ location);
+ }
+
+ return PointerOffset(ptr, offset, BinaryOpKind::Add, location);
}
llvm::Expected<lldb::ValueObjectSP> Interpreter::EvaluateBinarySubtraction(
lldb::ValueObjectSP lhs, lldb::ValueObjectSP rhs, uint32_t location) {
// Operation '-' works for:
// {scalar,unscoped_enum} <-> {scalar,unscoped_enum}
- // TODO: Pointer arithmetics
+ // pointer <-> {integer,unscoped_enum}
+ // pointer <-> pointer (if pointee types are compatible)
auto orig_lhs_type = lhs->GetCompilerType();
auto orig_rhs_type = rhs->GetCompilerType();
auto type_or_err = ArithmeticConversion(lhs, rhs, location);
@@ -602,6 +656,60 @@ llvm::Expected<lldb::ValueObjectSP> Interpreter::EvaluateBinarySubtraction(
if (result_type.IsScalarType())
return EvaluateScalarOp(BinaryOpKind::Sub, lhs, rhs, result_type, location);
+ auto lhs_type = lhs->GetCompilerType();
+ auto rhs_type = rhs->GetCompilerType();
+
+ // "pointer - integer" operation.
+ if (lhs_type.IsPointerType() && rhs_type.IsInteger())
+ return PointerOffset(lhs, rhs, BinaryOpKind::Sub, location);
+
+ // "pointer - pointer" operation.
+ if (lhs_type.IsPointerType() && rhs_type.IsPointerType()) {
+ if (lhs_type.IsPointerToVoid() && rhs_type.IsPointerToVoid()) {
+ return llvm::make_error<DILDiagnosticError>(
+ m_expr, "arithmetic on pointers to void", location);
+ }
+ // Compare canonical unqualified pointer types.
+ CompilerType lhs_unqualified_type = lhs_type.GetCanonicalType();
+ CompilerType rhs_unqualified_type = rhs_type.GetCanonicalType();
+ if (!lhs_unqualified_type.CompareTypes(rhs_unqualified_type)) {
+ std::string errMsg = llvm::formatv(
+ "'{0}' and '{1}' are not pointers to compatible types",
+ orig_lhs_type.GetTypeName(), orig_rhs_type.GetTypeName());
+ return llvm::make_error<DILDiagnosticError>(m_expr, errMsg, location);
+ }
+
+ llvm::Expected<uint64_t> lhs_byte_size =
+ lhs_type.GetPointeeType().GetByteSize(m_exe_ctx_scope.get());
+ if (!lhs_byte_size)
+ return lhs_byte_size.takeError();
+ // Since pointers have compatible types, both have the same pointee size.
+ int64_t item_size = *lhs_byte_size;
+ int64_t diff = static_cast<int64_t>(lhs->GetValueAsUnsigned(0) -
+ rhs->GetValueAsUnsigned(0));
+ assert(item_size > 0 && "Pointee size cannot be 0");
+ if (diff % item_size != 0) {
+ // If the address difference isn't divisible by the pointee size, performing
+ // the subtraction is undefined behaviour.
+ return llvm::make_error<DILDiagnosticError>(
+ m_expr, "undefined pointer arithmetic", location);
+ }
+ diff /= item_size;
+
+ llvm::Expected<lldb::TypeSystemSP> type_system =
+ GetTypeSystemFromCU(m_exe_ctx_scope);
+ if (!type_system)
+ return type_system.takeError();
+ CompilerType ptrdiff_type = type_system.get()->GetPointerDiffType(true);
+ if (!ptrdiff_type)
+ return llvm::make_error<DILDiagnosticError>(
+ m_expr, "unable to determine pointer diff type", location);
+
+ Scalar scalar(diff);
+ return ValueObject::CreateValueObjectFromScalar(m_exe_ctx_scope, scalar,
+ ptrdiff_type, "result");
+ }
+
std::string errMsg =
llvm::formatv("invalid operands to binary expression ('{0}' and '{1}')",
orig_lhs_type.GetTypeName(), orig_rhs_type.GetTypeName());
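
The interpreter changes above scale by the pointee size in both directions:
pointer +/- integer moves the address by offset * byte_size, and
pointer - pointer divides the address difference by the pointee size,
rejecting differences that are not a multiple of it. A minimal sketch of that
arithmetic with plain integers, outside the ValueObject machinery (pointer
value and pointee type are hypothetical):

  // Sketch only: the address math behind DIL pointer arithmetic.
  #include <cassert>
  #include <cstdint>
  #include <iostream>

  int main() {
    const int64_t item_size = sizeof(int); // pointee byte size
    uint64_t p = 0x1000;                   // hypothetical pointer value
    int64_t offset = 5;

    uint64_t plus = p + offset * item_size;  // p + 5  -> 0x1014
    uint64_t minus = p - offset * item_size; // p - 5  -> 0xfec

    // Pointer difference: bytes divided by the pointee size; a nonzero
    // remainder would be diagnosed as undefined instead of returned.
    int64_t diff = static_cast<int64_t>(plus - p);
    assert(diff % item_size == 0);
    std::cout << std::hex << plus << " " << minus << " " << std::dec
              << diff / item_size << "\n"; // 1014 fec 5
    return 0;
  }
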
diff --git a/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/TestFrameVarDILArithmetic.py b/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/TestFrameVarDILArithmetic.py
index 03075314ab9b6..fb3b877209d16 100644
--- a/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/TestFrameVarDILArithmetic.py
+++ b/lldb/test/API/commands/frame/var-dil/expr/Arithmetic/TestFrameVarDILArithmetic.py
@@ -102,15 +102,3 @@ def test_arithmetic(self):
self.expect_var_path("my_ref - 1", value="1")
self.expect_var_path("ref + my_ref", value="4")
self.expect_var_path("ref - my_ref", value="0")
-
- # TODO: Pointer arithmetics
- self.expect(
- "frame var -- 'p + 1'",
- error=True,
- substrs=["invalid operands to binary expression ('int *' and 'int')"],
- )
- self.expect(
- "frame var -- 'p - 1'",
- error=True,
- substrs=["invalid operands to binary expression ('int *' and 'int')"],
- )
diff --git a/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/TestFrameVarDILExprPointerArithmetic.py b/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/TestFrameVarDILExprPointerArithmetic.py
index 448cd5b1ec7e0..5da79a29bad01 100644
--- a/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/TestFrameVarDILExprPointerArithmetic.py
+++ b/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/TestFrameVarDILExprPointerArithmetic.py
@@ -22,8 +22,139 @@ def test_pointer_arithmetic(self):
self.expect_var_path("+array", type="int *")
self.expect_var_path("+array_ref", type="int *")
self.expect_var_path("+p_int0", type="int *")
+
+ # Binary operations
+ self.expect_var_path("p_char", type="const char *")
+ self.expect_var_path("p_char + 1", type="const char *")
+ self.expect_var_path("p_char + offset", type="const char *")
+ self.expect_var_path("p_char5 + -1", type="const char *")
+ self.expect_var_path("p_char5 - 1", type="const char *")
+ self.expect_var_path("p_char5 - offset", type="const char *")
+
+ self.expect_var_path("my_p_char", type="my_char_ptr")
+ self.expect_var_path("my_p_char + 1", type="my_char_ptr")
+ self.expect_var_path("my_p_char - 1", type="my_char_ptr")
+
+ self.expect_var_path("*(p_char + 0)", value="'h'")
+ self.expect_var_path("*(5 + p_char)", value="'!'")
+ self.expect_var_path("*(p_char5 + -5)", value="'h'")
+ self.expect_var_path("*(p_char5 - 5)", value="'h'")
+ self.expect_var_path("*(p_char - -5)", value="'!'")
+ self.expect_var_path("*(p_char5 - offset + 5)", value="'!'")
+ self.expect_var_path("*((p_char + offset) - 5)", value="'h'")
+ self.expect_var_path("*(p_char + (offset - 5))", value="'h'")
+
+ self.expect_var_path("*p_int0", value="0")
+ self.expect_var_path("*cp_int5", value="5")
+ self.expect_var_path("*(&*(cp_int5 + 1) - 1)", value="5")
+
+ self.expect_var_path("p_int0 - p_int0", value="0", type="__ptrdiff_t")
+ self.expect_var_path("cp_int5 - p_int0", value="5", type="__ptrdiff_t")
+ self.expect_var_path("cp_int5 - td_int_ptr0", value="5", type="__ptrdiff_t")
+ self.expect_var_path("td_int_ptr0 - cp_int5", value="-5", type="__ptrdiff_t")
+
+ # Check arrays
+ self.expect_var_path("array + 1", type="int *")
+ self.expect_var_path("1 + array", type="int *")
+ self.expect_var_path("array_ref + 1", type="int *")
+ self.expect_var_path("1 + array_ref", type="int *")
+ self.expect_var_path("array - 1", type="int *")
+ self.expect_var_path("array_ref - 1", type="int *")
+ self.expect_var_path("array - array", value="0", type="__ptrdiff_t")
+ self.expect_var_path("array - array_ref", value="0", type="__ptrdiff_t")
+ self.expect_var_path("array_ref - array_ref", value="0", type="__ptrdiff_t")
+
+ # Errors
self.expect(
"frame var -- '-p_int0'",
error=True,
substrs=["invalid argument type 'int *' to unary expression"],
)
+ self.expect(
+ "frame var -- 'cp_int5 - p_char'",
+ error=True,
+ substrs=[
+ "'const int *' and 'const char *' are not pointers to compatible types"
+ ],
+ )
+ self.expect(
+ "frame var -- 'p_int0 + cp_int5'",
+ error=True,
+ substrs=[
+ "invalid operands to binary expression ('int *' and 'const int *')"
+ ],
+ )
+ self.expect(
+ "frame var -- 'p_void + 1'",
+ error=True,
+ substrs=["arithmetic on a pointer to void"],
+ )
+ self.expect(
+ "frame var -- 'p_void - 1'",
+ error=True,
+ substrs=["arithmetic on a pointer to void"],
+ )
+ self.expect(
+ "frame var -- 'p_void - p_char'",
+ error=True,
+ substrs=[
+ "'void *' and 'const char *' are not pointers to compatible types"
+ ],
+ )
+ self.expect(
+ "frame var -- 'p_void - p_void'",
+ error=True,
+ substrs=["arithmetic on pointers to void"],
+ )
+ self.expect(
+ "frame var -- 'pp_void0 - p_char'",
+ error=True,
+ substrs=[
+ "'void **' and 'const char *' are not pointers to compatible types"
+ ],
+ )
+ self.expect(
+ "frame var -- 'p_int0 - 1.0'",
+ error=True,
+ substrs=["invalid operands to binary expression ('int *' and 'double')"],
+ )
+ self.expect(
+ "frame var -- '1.0f + p_int0'",
+ error=True,
+ substrs=["invalid operands to binary expression ('float' and 'int *')"],
+ )
+ self.expect(
+ "frame var -- '1 - array'",
+ error=True,
+ substrs=["invalid operands to binary expression ('int' and 'int[10]')"],
+ )
+ self.expect(
+ "frame var -- 'array + array'",
+ error=True,
+ substrs=["invalid operands to binary expression ('int[10]' and 'int[10]')"],
+ )
+ self.expect(
+ "frame var -- 'array + array'",
+ error=True,
+ substrs=["invalid operands to binary expression ('int[10]' and 'int[10]')"],
+ )
+ self.expect(
+ "frame var -- 'int_null + 1'",
+ error=True,
+ substrs=["arithmetic on a nullptr is undefined"],
+ )
+ self.expect(
+ "frame var -- 'int_null - 1'",
+ error=True,
+ substrs=["arithmetic on a nullptr is undefined"],
+ )
+ self.expect(
+ "frame var -- 'p_char + *((int*) 0)'",
+ error=True,
+ substrs=["could not get the offset: parent is NULL"],
+ )
+ self.expect(
+ "frame var -- 'p_char - *((int*) 0)'",
+ error=True,
+ substrs=["could not get the offset: parent is NULL"],
+ )
diff --git a/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/main.cpp b/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/main.cpp
index b4e0e88b1ffc9..02754e846f754 100644
--- a/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/main.cpp
+++ b/lldb/test/API/commands/frame/var-dil/expr/PointerArithmetic/main.cpp
@@ -1,11 +1,31 @@
void stop() {}
int main(int argc, char **argv) {
+ int offset = 5;
int array[10];
array[0] = 0;
+ array[offset] = offset;
int (&array_ref)[10] = array;
int *p_int0 = &array[0];
+ const char *p_char = "hello!";
+ const char *p_char5 = p_char + 5;
+ typedef const char *my_char_ptr;
+ my_char_ptr my_p_char = p_char;
+
+ int **pp_int0 = &p_int0;
+ const int *cp_int0 = &array[0];
+ const int *cp_int5 = &array[offset];
+
+ typedef int *td_int_ptr_t;
+ td_int_ptr_t td_int_ptr0 = &array[0];
+
+ void *p_void = (void *)p_char;
+ void **pp_void0 = &p_void;
+ void **pp_void1 = pp_void0 + 1;
+
+ int *int_null = nullptr;
+
stop(); // Set a breakpoint here
return 0;
}
diff --git a/lldb/test/API/functionalities/data-formatter/synthetic_subscript/main.c b/lldb/test/API/functionalities/data-formatter/synthetic_subscript/main.c
index ca0da120a7c0c..59fd4356d2ed1 100644
--- a/lldb/test/API/functionalities/data-formatter/synthetic_subscript/main.c
+++ b/lldb/test/API/functionalities/data-formatter/synthetic_subscript/main.c
@@ -1,3 +1,5 @@
+#include <stdio.h>
+
struct Thing {
int zero;
int one;
@@ -7,6 +9,6 @@ int main() {
struct Thing x;
x.zero = 1;
x.one = 2;
- __builtin_printf("break here\n");
+ printf("break here\n");
return 0;
}
diff --git a/lldb/test/API/functionalities/ptr_refs/TestPtrRefs.py b/lldb/test/API/functionalities/ptr_refs/TestPtrRefs.py
index 95d81bf8a9992..e638718e5e95e 100644
--- a/lldb/test/API/functionalities/ptr_refs/TestPtrRefs.py
+++ b/lldb/test/API/functionalities/ptr_refs/TestPtrRefs.py
@@ -11,6 +11,7 @@
class TestPtrRefs(TestBase):
@skipIfAsan # The output looks different under ASAN.
+ @skipIfMTE # Heap scanning reads tagged memory with untagged pointers.
@skipUnlessDarwin
def test_ptr_refs(self):
"""Test format string functionality."""
diff --git a/lldb/test/API/lang/cpp/global_operators/TestCppGlobalOperators.py b/lldb/test/API/lang/cpp/global_operators/TestCppGlobalOperators.py
index 358ab5bb7e0ee..b659f4aec2a44 100644
--- a/lldb/test/API/lang/cpp/global_operators/TestCppGlobalOperators.py
+++ b/lldb/test/API/lang/cpp/global_operators/TestCppGlobalOperators.py
@@ -85,6 +85,7 @@ def do_new_test(self, frame, expr, expected_value_name):
self.assertTrue(got_type.IsPointerType())
self.assertEqual(got_type.GetPointeeType().GetName(), "Struct")
+ @skipIfMTE # Expression evaluation of overridden operator new fails under MTE.
def test_operator_new(self):
frame = self.prepare_executable_and_get_frame()
diff --git a/lldb/test/API/lang/objc/ptr_refs/TestPtrRefsObjC.py b/lldb/test/API/lang/objc/ptr_refs/TestPtrRefsObjC.py
index 9d8b52ae85420..acecf1981dcf4 100644
--- a/lldb/test/API/lang/objc/ptr_refs/TestPtrRefsObjC.py
+++ b/lldb/test/API/lang/objc/ptr_refs/TestPtrRefsObjC.py
@@ -11,6 +11,7 @@
class TestPtrRefsObjC(TestBase):
@skipIfAsan # The output looks different under ASAN.
+ @skipIfMTE # Heap scanning reads tagged memory with untagged pointers.
def test_ptr_refs(self):
"""Test the ptr_refs tool on Darwin with Objective-C"""
self.build()
diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py
index f2a14d1475385..a48cb8c5a9b4a 100644
--- a/lldb/test/API/lit.cfg.py
+++ b/lldb/test/API/lit.cfg.py
@@ -300,6 +300,9 @@ def delete_module_cache(path):
for plugin in config.enabled_plugins:
dotest_cmd += ["--enable-plugin", plugin]
+if getattr(config, "lldb_enable_mte", False):
+ dotest_cmd += ["--enable-mte"]
+
# `dotest` args come from three different sources:
# 1. Derived by CMake based on its configs (LLDB_TEST_COMMON_ARGS), which end
# up in `dotest_common_args_str`.
diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in
index 6cc4542bca75e..0f7630c1e6e44 100644
--- a/lldb/test/API/lit.site.cfg.py.in
+++ b/lldb/test/API/lit.site.cfg.py.in
@@ -43,6 +43,7 @@ config.libcxx_libs_dir = "@LIBCXX_LIBRARY_DIR@"
config.libcxx_include_dir = "@LIBCXX_GENERATED_INCLUDE_DIR@"
config.libcxx_include_target_dir = "@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@"
config.lldb_launcher = "@LLDB_LAUNCHER@"
+config.lldb_enable_mte = @LLDB_ENABLE_MTE@
# The API tests use their own module caches.
config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-api")
config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-api")
diff --git a/lldb/test/API/lua_api/TestLuaAPI.py b/lldb/test/API/lua_api/TestLuaAPI.py
index e78ed9de72375..1fb64e7fc20d0 100644
--- a/lldb/test/API/lua_api/TestLuaAPI.py
+++ b/lldb/test/API/lua_api/TestLuaAPI.py
@@ -157,6 +157,7 @@ def get_tests(self):
tests.append(filename)
return tests
+ @skipIfMTE # Lua is not MTE-aware.
def test_lua_api(self):
if "LUA_EXECUTABLE" not in os.environ or not os.path.exists(
os.environ["LUA_EXECUTABLE"]
diff --git a/lldb/test/API/macosx/mte/TestDarwinMTE.py b/lldb/test/API/macosx/mte/TestDarwinMTE.py
index a70b4b4aed26b..59e8cd8d388c1 100644
--- a/lldb/test/API/macosx/mte/TestDarwinMTE.py
+++ b/lldb/test/API/macosx/mte/TestDarwinMTE.py
@@ -5,6 +5,7 @@
from lldbsuite.test.decorators import *
from lldbsuite.test.lldbtest import *
from lldbsuite.test import lldbutil
+from lldbsuite.test import configuration
import lldbsuite.test.cpu_feature as cpu_feature
exe_name = "uaf" # Must match Makefile
@@ -18,7 +19,15 @@ def test_process_launch_memory_tagging(self):
self.build(make_targets=["binary-plain"])
self.createTestTarget(self.getBuildArtifact(exe_name))
- self.expect("process launch", substrs=["exited with status = 0"])
+ if configuration.mte_enabled:
+ # When running under the MTE launcher, MTE is inherited by child
+ # processes, so even without --memory-tagging the UAF is caught.
+ self.expect(
+ "process launch",
+ substrs=["stopped", "stop reason = EXC_ARM_MTE_TAG_FAULT"],
+ )
+ else:
+ self.expect("process launch", substrs=["exited with status = 0"])
self.expect(
"process launch --memory-tagging",
diff --git a/lldb/test/API/symstore/Makefile b/lldb/test/API/symstore/Makefile
new file mode 100644
index 0000000000000..c9319d6e6888a
--- /dev/null
+++ b/lldb/test/API/symstore/Makefile
@@ -0,0 +1,2 @@
+C_SOURCES := main.c
+include Makefile.rules
diff --git a/lldb/test/API/symstore/TestSymStoreLocal.py b/lldb/test/API/symstore/TestSymStoreLocal.py
new file mode 100644
index 0000000000000..48e7361aab09c
--- /dev/null
+++ b/lldb/test/API/symstore/TestSymStoreLocal.py
@@ -0,0 +1,123 @@
+import os
+import shutil
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+
+
+"""
+Test debug symbol acquisition from a local SymStore repository. This can work
+cross-platform and for arbitrary debug info formats. We only support PDB
+currently.
+"""
+
+
+class MockedSymStore:
+ """
+ Context manager that populates a temporary directory with the file
+ structure SymStore.exe would create.
+ """
+
+ def __init__(self, test, exe, pdb):
+ self._test = test
+ self._exe = exe
+ self._pdb = pdb
+ self._tmp = None
+
+ def get_key_pdb(self, exe):
+ """
+ Module UUID: 12345678-1234-5678-9ABC-DEF012345678-00000001
+ To SymStore key: 12345678123456789ABCDEF0123456781
+ """
+ spec = lldb.SBModuleSpec()
+ spec.SetFileSpec(lldb.SBFileSpec(self._test.getBuildArtifact(exe)))
+ module = lldb.SBModule(spec)
+ raw = module.GetUUIDString().replace("-", "").upper()
+ if len(raw) != 40:
+ raise RuntimeError("Unexpected number of bytes in embedded UUID")
+ guid_hex = raw[:32]
+ age = int(raw[32:], 16)
+ return guid_hex + str(age)
+
+ def __enter__(self):
+ """
+ Mock local symstore directory tree, move PDB there and report path.
+ """
+ key = None
+ if self._test.getDebugInfo() == "pdb":
+ key = self.get_key_pdb(self._exe)
+ self._test.assertIsNotNone(key)
+ self._tmp = self._test.getBuildArtifact("tmp")
+ pdb_dir = os.path.join(self._tmp, self._pdb, key)
+ os.makedirs(pdb_dir, exist_ok=True)
+ shutil.move(
+ self._test.getBuildArtifact(self._pdb),
+ os.path.join(pdb_dir, self._pdb),
+ )
+ return self._tmp
+
+ def __exit__(self, *exc_info):
+ """
+ Clean up and delete original exe so next make won't skip link command.
+ """
+ shutil.rmtree(self._tmp)
+ os.remove(self._test.getBuildArtifact(self._exe))
+ self._test.runCmd("settings clear plugin.symbol-locator.symstore")
+
+
+class SymStoreLocalTests(TestBase):
+ TEST_WITH_PDB_DEBUG_INFO = True
+
+ def build_inferior(self):
+ if self.getDebugInfo() != "pdb":
+ self.skipTest("Non-PDB debug info variants not yet supported")
+ self.build()
+ exe_file = "a.out"
+ sym_file = "a.pdb"
+ self.assertTrue(os.path.isfile(self.getBuildArtifact(exe_file)))
+ self.assertTrue(os.path.isfile(self.getBuildArtifact(sym_file)))
+ return exe_file, sym_file
+
+ def try_breakpoint(self, exe, should_have_loc, ext_lookup=True):
+ enable = "true" if ext_lookup else "false"
+ self.runCmd(f"settings set symbols.enable-external-lookup {enable}")
+ target = self.dbg.CreateTarget(self.getBuildArtifact(exe))
+ self.assertTrue(target and target.IsValid(), "Target is valid")
+ bp = target.BreakpointCreateByName("func")
+ self.assertTrue(bp and bp.IsValid(), "Breakpoint is valid")
+ self.assertEqual(bp.GetNumLocations(), 1 if should_have_loc else 0)
+ self.dbg.DeleteTarget(target)
+
+ def test_no_symstore_url(self):
+ """
+ Check that breakpoint doesn't resolve without SymStore.
+ """
+ exe, sym = self.build_inferior()
+ with MockedSymStore(self, exe, sym):
+ self.try_breakpoint(exe, should_have_loc=False)
+ self.runCmd("quit", check=False)
+
+ def test_external_lookup_off(self):
+ """
+ Check that breakpoint doesn't resolve with external lookup disabled.
+ """
+ exe, sym = self.build_inferior()
+ with MockedSymStore(self, exe, sym) as symstore_dir:
+ self.runCmd(
+ f"settings set plugin.symbol-locator.symstore.urls {symstore_dir}"
+ )
+ self.try_breakpoint(exe, ext_lookup=False, should_have_loc=False)
+ self.runCmd("quit", check=False)
+
+ def test_local_dir(self):
+ """
+ Check that breakpoint resolves with local SymStore.
+ """
+ exe, sym = self.build_inferior()
+ with MockedSymStore(self, exe, sym) as symstore_dir:
+ self.runCmd(
+ f"settings set plugin.symbol-locator.symstore.urls {symstore_dir}"
+ )
+ self.try_breakpoint(exe, should_have_loc=True)
+ self.runCmd("quit", check=False)
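
get_key_pdb() above turns LLDB's 20-byte PDB UUID (a 16-byte GUID plus a
4-byte age) into the SymStore key by keeping the first 32 hex digits verbatim
and appending the age parsed as an integer. The same transformation with a
plain string, using the hypothetical UUID from the docstring:

  // Sketch only: SymStore key from a dashless, upper-cased UUID string.
  #include <iostream>
  #include <string>

  int main() {
    // 40 hex digits: the first 32 are the GUID, the last 8 the age.
    std::string raw = "12345678123456789ABCDEF01234567800000001";
    std::string guid_hex = raw.substr(0, 32);
    unsigned long age = std::stoul(raw.substr(32), nullptr, 16); // 1
    std::cout << guid_hex + std::to_string(age) << "\n"; // ...DEF0123456781
    return 0;
  }
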
diff --git a/lldb/test/API/symstore/main.c b/lldb/test/API/symstore/main.c
new file mode 100644
index 0000000000000..a95762e80ea44
--- /dev/null
+++ b/lldb/test/API/symstore/main.c
@@ -0,0 +1,5 @@
+int func(int argc, const char *argv[]) {
+ return (argc + 1) * (argv[argc][0] + 2);
+}
+
+int main(int argc, const char *argv[]) { return func(0, argv); }
diff --git a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
index 6e0514a0019a4..020d87d63aaf2 100644
--- a/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
+++ b/lldb/test/API/tools/lldb-dap/variables/TestDAP_variables.py
@@ -733,7 +733,7 @@ def test_return_variables(self):
return_name = "(Return Value)"
verify_locals = {
- return_name: {"equals": {"type": "int", "value": "300"}},
+ return_name: {"equals": {"type": "int", "value": "300"}, "readOnly": True},
"argc": {},
"argv": {},
"pt": {"readOnly": True},
diff --git a/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py b/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py
index 865e6fb7d6c01..3dc11a3ff74f2 100644
--- a/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py
+++ b/lldb/test/API/tools/lldb-server/TestAppleSimulatorOSType.py
@@ -9,6 +9,7 @@
import re
+@skipIfMTE # MTE security transition shims restrict socket operations.
class TestAppleSimulatorOSType(gdbremote_testcase.GdbRemoteTestCaseBase):
SHARED_BUILD_TESTCASE = False
diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteAttach.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteAttach.py
index 5aa790b1c97e5..7059302048396 100644
--- a/lldb/test/API/tools/lldb-server/TestGdbRemoteAttach.py
+++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteAttach.py
@@ -5,6 +5,7 @@
from lldbsuite.test import lldbutil
+@skipIfMTE # MTE security transition shims restrict socket operations.
class TestGdbRemoteAttach(gdbremote_testcase.GdbRemoteTestCaseBase):
def test_attach_with_vAttach(self):
self.build()
diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteProcessInfo.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteProcessInfo.py
index 398264a70a417..c780be62d517c 100644
--- a/lldb/test/API/tools/lldb-server/TestGdbRemoteProcessInfo.py
+++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteProcessInfo.py
@@ -5,6 +5,7 @@
from lldbsuite.test import lldbutil
+@skipIfMTE # MTE security transition shims restrict socket operations.
class TestGdbRemoteProcessInfo(gdbremote_testcase.GdbRemoteTestCaseBase):
def test_qProcessInfo_returns_running_process(self):
self.build()
diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteRegisterState.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteRegisterState.py
index 39aa473322a9f..8ffe8a6aba6c8 100644
--- a/lldb/test/API/tools/lldb-server/TestGdbRemoteRegisterState.py
+++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteRegisterState.py
@@ -4,6 +4,7 @@
from lldbsuite.test import lldbutil
+@skipIfMTE # MTE security transition shims restrict socket operations.
class TestGdbRemoteRegisterState(gdbremote_testcase.GdbRemoteTestCaseBase):
"""Test QSaveRegisterState/QRestoreRegisterState support."""
diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemoteThreadsInStopReply.py b/lldb/test/API/tools/lldb-server/TestGdbRemoteThreadsInStopReply.py
index 17fc8c1a0f08c..fb563faf98efc 100644
--- a/lldb/test/API/tools/lldb-server/TestGdbRemoteThreadsInStopReply.py
+++ b/lldb/test/API/tools/lldb-server/TestGdbRemoteThreadsInStopReply.py
@@ -7,6 +7,7 @@
from lldbsuite.test import lldbutil
+@skipIfMTE # MTE security transition shims restrict socket operations.
class TestGdbRemoteThreadsInStopReply(gdbremote_testcase.GdbRemoteTestCaseBase):
ENABLE_THREADS_IN_STOP_REPLY_ENTRIES = [
"read packet: $QListThreadsInStopReply#21",
diff --git a/lldb/test/API/tools/lldb-server/TestGdbRemote_qMemoryRegion.py b/lldb/test/API/tools/lldb-server/TestGdbRemote_qMemoryRegion.py
index 1a5df1a250184..5119ddaf8365f 100644
--- a/lldb/test/API/tools/lldb-server/TestGdbRemote_qMemoryRegion.py
+++ b/lldb/test/API/tools/lldb-server/TestGdbRemote_qMemoryRegion.py
@@ -4,6 +4,7 @@
from lldbsuite.test.lldbdwarf import *
+@skipIfMTE # MTE security transition shims restrict socket operations.
class TestGdbRemote_qMemoryRegion(gdbremote_testcase.GdbRemoteTestCaseBase):
def test_qMemoryRegionInfo_is_supported(self):
self.build()
diff --git a/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py b/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py
index f1c0519ae56d8..717c4da171eeb 100644
--- a/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py
+++ b/lldb/test/API/tools/lldb-server/TestLldbGdbServer.py
@@ -25,6 +25,7 @@
# On Linux systems with Yama ptrace_scope = 1 there is a race condition when the
# debugee enables tracing. See https://github.com/llvm/llvm-project/issues/161510.
@skipIfLinux
+@skipIfMTE # MTE security transition shims restrict socket operations.
class LldbGdbServerTestCase(
gdbremote_testcase.GdbRemoteTestCaseBase, DwarfOpcodeParser
):
diff --git a/lldb/test/API/tools/lldb-server/attach-wait/TestGdbRemoteAttachWait.py b/lldb/test/API/tools/lldb-server/attach-wait/TestGdbRemoteAttachWait.py
index 84aab9c969aa4..bbbfa733db2da 100644
--- a/lldb/test/API/tools/lldb-server/attach-wait/TestGdbRemoteAttachWait.py
+++ b/lldb/test/API/tools/lldb-server/attach-wait/TestGdbRemoteAttachWait.py
@@ -8,6 +8,7 @@
from lldbsuite.test import lldbutil
+@skipIfMTE # MTE security transition shims restrict socket operations.
class TestGdbRemoteAttachWait(gdbremote_testcase.GdbRemoteTestCaseBase):
def _set_up_inferior(self):
self._exe_to_attach = "%s_%d" % (self.testMethodName, os.getpid())
diff --git a/lldb/test/API/windows/launch/replace-dll/TestReplaceDLL.py b/lldb/test/API/windows/launch/replace-dll/TestReplaceDLL.py
index afa97cf4afe50..91111702169ae 100644
--- a/lldb/test/API/windows/launch/replace-dll/TestReplaceDLL.py
+++ b/lldb/test/API/windows/launch/replace-dll/TestReplaceDLL.py
@@ -8,6 +8,8 @@
class ReplaceDllTestCase(TestBase):
+ SHARED_BUILD_TESTCASE = False
+
@skipUnlessWindows
def test(self):
"""
diff --git a/lldb/test/CMakeLists.txt b/lldb/test/CMakeLists.txt
index d7d745a512002..1edfed8c66427 100644
--- a/lldb/test/CMakeLists.txt
+++ b/lldb/test/CMakeLists.txt
@@ -255,6 +255,7 @@ set(LLDB_TEST_SHELL_DISABLE_REMOTE OFF CACHE BOOL "Disable remote Shell tests ex
# These values are not canonicalized within LLVM.
llvm_canonicalize_cmake_booleans(
LLDB_BUILD_INTEL_PT
+ LLDB_ENABLE_MTE
LLDB_ENABLE_PYTHON
LLDB_ENABLE_LUA
LLDB_ENABLE_LZMA
diff --git a/lldb/test/Shell/Diagnostics/TestDump.test b/lldb/test/Shell/Diagnostics/TestDump.test
index 2adde6b86d35a..477a4c03a13b1 100644
--- a/lldb/test/Shell/Diagnostics/TestDump.test
+++ b/lldb/test/Shell/Diagnostics/TestDump.test
@@ -5,11 +5,9 @@
# RUN: rm -rf %t.existing
# RUN: mkdir -p %t.existing
# RUN: %lldb -o 'diagnostics dump -d %t.existing'
-# RUN: file %t.existing | FileCheck %s
+# RUN: test -d %t.existing
# Dump to a non-existing directory.
# RUN: rm -rf %t.nonexisting
# RUN: %lldb -o 'diagnostics dump -d %t.nonexisting'
-# RUN: file %t.nonexisting | FileCheck %s
-
-# CHECK: directory
+# RUN: test -d %t.nonexisting
diff --git a/lldb/test/Shell/Heap/heap-cstr.test b/lldb/test/Shell/Heap/heap-cstr.test
index cabf6d6a25aab..a559a978f6b2f 100644
--- a/lldb/test/Shell/Heap/heap-cstr.test
+++ b/lldb/test/Shell/Heap/heap-cstr.test
@@ -1,5 +1,7 @@
# REQUIRES: system-darwin
# REQUIRES: python
+# Heap scanning reads tagged memory with untagged pointers.
+# UNSUPPORTED: lldb-mte
# RUN: %clang_host %p/Inputs/cstr.c -g -o %t
# RUN: %lldb -b -s %s -f %t | FileCheck %s
diff --git a/lldb/test/Shell/Platform/AutoLoad/Darwin/dsym-python-script-name-warnings.test b/lldb/test/Shell/Platform/AutoLoad/Darwin/dsym-python-script-name-warnings.test
new file mode 100644
index 0000000000000..9c84045d75932
--- /dev/null
+++ b/lldb/test/Shell/Platform/AutoLoad/Darwin/dsym-python-script-name-warnings.test
@@ -0,0 +1,39 @@
+# REQUIRES: python, system-darwin
+
+# Tests that LLDB prints warning messages that occur while locating scripts in dSYMs.
+
+# RUN: split-file %s %t
+
+## Module name contains reserved characters but no script with a corrected
+## name exists.
+
+# RUN: %clang_host -g %t/main.c -o "%t/Test-Module.out"
+# RUN: mkdir -p "%t/Test-Module.out.dSYM/Contents/Resources/Python"
+# RUN: touch "%t/Test-Module.out.dSYM/Contents/Resources/Python/Test-Module.py"
+# RUN: %lldb -b \
+# RUN: -o 'target create "%t/Test-Module.out"' 2>&1 \
+# RUN: | FileCheck %s --check-prefix=CHECK-RENAME
+
+# CHECK-RENAME: warning: {{.*}} 'Test-Module.py' contains reserved characters
+# CHECK-RENAME-SAME: If you intend to have this script loaded, please rename
+
+## Module name contains reserved characters but a script with a corrected
+## name does exist.
+
+# RUN: %clang_host -g %t/main.c -o "%t/Test-Module2.out"
+# RUN: mkdir -p "%t/Test-Module2.out.dSYM/Contents/Resources/Python"
+# RUN: touch "%t/Test-Module2.out.dSYM/Contents/Resources/Python/Test_Module2.py"
+# RUN: touch "%t/Test-Module2.out.dSYM/Contents/Resources/Python/Test-Module2.py"
+# RUN: %lldb -b \
+# RUN: -o 'target create "%t/Test-Module2.out"' 2>&1 \
+# RUN: | FileCheck %s --check-prefix=CHECK-REMOVE
+
+# CHECK-REMOVE: warning: {{.*}} 'Test-Module2.py' contains reserved characters
+# CHECK-REMOVE-SAME: Ignoring 'Test-Module2.py' and loading 'Test_Module2.py' instead.
+
+## Also confirm that the warning message about auto-loading scripts is printed afterwards.
+
+# CHECK-REMOVE: warning: 'Test-Module2' contains a debug script. To run this script in this
+
+#--- main.c
+int main() { return 0; }
diff --git a/lldb/test/Shell/SymbolFile/NativePDB/structured-bindings-msvc.test b/lldb/test/Shell/SymbolFile/NativePDB/structured-bindings-msvc.test
index dfc654fd72ae3..ebe6cceda7ea6 100644
--- a/lldb/test/Shell/SymbolFile/NativePDB/structured-bindings-msvc.test
+++ b/lldb/test/Shell/SymbolFile/NativePDB/structured-bindings-msvc.test
@@ -1,4 +1,4 @@
-# REQUIRES: lld, msvc
+# REQUIRES: lld, msvc, target-windows
# Test that LLDB can show variables introduced in C++ 17 structured bindings
# when compiled with MSVC.
diff --git a/lldb/test/Shell/SymbolFile/NativePDB/vbases.test b/lldb/test/Shell/SymbolFile/NativePDB/vbases.test
index e7ba38a462768..789fbbe1b1a55 100644
--- a/lldb/test/Shell/SymbolFile/NativePDB/vbases.test
+++ b/lldb/test/Shell/SymbolFile/NativePDB/vbases.test
@@ -4,11 +4,11 @@
# RUN: split-file %s %t
-# RUN: %clang_cl --target=x86_64-windows-msvc -Z7 -c /Fo%t.cv.obj -- %t/main.cpp
-# RUN: %clang_cl --target=x86_64-windows-msvc -Z7 -gdwarf -c /Fo%t.dwarf.obj -- %t/main.cpp
+# RUN: %clang_cl_host -Z7 -c /GS- /Fo%t.cv.obj -- %t/main.cpp
+# RUN: %clang_cl_host -Z7 -gdwarf -c /GS- /Fo%t.dwarf.obj -- %t/main.cpp
-# RUN: lld-link -debug %t.cv.obj -out:%t.cv.exe
-# RUN: lld-link -debug %t.dwarf.obj -out:%t.dwarf.exe
+# RUN: lld-link -debug -nodefaultlib -entry:main %t.cv.obj -out:%t.cv.exe
+# RUN: lld-link -debug -nodefaultlib -entry:main %t.dwarf.obj -out:%t.dwarf.exe
# RUN: %lldb -f %t.cv.exe -s %t/commands.input 2>&1 | FileCheck %s
# RUN: %lldb -f %t.dwarf.exe -s %t/commands.input 2>&1 | FileCheck %s
diff --git a/lldb/test/Shell/Target/dependent-modules-nodupe-windows.test b/lldb/test/Shell/Target/dependent-modules-nodupe-windows.test
index 78d7f7469b9f8..9c507dc9079b7 100644
--- a/lldb/test/Shell/Target/dependent-modules-nodupe-windows.test
+++ b/lldb/test/Shell/Target/dependent-modules-nodupe-windows.test
@@ -4,9 +4,10 @@
# process actually loads the DLL.
# RUN: %clang_host -g0 -O0 -shared %S/Inputs/shlib.c -o %t.shlib.dll \
-# RUN: %if windows-msvc %{-Wl,-implib:%t.shlib.lib%} \
+# RUN: %if windows-msvc %{-Wl,-implib:%t.shlib.lib -Wl,-debug:none%} \
# RUN: %else %{-Wl,--out-implib=%t.shlib.lib%}
-# RUN: %clang_host -g0 -O0 %S/Inputs/main.c %t.shlib.lib -o %t.main.exe
+# RUN: %clang_host -g0 -O0 %S/Inputs/main.c %t.shlib.lib -o %t.main.exe \
+# RUN: %if windows-msvc %{-Wl,-debug:none%}
# RUN: %lldb -b -o "#before" -o "target modules list" -o "b main" -o run \
# RUN: -o "#after" -o "target modules list" %t.main.exe | FileCheck %s
diff --git a/lldb/test/Shell/lit.cfg.py b/lldb/test/Shell/lit.cfg.py
index cdc0cfe51f7c6..8d28f2d5201b3 100644
--- a/lldb/test/Shell/lit.cfg.py
+++ b/lldb/test/Shell/lit.cfg.py
@@ -144,6 +144,9 @@ def calculate_arch_features(arch_string):
if config.lldb_enable_python:
config.available_features.add("python")
+if getattr(config, "lldb_enable_mte", False):
+ config.available_features.add("lldb-mte")
+
if config.lldb_enable_lua:
config.available_features.add("lua")
diff --git a/lldb/test/Shell/lit.site.cfg.py.in b/lldb/test/Shell/lit.site.cfg.py.in
index b260b2fce90b7..546df37775361 100644
--- a/lldb/test/Shell/lit.site.cfg.py.in
+++ b/lldb/test/Shell/lit.site.cfg.py.in
@@ -36,6 +36,7 @@ config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@"
config.lldb_has_lldbrpc = @LLDB_BUILD_LLDBRPC@
config.have_dia_sdk = @LLVM_ENABLE_DIA_SDK@
config.lldb_launcher = "@LLDB_LAUNCHER@"
+config.lldb_enable_mte = @LLDB_ENABLE_MTE@
# The shell tests use their own module caches.
config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-shell")
config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-shell")
diff --git a/lldb/tools/driver/CMakeLists.txt b/lldb/tools/driver/CMakeLists.txt
index 7043f648518d6..1c93ed9fad927 100644
--- a/lldb/tools/driver/CMakeLists.txt
+++ b/lldb/tools/driver/CMakeLists.txt
@@ -9,11 +9,23 @@ if(APPLE)
)
# Inline info plist in binary (use target_link_options for this as soon as CMake 3.13 is available)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-sectcreate,__TEXT,__info_plist,${CMAKE_CURRENT_BINARY_DIR}/lldb-Info.plist")
+
+ if(LLDB_CODESIGN_IDENTITY)
+ set(LLVM_CODESIGNING_IDENTITY ${LLDB_CODESIGN_IDENTITY})
+ elseif(NOT LLVM_CODESIGNING_IDENTITY)
+ set(LLVM_CODESIGNING_IDENTITY "-")
+ endif()
+
+ # If we're building with assertions and LLDB_ENABLE_MTE is set to ON,
+ # build the driver to always run under MTE.
+ if (LLDB_ENABLE_MTE AND LLVM_ENABLE_ASSERTIONS)
+ set(ENTITLEMENTS ENTITLEMENTS ${CMAKE_CURRENT_SOURCE_DIR}/lldb-mte-entitlements.plist)
+ endif()
endif()
if (UNIX AND "${CMAKE_SYSTEM_NAME}" MATCHES "AIX")
- remove_definitions("-D_XOPEN_SOURCE=700")
- add_definitions("-D_ALL_SOURCE")
+ remove_definitions("-D_XOPEN_SOURCE=700")
+ add_definitions("-D_ALL_SOURCE")
endif()
set(LLDB_DRIVER_LINK_LIBS
@@ -30,6 +42,8 @@ add_lldb_tool(lldb
Driver.cpp
Platform.cpp
+ ${ENTITLEMENTS}
+
LINK_COMPONENTS
Option
Support
diff --git a/lldb/tools/driver/lldb-mte-entitlements.plist b/lldb/tools/driver/lldb-mte-entitlements.plist
new file mode 100644
index 0000000000000..7ab4dd621bd53
--- /dev/null
+++ b/lldb/tools/driver/lldb-mte-entitlements.plist
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+ <key>com.apple.developer.hardened-process</key>
+ <true/>
+ <key>com.apple.developer.hardened-process.checked-allocations</key>
+ <true/>
+</dict>
+</plist>
diff --git a/lldb/tools/lldb-dap/ProtocolUtils.cpp b/lldb/tools/lldb-dap/ProtocolUtils.cpp
index 99bd5abf0b756..bc997b1da9bd7 100644
--- a/lldb/tools/lldb-dap/ProtocolUtils.cpp
+++ b/lldb/tools/lldb-dap/ProtocolUtils.cpp
@@ -300,7 +300,8 @@ Variable CreateVariable(lldb::SBValue v, var_ref_t var_ref, bool format_hex,
var.memoryReference = addr;
bool is_readonly = v.GetType().IsAggregateType() ||
- v.GetValueType() == lldb::eValueTypeRegisterSet;
+ v.GetValueType() == lldb::eValueTypeRegisterSet ||
+ var.name == "(Return Value)";
if (is_readonly) {
if (!var.presentationHint)
var.presentationHint = {VariablePresentationHint()};
diff --git a/lldb/tools/lldb-server/lldb-gdbserver.cpp b/lldb/tools/lldb-server/lldb-gdbserver.cpp
index fd48124a59bbe..c5c0d4b97e935 100644
--- a/lldb/tools/lldb-server/lldb-gdbserver.cpp
+++ b/lldb/tools/lldb-server/lldb-gdbserver.cpp
@@ -110,10 +110,10 @@ static void sighup_handler(MainLoopBase &mainloop) {
llvm::Error handle_attach_to_pid(GDBRemoteCommunicationServerLLGS &gdb_server,
lldb::pid_t pid) {
- Status error = gdb_server.AttachToProcess(pid);
- if (error.Fail())
+ Status status = gdb_server.AttachToProcess(pid);
+ if (status.Fail())
return llvm::createStringErrorV("failed to attach to pid {0}: {1}", pid,
- error.AsCString());
+ status.AsCString());
return llvm::Error::success();
}
@@ -160,10 +160,10 @@ llvm::Error handle_launch(GDBRemoteCommunicationServerLLGS &gdb_server,
gdb_server.SetLaunchInfo(info);
- Status error = gdb_server.LaunchProcess();
- if (error.Fail())
+ Status status = gdb_server.LaunchProcess();
+ if (status.Fail())
return llvm::createStringErrorV("failed to launch '{0}': {1}", Arguments[0],
- error);
+ status);
return llvm::Error::success();
}
@@ -206,7 +206,7 @@ llvm::Error ConnectToRemote(MainLoop &mainloop,
const char *const subcommand,
const char *const named_pipe_path,
pipe_t unnamed_pipe, shared_fd_t connection_fd) {
- Status error;
+ Status status;
std::unique_ptr<Connection> connection_up;
std::string url;
@@ -214,10 +214,10 @@ llvm::Error ConnectToRemote(MainLoop &mainloop,
if (connection_fd != SharedSocket::kInvalidFD) {
#ifdef _WIN32
NativeSocket sockfd;
- error = SharedSocket::GetNativeSocket(connection_fd, sockfd);
- if (error.Fail())
+ status = SharedSocket::GetNativeSocket(connection_fd, sockfd);
+ if (status.Fail())
return llvm::createStringErrorV("GetNativeSocket failed: {0}",
- error.AsCString());
+ status.AsCString());
connection_up = std::make_unique<ConnectionFileDescriptor>(
std::make_unique<TCPSocket>(sockfd, /*should_close=*/true));
#else
@@ -263,21 +263,21 @@ llvm::Error ConnectToRemote(MainLoop &mainloop,
llvm::fmt_consume(std::move(error)));
}
},
- &error);
+ &status);
- if (error.Fail())
+ if (status.Fail())
return llvm::createStringErrorV(
- "failed to connect to client at '{0}': {1}", url, error);
+ "failed to connect to client at '{0}': {1}", url, status);
if (connection_result != eConnectionStatusSuccess)
return llvm::createStringErrorV(
"failed to connect to client at '{0}' (connection status: {1})", url,
static_cast<int>(connection_result));
connection_up = std::move(conn_fd_up);
}
- error = gdb_server.InitializeConnection(std::move(connection_up));
- if (error.Fail())
+ status = gdb_server.InitializeConnection(std::move(connection_up));
+ if (status.Fail())
return llvm::createStringErrorV("failed to initialize connection: {0}",
- error);
+ status);
llvm::outs() << "Connection established.\n";
return llvm::Error::success();
}
@@ -333,13 +333,13 @@ DESCRIPTION
} // namespace
int main_gdbserver(int argc, char *argv[]) {
- Status error;
+ Status status;
MainLoop mainloop;
#ifndef _WIN32
// Setup signal handlers first thing.
signal(SIGPIPE, SIG_IGN);
MainLoop::SignalHandleUP sighup_handle =
- mainloop.RegisterSignal(SIGHUP, sighup_handler, error);
+ mainloop.RegisterSignal(SIGHUP, sighup_handler, status);
#endif
const char *progname = argv[0];
diff --git a/lldb/tools/lldb-server/lldb-platform.cpp b/lldb/tools/lldb-server/lldb-platform.cpp
index 59b1eb419bc2b..bef0c61532f3c 100644
--- a/lldb/tools/lldb-server/lldb-platform.cpp
+++ b/lldb/tools/lldb-server/lldb-platform.cpp
@@ -206,13 +206,12 @@ static Status parse_listen_host_port(Socket::SocketProtocol &protocol,
static Status save_socket_id_to_file(const std::string &socket_id,
const FileSpec &file_spec) {
FileSpec temp_file_spec(file_spec.GetDirectory().GetStringRef());
- Status error(llvm::sys::fs::create_directory(temp_file_spec.GetPath()));
- if (error.Fail())
+ Status status(llvm::sys::fs::create_directory(temp_file_spec.GetPath()));
+ if (status.Fail())
return Status::FromErrorStringWithFormat(
"Failed to create directory %s: %s", temp_file_spec.GetPath().c_str(),
- error.AsCString());
+ status.AsCString());
- Status status;
if (auto Err = llvm::writeToOutput(file_spec.GetPath(),
[&socket_id](llvm::raw_ostream &OS) {
OS << socket_id;
@@ -231,9 +230,9 @@ static Status ListenGdbConnectionsIfNeeded(
return Status();
gdb_sock = std::make_unique<TCPSocket>(/*should_close=*/true);
- Status error = gdb_sock->Listen(gdb_address, backlog);
- if (error.Fail())
- return error;
+ Status status = gdb_sock->Listen(gdb_address, backlog);
+ if (status.Fail())
+ return status;
if (gdbserver_port == 0)
gdbserver_port = gdb_sock->GetLocalPortNumber();
@@ -253,24 +252,24 @@ AcceptGdbConnectionsIfNeeded(const FileSpec &debugserver_path,
return gdb_sock->Accept(main_loop, [debugserver_path, gdbserver_port,
&args](std::unique_ptr<Socket> sock_up) {
Log *log = GetLog(LLDBLog::Platform);
- Status error;
- SharedSocket shared_socket(sock_up.get(), error);
- if (error.Fail()) {
- LLDB_LOGF(log, "gdbserver SharedSocket failed: %s", error.AsCString());
+ Status status;
+ SharedSocket shared_socket(sock_up.get(), status);
+ if (status.Fail()) {
+ LLDB_LOGF(log, "gdbserver SharedSocket failed: %s", status.AsCString());
return;
}
lldb::pid_t child_pid = LLDB_INVALID_PROCESS_ID;
std::string socket_name;
GDBRemoteCommunicationServerPlatform platform(
debugserver_path, Socket::ProtocolTcp, gdbserver_port);
- error = platform.LaunchGDBServer(args, child_pid, socket_name,
- shared_socket.GetSendableFD());
- if (error.Success() && child_pid != LLDB_INVALID_PROCESS_ID) {
- error = shared_socket.CompleteSending(child_pid);
- if (error.Fail()) {
+ status = platform.LaunchGDBServer(args, child_pid, socket_name,
+ shared_socket.GetSendableFD());
+ if (status.Success() && child_pid != LLDB_INVALID_PROCESS_ID) {
+ status = shared_socket.CompleteSending(child_pid);
+ if (status.Fail()) {
Host::Kill(child_pid, SIGTERM);
LLDB_LOGF(log, "gdbserver CompleteSending failed: %s",
- error.AsCString());
+ status.AsCString());
return;
}
}
@@ -285,19 +284,19 @@ static void client_handle(GDBRemoteCommunicationServerPlatform &platform,
if (args.GetArgumentCount() > 0) {
lldb::pid_t pid = LLDB_INVALID_PROCESS_ID;
std::string socket_name;
- Status error = platform.LaunchGDBServer(args, pid, socket_name,
- SharedSocket::kInvalidFD);
- if (error.Success())
+ Status status = platform.LaunchGDBServer(args, pid, socket_name,
+ SharedSocket::kInvalidFD);
+ if (status.Success())
platform.SetPendingGdbServer(socket_name);
else
- fprintf(stderr, "failed to start gdbserver: %s\n", error.AsCString());
+ fprintf(stderr, "failed to start gdbserver: %s\n", status.AsCString());
}
bool interrupt = false;
bool done = false;
- Status error;
+ Status status;
while (!interrupt && !done) {
- if (platform.GetPacketAndSendResponse(std::nullopt, error, interrupt,
+ if (platform.GetPacketAndSendResponse(std::nullopt, status, interrupt,
done) !=
GDBRemoteCommunication::PacketResult::Success)
break;
@@ -312,10 +311,10 @@ static Status spawn_process(const char *progname, const FileSpec &prog,
const std::string &log_file,
const StringRef log_channels, MainLoop &main_loop,
bool multi_client) {
- Status error;
- SharedSocket shared_socket(conn_socket, error);
- if (error.Fail())
- return error;
+ Status status;
+ SharedSocket shared_socket(conn_socket, status);
+ if (status.Fail())
+ return status;
ProcessLaunchInfo launch_info;
@@ -376,9 +375,9 @@ static Status spawn_process(const char *progname, const FileSpec &prog,
std::string cmd;
self_args.GetCommandString(cmd);
- error = Host::LaunchProcess(launch_info);
- if (error.Fail())
- return error;
+ status = Host::LaunchProcess(launch_info);
+ if (status.Fail())
+ return status;
lldb::pid_t child_pid = launch_info.GetProcessID();
if (child_pid == LLDB_INVALID_PROCESS_ID)
@@ -387,10 +386,10 @@ static Status spawn_process(const char *progname, const FileSpec &prog,
LLDB_LOG(GetLog(LLDBLog::Platform), "lldb-platform launched '{0}', pid={1}",
cmd, child_pid);
- error = shared_socket.CompleteSending(child_pid);
- if (error.Fail()) {
+ status = shared_socket.CompleteSending(child_pid);
+ if (status.Fail()) {
Host::Kill(child_pid, SIGTERM);
- return error;
+ return status;
}
return Status();
@@ -432,7 +431,7 @@ int main_platform(int argc, char *argv[]) {
return EXIT_SUCCESS;
}
- Status error;
+ Status status;
shared_fd_t fd = SharedSocket::kInvalidFD;
uint16_t gdbserver_port = 0;
FileSpec socket_file;
@@ -530,9 +529,9 @@ int main_platform(int argc, char *argv[]) {
if (fd != SharedSocket::kInvalidFD) {
// Child process will handle the connection and exit.
NativeSocket sockfd;
- error = SharedSocket::GetNativeSocket(fd, sockfd);
- if (error.Fail()) {
- LLDB_LOGF(log, "lldb-platform child: %s", error.AsCString());
+ status = SharedSocket::GetNativeSocket(fd, sockfd);
+ if (status.Fail()) {
+ LLDB_LOGF(log, "lldb-platform child: %s", status.AsCString());
return socket_error;
}
@@ -576,21 +575,21 @@ int main_platform(int argc, char *argv[]) {
std::string address;
std::string gdb_address;
uint16_t platform_port = 0;
- error = parse_listen_host_port(protocol, listen_host_port, address,
- platform_port, gdb_address, gdbserver_port);
- if (error.Fail()) {
- printf("Failed to parse listen address: %s\n", error.AsCString());
+ status = parse_listen_host_port(protocol, listen_host_port, address,
+ platform_port, gdb_address, gdbserver_port);
+ if (status.Fail()) {
+ printf("Failed to parse listen address: %s\n", status.AsCString());
return socket_error;
}
- std::unique_ptr<Socket> platform_sock = Socket::Create(protocol, error);
- if (error.Fail()) {
- printf("Failed to create platform socket: %s\n", error.AsCString());
+ std::unique_ptr<Socket> platform_sock = Socket::Create(protocol, status);
+ if (status.Fail()) {
+ printf("Failed to create platform socket: %s\n", status.AsCString());
return socket_error;
}
- error = platform_sock->Listen(address, backlog);
- if (error.Fail()) {
- printf("Failed to listen platform: %s\n", error.AsCString());
+ status = platform_sock->Listen(address, backlog);
+ if (status.Fail()) {
+ printf("Failed to listen platform: %s\n", status.AsCString());
return socket_error;
}
if (protocol == Socket::ProtocolTcp && platform_port == 0)
@@ -598,24 +597,24 @@ int main_platform(int argc, char *argv[]) {
static_cast<TCPSocket *>(platform_sock.get())->GetLocalPortNumber();
if (socket_file) {
- error = save_socket_id_to_file(
+ status = save_socket_id_to_file(
protocol == Socket::ProtocolTcp
? (platform_port ? llvm::to_string(platform_port) : "")
: address,
socket_file);
- if (error.Fail()) {
+ if (status.Fail()) {
fprintf(stderr, "failed to write socket id to %s: %s\n",
- socket_file.GetPath().c_str(), error.AsCString());
+ socket_file.GetPath().c_str(), status.AsCString());
return EXIT_FAILURE;
}
}
std::unique_ptr<TCPSocket> gdb_sock;
// Update gdbserver_port if it is still 0 and protocol is tcp.
- error = ListenGdbConnectionsIfNeeded(protocol, gdb_sock, gdb_address,
- gdbserver_port);
- if (error.Fail()) {
- printf("Failed to listen gdb: %s\n", error.AsCString());
+ status = ListenGdbConnectionsIfNeeded(protocol, gdb_sock, gdb_address,
+ gdbserver_port);
+ if (status.Fail()) {
+ printf("Failed to listen gdb: %s\n", status.AsCString());
return socket_error;
}
@@ -627,15 +626,15 @@ int main_platform(int argc, char *argv[]) {
log_channels, &main_loop, multi_client,
&platform_handles](std::unique_ptr<Socket> sock_up) {
printf("Connection established.\n");
- Status error = spawn_process(
+ Status status = spawn_process(
progname, HostInfo::GetProgramFileSpec(), sock_up.get(),
gdbserver_port, inferior_arguments, log_file, log_channels,
main_loop, multi_client);
- if (error.Fail()) {
+ if (status.Fail()) {
Log *log = GetLog(LLDBLog::Platform);
- LLDB_LOGF(log, "spawn_process failed: %s", error.AsCString());
+ LLDB_LOGF(log, "spawn_process failed: %s", status.AsCString());
WithColor::error()
- << "spawn_process failed: " << error.AsCString() << "\n";
+ << "spawn_process failed: " << status.AsCString() << "\n";
if (!multi_client)
main_loop.RequestTermination();
}
diff --git a/lldb/unittests/Platform/CMakeLists.txt b/lldb/unittests/Platform/CMakeLists.txt
index a96636fc3fd59..f8755432bf6d7 100644
--- a/lldb/unittests/Platform/CMakeLists.txt
+++ b/lldb/unittests/Platform/CMakeLists.txt
@@ -1,4 +1,5 @@
add_lldb_unittest(LLDBPlatformTests
+ TestUtils.cpp
PlatformAppleSimulatorTest.cpp
PlatformDarwinTest.cpp
PlatformMacOSXTest.cpp
diff --git a/lldb/unittests/Platform/PlatformDarwinTest.cpp b/lldb/unittests/Platform/PlatformDarwinTest.cpp
index e6d1beb3a75e9..448dcab7070df 100644
--- a/lldb/unittests/Platform/PlatformDarwinTest.cpp
+++ b/lldb/unittests/Platform/PlatformDarwinTest.cpp
@@ -8,6 +8,8 @@
#include "gtest/gtest.h"
+#include "TestUtils.h"
+
#include "Plugins/Platform/MacOSX/PlatformDarwin.h"
#include "Plugins/Platform/MacOSX/PlatformMacOSX.h"
#include "Plugins/Platform/MacOSX/PlatformRemoteMacOSX.h"
@@ -16,7 +18,6 @@
#include "lldb/Core/Debugger.h"
#include "lldb/Core/PluginManager.h"
#include "lldb/Host/HostInfo.h"
-#include "lldb/Interpreter/ScriptInterpreter.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/FileSystem.h"
@@ -28,54 +29,6 @@
using namespace lldb;
using namespace lldb_private;
-namespace {
-class MockScriptInterpreterPython : public ScriptInterpreter {
-public:
- MockScriptInterpreterPython(Debugger &debugger)
- : ScriptInterpreter(debugger,
- lldb::ScriptLanguage::eScriptLanguagePython) {}
-
- ~MockScriptInterpreterPython() override = default;
-
- bool ExecuteOneLine(llvm::StringRef command, CommandReturnObject *,
- const ExecuteScriptOptions &) override {
- return false;
- }
-
- void ExecuteInterpreterLoop() override {}
-
- static void Initialize() {
- PluginManager::RegisterPlugin(GetPluginNameStatic(),
- GetPluginDescriptionStatic(),
- lldb::eScriptLanguagePython, CreateInstance);
- }
-
- static void Terminate() { PluginManager::UnregisterPlugin(CreateInstance); }
-
- bool IsReservedWord(const char *word) override {
- return llvm::is_contained({"import", "mykeyword_1_1_1"},
- llvm::StringRef(word));
- }
-
- static lldb::ScriptInterpreterSP CreateInstance(Debugger &debugger) {
- return std::make_shared<MockScriptInterpreterPython>(debugger);
- }
-
- static llvm::StringRef GetPluginNameStatic() {
- return "MockScriptInterpreterPython";
- }
-
- static llvm::StringRef GetPluginDescriptionStatic() {
- return "MockScriptInterpreterPython";
- }
-
- // PluginInterface protocol
- llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
-};
-
-LLDB_PLUGIN_DEFINE(MockScriptInterpreterPython)
-} // namespace
-
struct PlatformDarwinLocateTest : public testing::Test {
protected:
void SetUp() override {
@@ -144,18 +97,6 @@ struct PlatformDarwinLocateTest : public testing::Test {
subsystems;
};
-static std::string CreateFile(llvm::StringRef filename,
- llvm::SmallString<128> parent_dir) {
- llvm::SmallString<128> path(parent_dir);
- llvm::sys::path::append(path, filename);
- int fd;
- std::error_code ret = llvm::sys::fs::openFileForWrite(path, fd);
- assert(!ret && "Failed to create test file.");
- ::close(fd);
-
- return path.c_str();
-}
-
TEST(PlatformDarwinTest, TestParseVersionBuildDir) {
llvm::VersionTuple V;
llvm::StringRef D;
@@ -340,7 +281,8 @@ TEST_F(PlatformDarwinLocateTest,
// Keywords are not permitted in module names.
// See MockScriptInterpreterPython::IsReservedWord
- CreateFile("import.py", m_tmp_dsym_python_dir);
+ FileSpec script_fspec(CreateFile("import.py", m_tmp_dsym_python_dir));
+ ASSERT_TRUE(script_fspec);
StreamString ss;
FileSpecList fspecs =
@@ -349,16 +291,11 @@ TEST_F(PlatformDarwinLocateTest,
ss, module_fspec, *m_target_sp, dsym_module_fpec);
EXPECT_EQ(fspecs.GetSize(), 0u);
- std::string orig_script =
- (m_tmp_dsym_dwarf_dir + "/../Python/import.py").str();
- std::string fixed_script =
- (m_tmp_dsym_dwarf_dir + "/../Python/_import.py").str();
std::string expected = llvm::formatv(
- "warning: the symbol file '{0}' contains a debug script. However, its "
- "name conflicts with a keyword and as such cannot be loaded. If you "
- "intend to have this script loaded, please rename '{1}' to '{2}' and "
- "retry.\n",
- dsym_module_fpec.GetPath(), orig_script, fixed_script);
+ "debug script '{0}' cannot be loaded because 'import.py' "
+ "conflicts with the keyword 'import'. If you intend to have this script "
+ "loaded, please rename it to '_import.py' and retry.\n",
+ script_fspec.GetPath());
EXPECT_EQ(ss.GetString(), expected);
}
@@ -379,7 +316,9 @@ TEST_F(PlatformDarwinLocateTest,
// Keywords are not permitted in module names.
// See MockScriptInterpreterPython::IsReservedWord
CreateFile("_import.py", m_tmp_dsym_python_dir);
- CreateFile("import.py", m_tmp_dsym_python_dir);
+
+ FileSpec orig_fspec(CreateFile("import.py", m_tmp_dsym_python_dir));
+ ASSERT_TRUE(orig_fspec);
StreamString ss;
FileSpecList fspecs =
@@ -389,16 +328,11 @@ TEST_F(PlatformDarwinLocateTest,
EXPECT_EQ(fspecs.GetSize(), 1u);
EXPECT_EQ(fspecs.GetFileSpecAtIndex(0).GetFilename(), "_import.py");
- std::string orig_script =
- (m_tmp_dsym_dwarf_dir + "/../Python/import.py").str();
- std::string fixed_script =
- (m_tmp_dsym_dwarf_dir + "/../Python/_import.py").str();
std::string expected = llvm::formatv(
- "warning: the symbol file '{0}' contains a debug script. However, its "
- "name '{1}' conflicts with a keyword and as such cannot be loaded. LLDB "
- "will load '{2}' instead. Consider removing the file with the malformed "
- "name to eliminate this warning.\n",
- dsym_module_fpec.GetPath(), orig_script, fixed_script);
+ "debug script '{0}' cannot be loaded because 'import.py' "
+ "conflicts with the keyword 'import'. Ignoring 'import.py' and loading "
+ "'_import.py' instead.\n",
+ orig_fspec.GetPath());
EXPECT_EQ(ss.GetString(), expected);
}
@@ -447,7 +381,9 @@ TEST_F(
CreateFile("TestModule-1.1 1.o", m_tmp_dsym_dwarf_dir));
ASSERT_TRUE(dsym_module_fpec);
- CreateFile("TestModule-1.1 1.py", m_tmp_dsym_python_dir);
+ FileSpec script_fspec(
+ CreateFile("TestModule-1.1 1.py", m_tmp_dsym_python_dir));
+ ASSERT_TRUE(script_fspec);
StreamString ss;
FileSpecList fspecs =
@@ -456,16 +392,11 @@ TEST_F(
ss, module_fspec, *m_target_sp, dsym_module_fpec);
EXPECT_EQ(fspecs.GetSize(), 0u);
- std::string orig_script =
- (m_tmp_dsym_dwarf_dir + "/../Python/TestModule-1.1 1.py").str();
- std::string fixed_script =
- (m_tmp_dsym_dwarf_dir + "/../Python/TestModule_1_1_1.py").str();
std::string expected = llvm::formatv(
- "warning: the symbol file '{0}' contains a debug script. However, its "
- "name contains reserved characters and as such cannot be loaded. If you "
- "intend to have this script loaded, please rename '{1}' to '{2}' and "
- "retry.\n",
- dsym_module_fpec.GetPath(), orig_script, fixed_script);
+ "debug script '{0}' cannot be loaded because 'TestModule-1.1 1.py' "
+ "contains reserved characters. If you intend to have this script "
+ "loaded, please rename it to 'TestModule_1_1_1.py' and retry.\n",
+ script_fspec.GetPath());
EXPECT_EQ(ss.GetString(), expected);
}
@@ -486,7 +417,9 @@ TEST_F(
CreateFile("TestModule-1.1 1.o", m_tmp_dsym_dwarf_dir));
ASSERT_TRUE(dsym_module_fpec);
- CreateFile("TestModule-1.1 1.py", m_tmp_dsym_python_dir);
+ FileSpec orig_fspec(CreateFile("TestModule-1.1 1.py", m_tmp_dsym_python_dir));
+ ASSERT_TRUE(orig_fspec);
+
CreateFile("TestModule_1_1_1.py", m_tmp_dsym_python_dir);
StreamString ss;
@@ -497,16 +430,11 @@ TEST_F(
EXPECT_EQ(fspecs.GetSize(), 1u);
EXPECT_EQ(fspecs.GetFileSpecAtIndex(0).GetFilename(), "TestModule_1_1_1.py");
- std::string orig_script =
- (m_tmp_dsym_dwarf_dir + "/../Python/TestModule-1.1 1.py").str();
- std::string fixed_script =
- (m_tmp_dsym_dwarf_dir + "/../Python/TestModule_1_1_1.py").str();
std::string expected = llvm::formatv(
- "warning: the symbol file '{0}' contains a debug script. However, its "
- "name '{1}' contains reserved characters and as such cannot be loaded. "
- "LLDB will load '{2}' instead. Consider removing the file with the "
- "malformed name to eliminate this warning.\n",
- dsym_module_fpec.GetPath(), orig_script, fixed_script);
+ "debug script '{0}' cannot be loaded because"
+ " 'TestModule-1.1 1.py' contains reserved characters. Ignoring"
+ " 'TestModule-1.1 1.py' and loading 'TestModule_1_1_1.py' instead.\n",
+ orig_fspec.GetPath());
EXPECT_EQ(ss.GetString(), expected);
}
@@ -564,8 +492,8 @@ TEST_F(
->LocateExecutableScriptingResourcesFromDSYM(
ss, module_fspec, *m_target_sp, dsym_module_fpec);
EXPECT_EQ(fspecs.GetSize(), 0u);
- EXPECT_TRUE(ss.GetString().contains(
- "its name conflicts with a keyword and as such cannot be loaded"));
+ EXPECT_TRUE(
+ ss.GetString().contains("conflicts with the keyword 'mykeyword_1_1_1'"));
}
TEST_F(
@@ -596,9 +524,8 @@ TEST_F(
ss, module_fspec, *m_target_sp, dsym_module_fpec);
EXPECT_EQ(fspecs.GetSize(), 1u);
EXPECT_EQ(fspecs.GetFileSpecAtIndex(0).GetFilename(), "_mykeyword_1_1_1.py");
- EXPECT_TRUE(
- ss.GetString().contains("Consider removing the file with the malformed "
- "name to eliminate this warning."));
+ EXPECT_TRUE(ss.GetString().contains("Ignoring 'mykeyword-1.1 1.py' and "
+ "loading '_mykeyword_1_1_1.py' instead"));
}
TEST_F(
@@ -671,8 +598,7 @@ TEST_P(PlatformDarwinLocateWithSpecialCharsTestFixture,
EXPECT_EQ(fspecs.GetSize(), 0u);
std::string expected =
- llvm::formatv("please rename '{0}/../Python/{1}' to '{0}/../Python/{2}'",
- m_tmp_dsym_dwarf_dir, script_name, recommended_script_name);
+ llvm::formatv("please rename it to '{0}'", recommended_script_name);
EXPECT_TRUE(ss.GetString().contains(expected));
}
diff --git a/lldb/unittests/Platform/TestUtils.cpp b/lldb/unittests/Platform/TestUtils.cpp
new file mode 100644
index 0000000000000..7330395c803f8
--- /dev/null
+++ b/lldb/unittests/Platform/TestUtils.cpp
@@ -0,0 +1,42 @@
+//===-- TestUtils.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TestUtils.h"
+
+#include "lldb/Core/PluginManager.h"
+
+using namespace lldb;
+using namespace lldb_private;
+
+MockScriptInterpreterPython::MockScriptInterpreterPython(Debugger &debugger)
+ : ScriptInterpreter(debugger, lldb::ScriptLanguage::eScriptLanguagePython) {
+}
+
+void MockScriptInterpreterPython::Initialize() {
+ PluginManager::RegisterPlugin(GetPluginNameStatic(),
+ GetPluginDescriptionStatic(),
+ lldb::eScriptLanguagePython, CreateInstance);
+}
+
+void MockScriptInterpreterPython::Terminate() {
+ PluginManager::UnregisterPlugin(CreateInstance);
+}
+
+LLDB_PLUGIN_DEFINE(MockScriptInterpreterPython)
+
+std::string lldb_private::CreateFile(llvm::StringRef filename,
+ llvm::SmallString<128> parent_dir) {
+ llvm::SmallString<128> path(parent_dir);
+ llvm::sys::path::append(path, filename);
+ int fd;
+ std::error_code ret = llvm::sys::fs::openFileForWrite(path, fd);
+ assert(!ret && "Failed to create test file.");
+ ::close(fd);
+
+ return path.c_str();
+}
diff --git a/lldb/unittests/Platform/TestUtils.h b/lldb/unittests/Platform/TestUtils.h
new file mode 100644
index 0000000000000..1cb7b0b20d621
--- /dev/null
+++ b/lldb/unittests/Platform/TestUtils.h
@@ -0,0 +1,59 @@
+//===-- TestUtils.h -------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_UNITTESTS_PLATFORM_TESTUTILS
+#define LLDB_UNITTESTS_PLATFORM_TESTUTILS
+
+#include "lldb/Interpreter/ScriptInterpreter.h"
+
+namespace lldb_private {
+class Debugger;
+
+class MockScriptInterpreterPython : public ScriptInterpreter {
+public:
+ MockScriptInterpreterPython(Debugger &debugger);
+ ~MockScriptInterpreterPython() override = default;
+
+ bool ExecuteOneLine(llvm::StringRef command, CommandReturnObject *,
+ const ExecuteScriptOptions &) override {
+ return false;
+ }
+
+ void ExecuteInterpreterLoop() override {}
+
+ static void Initialize();
+
+ static void Terminate();
+
+ bool IsReservedWord(const char *word) override {
+ return llvm::is_contained({"import", "mykeyword_1_1_1"},
+ llvm::StringRef(word));
+ }
+
+ static lldb::ScriptInterpreterSP CreateInstance(Debugger &debugger) {
+ return std::make_shared<MockScriptInterpreterPython>(debugger);
+ }
+
+ static llvm::StringRef GetPluginNameStatic() {
+ return "MockScriptInterpreterPython";
+ }
+
+ static llvm::StringRef GetPluginDescriptionStatic() {
+ return "MockScriptInterpreterPython";
+ }
+
+ // PluginInterface protocol
+ llvm::StringRef GetPluginName() override { return GetPluginNameStatic(); }
+};
+
+std::string CreateFile(llvm::StringRef filename,
+ llvm::SmallString<128> parent_dir);
+
+} // namespace lldb_private
+
+#endif // LLDB_UNITTESTS_PLATFORM_TESTUTILS
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 420774b629b8b..6dbe47600cb2f 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -614,6 +614,11 @@ option(LLVM_ENABLE_THREADS "Use threads if available." ON)
set(LLVM_ENABLE_ICU "OFF" CACHE STRING "Use ICU for text encoding conversion support if available. Can be ON, OFF, or FORCE_ON")
+# Starting with Windows 10 1903 (build 10.0.18362), icu.lib is available.
+if(WIN32 AND CMAKE_SYSTEM_VERSION VERSION_GREATER_EQUAL "10.0.18362")
+ cmake_dependent_option(LLVM_ENABLE_WINDOWS_ICU "Use Windows vendored ICU if possible" OFF LLVM_ENABLE_ICU OFF)
+endif()
+
set(LLVM_ENABLE_ICONV "OFF" CACHE STRING "Use iconv for text encoding conversion support if available. Can be ON, OFF, or FORCE_ON")
set(LLVM_ENABLE_ZLIB "ON" CACHE STRING "Use zlib for compression/decompression if available. Can be ON, OFF, or FORCE_ON")
diff --git a/llvm/cmake/config-ix.cmake b/llvm/cmake/config-ix.cmake
index a12eaf76d15df..7a5631b1ae3b5 100644
--- a/llvm/cmake/config-ix.cmake
+++ b/llvm/cmake/config-ix.cmake
@@ -313,15 +313,24 @@ endif()
if(LLVM_ENABLE_ICU AND NOT(LLVM_ENABLE_ICONV STREQUAL FORCE_ON))
set(LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES})
set(CMAKE_FIND_LIBRARY_SUFFIXES "${CMAKE_SHARED_LIBRARY_SUFFIX}")
- if (LLVM_ENABLE_ICU STREQUAL FORCE_ON)
- find_package(ICU REQUIRED COMPONENTS uc i18n)
- if (NOT ICU_FOUND)
- message(FATAL_ERROR "Failed to configure ICU, but LLVM_ENABLE_ICU is FORCE_ON")
+ set(HAVE_WINDOWS_ICU OFF)
+ if(LLVM_ENABLE_WINDOWS_ICU AND WIN32 AND CMAKE_SYSTEM_VERSION VERSION_GREATER_EQUAL "10.0.18362")
+ message(STATUS "Use Windows vendored ICU")
+ set(HAVE_WINDOWS_ICU ON)
+ endif()
+ if(NOT HAVE_WINDOWS_ICU)
+ if (LLVM_ENABLE_ICU STREQUAL FORCE_ON)
+ find_package(ICU REQUIRED COMPONENTS uc i18n)
+ if (NOT ICU_FOUND)
+ message(FATAL_ERROR "Failed to configure ICU, but LLVM_ENABLE_ICU is FORCE_ON")
+ endif()
+ else()
+ find_package(ICU COMPONENTS uc i18n)
endif()
+ set(HAVE_ICU ${ICU_FOUND})
else()
- find_package(ICU COMPONENTS uc i18n)
+ set(HAVE_ICU ON)
endif()
- set(HAVE_ICU ${ICU_FOUND})
set(CMAKE_FIND_LIBRARY_SUFFIXES ${LIBRARY_SUFFIXES})
endif()
diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 251fe90e3e83a..e2d0f74f26058 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -603,6 +603,11 @@ function(llvm_add_library name)
message(STATUS "${name} ignored -- Loadable modules not supported on this platform.")
return()
endif()
+ # Disable PCH reuse for plugins if PIC is globally disabled; plugins are
+ # always PIC, and reusing a non-PIC PCH causes an option mismatch.
+ if(NOT LLVM_ENABLE_PIC)
+ set(ARG_DISABLE_PCH_REUSE TRUE)
+ endif()
else()
if(ARG_PLUGIN_TOOL)
message(WARNING "PLUGIN_TOOL without MODULE doesn't make sense.")
@@ -636,13 +641,16 @@ function(llvm_add_library name)
# Do add_dependencies(obj) later due to CMake issue 14747.
list(APPEND objlibs ${obj_name})
- # Bring in the target include directories and link info from our original
- # target. target_link_libraries propagates transitive dependencies with
- # proper SYSTEM include handling from IMPORTED targets.
- # target_include_directories propagates include directories set directly on
- # the target.
- target_link_libraries(${obj_name} PRIVATE $<TARGET_PROPERTY:${name},LINK_LIBRARIES>)
- target_include_directories(${obj_name} PRIVATE $<TARGET_PROPERTY:${name},INCLUDE_DIRECTORIES>)
+ # Propagate include directories from our original target.
+ # TODO: Use $<COMPILE_ONLY:${name}> instead of this manual propagation
+ # when minimum required CMake version is 3.27 or higher.
+ target_include_directories(${obj_name} SYSTEM
+ INTERFACE $<TARGET_PROPERTY:${name},INTERFACE_SYSTEM_INCLUDE_DIRECTORIES>
+ )
+ target_include_directories(${obj_name}
+ INTERFACE $<TARGET_PROPERTY:${name},INTERFACE_INCLUDE_DIRECTORIES>
+ PRIVATE $<TARGET_PROPERTY:${name},INCLUDE_DIRECTORIES>
+ )
set_target_properties(${obj_name} PROPERTIES FOLDER "${subproject_title}/Object Libraries")
if(ARG_DEPENDS)
diff --git a/llvm/docs/CommandGuide/llvm-link.rst b/llvm/docs/CommandGuide/llvm-link.rst
index 1cc1376becf9d..ba6eee5b71be4 100644
--- a/llvm/docs/CommandGuide/llvm-link.rst
+++ b/llvm/docs/CommandGuide/llvm-link.rst
@@ -71,7 +71,10 @@ OPTIONS
.. option:: --internalize
- Internalize the linked symbols.
+ Maintains the existing linkage of symbols defined in the first input and
+ converts symbols defined in all other inputs to internal linkage, unless
+ they are referenced by `llvm*.used` or are special-cased runtime functions
+ (e.g. the stack protector runtime).
.. option:: --disable-debug-info-type-map
diff --git a/llvm/docs/DirectX/DXILResources.rst b/llvm/docs/DirectX/DXILResources.rst
index d1841dfe0afcb..4471a248108cf 100644
--- a/llvm/docs/DirectX/DXILResources.rst
+++ b/llvm/docs/DirectX/DXILResources.rst
@@ -191,7 +191,7 @@ arguments.
* - ``%range_size``
- 3
- ``i32``
- - Range size of the binding, where ``UINT32_MAX (~0U)`` denotes an unbounded range.
+ - Range size of the binding, where ``0`` denotes an unbounded range.
* - ``%index``
- 4
- ``i32``
diff --git a/llvm/docs/Frontend/PerformanceTips.rst b/llvm/docs/Frontend/PerformanceTips.rst
index b81df707a31a6..a11c8e3a16857 100644
--- a/llvm/docs/Frontend/PerformanceTips.rst
+++ b/llvm/docs/Frontend/PerformanceTips.rst
@@ -95,6 +95,19 @@ For example, when working with boolean values, store them by zero-extending
If you do use loads/stores on non-byte-sized types, make sure that you *always*
use those types. For example, do not first store ``i8`` and then load ``i1``.
+Use byte types when manipulating raw memory
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The byte type represents raw memory values in SSA registers. Loads and stores of
+byte types should be used when performing raw memory copies (such as ``memmove``
+and ``memcpy``). Using integer types to represent raw memory introduces type
+punning, which discards the provenance of pointers being copied.
+
+Use a byte type if a value may hold either a pointer or any other type at run
+time (and you don't know which one), or if the value may contain uninitialized
+data. For instance, if a union may hold a pointer or another type, use byte
+types to load and store the value. Otherwise, use the specific type.
+
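As a rough illustration of the guidance above, a frontend lowering a small
``memcpy`` to load/store pairs could go through the byte type using the C API
added later in this patch (``LLVMByteTypeInContext``); the builder calls are the
existing LLVM-C API, and ``Ctx``, ``Builder``, ``Src`` and ``Dst`` are assumed
to be a pre-existing context, builder and the two pointer operands. A minimal,
untested sketch:

  // Sketch only: copy 8 bytes from Src to Dst as a single b64 load/store,
  // preserving any pointer provenance held in the copied memory.
  LLVMTypeRef B64 = LLVMByteTypeInContext(Ctx, 64);
  LLVMValueRef Raw = LLVMBuildLoad2(Builder, B64, Src, "raw");
  LLVMBuildStore(Builder, Raw, Dst);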
Prefer zext over sext when legal
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 8bf66c56f82fc..45a22f416dce1 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -787,8 +787,8 @@ be performed as loads and stores of the correct type since stores of other
types may not propagate the external data.
Therefore it is not legal to convert an existing load/store (or a
``llvm.memcpy`` / ``llvm.memmove`` intrinsic) of pointer types with external
-state to a load/store of an integer type with the same bitwidth, as that may drop
-the external state.
+state to a load/store of an integer or byte type with the same bitwidth, as that
+may drop the external state.
.. _globalvars:
@@ -4471,6 +4471,53 @@ Examples:
| ``i1942652`` | a really big integer of over 1 million bits. |
+----------------+------------------------------------------------+
+.. _t_byte:
+
+Byte Type
+"""""""""
+
+:Overview:
+
+The byte type represents raw memory data in SSA registers. It should be used
+when it cannot be determined whether a value holds a pointer or another type at
+run time, or if the value contains uninitialized or poison data. Frontends are
+expected to use a byte type when:
+
+#. Lowering memory operations like `memcpy` and `memmove` to load/store pairs
+ without knowing the underlying type being copied.
+
+#. Working with union types that can hold a pointer alongside a non-pointer
+ type.
+
+#. Working with possibly uninitialized data.
+
+Otherwise, when the underlying type is known, that specific type should be
+used. Each bit of a byte value can be:
+
+* An integer bit (0 or 1)
+* Part of a pointer value
+* ``poison``
+
+Any bit width from 1 bit to 2\ :sup:`23`\ (about 8 million) can be specified.
+
+:Syntax:
+
+::
+
+ bN
+
+The number of bits the byte occupies is specified by the ``N`` value.
+
+Examples:
+*********
+
++----------------+------------------------------------------------+
+| ``b1`` | a single-bit byte value. |
++----------------+------------------------------------------------+
+| ``b32`` | a 32-bit byte value. |
++----------------+------------------------------------------------+
+| ``b128`` | a 128-bit byte value. |
++----------------+------------------------------------------------+
+
.. _t_floating:
Floating-Point Types
@@ -4904,6 +4951,10 @@ Simple Constants
Note that hexadecimal integers are sign extended from the number
of active bits, i.e., the bit width minus the number of leading
zeros. So '``s0x0001``' of type '``i16``' will be -1, not 1.
+**Byte constants**
+ Byte constants are used to initialize global variables of the :ref:`byte
+ <t_byte>` type. These are strictly equivalent to integer constants:
+ ``store b8 42, ptr %p`` is equivalent to ``store i8 42, ptr %p``.
**Floating-point constants**
Floating-point constants use standard decimal notation (e.g.
123.421), exponential notation (e.g., 1.23421e+2), or a more precise
@@ -11774,6 +11825,8 @@ If the value being loaded is of aggregate type, the bytes that correspond to
padding may be accessed but are ignored, because it is impossible to observe
padding from the loaded aggregate value.
If ``<pointer>`` is not a well-defined value, the behavior is undefined.
+The behavior of loading a value of a type that differs from the type used to
+store it is described in the :ref:`bitcast <i_bitcast>` section.
Examples:
"""""""""
@@ -13032,10 +13085,10 @@ The '``bitcast``' instruction takes a value to cast, which must be a
non-aggregate first class value, and a type to cast it to, which must
also be a non-aggregate :ref:`first class <t_firstclass>` type. The
bit sizes of ``value`` and the destination type, ``ty2``, must be
-identical. If the source type is a pointer, the destination type must
-also be a pointer of the same size. This instruction supports bitwise
-conversion of vectors to integers and to vectors of other types (as
-long as they have the same size).
+identical. If the source type is a pointer, the destination type must also be a
+pointer or a byte (vector of bytes) of the same size. This instruction supports
+bitwise conversion of vectors to integers and to vectors of other types (as long
+as they have the same size).
Semantics:
""""""""""
@@ -13045,15 +13098,44 @@ is always a *no-op cast* because no bits change with this
conversion. The conversion is done as if the ``value`` had been stored
to memory and read back as type ``ty2``. Pointer (or vector of
pointers) types may only be converted to other pointer (or vector of
-pointers) types with the same address space through this instruction.
-To convert pointers to other types, use the :ref:`inttoptr <i_inttoptr>`
-or :ref:`ptrtoint <i_ptrtoint>` instructions first.
+pointers) types with the same address space, or to byte (or vector of bytes)
+types, through this instruction. To convert pointers to other types, use the
+:ref:`inttoptr <i_inttoptr>` or :ref:`ptrtoint <i_ptrtoint>` instructions first.
There is a caveat for bitcasts involving vector types in relation to
endianness. For example ``bitcast <2 x i8> <value> to i16`` puts element zero
of the vector in the least significant bits of the i16 for little-endian while
element zero ends up in the most significant bits for big-endian.
+If ``value`` is of the :ref:`byte type <t_byte>`:
+
+* If ``ty2`` is a scalar type:
+
+ * If ``ty2`` is a byte type, the original bits are unchanged.
+
+ * If ``ty2`` is a pointer type:
+
+ * If at least one bit of ``value`` is ``poison``, the result is
+ ``poison``.
+
+ * If all bits of ``value`` are from the same pointer and are correctly
+ ordered (there were no pointer bit swaps), the result is that pointer.
+
+ * Otherwise, the result is a pointer with the address given by the
+ integer value of the input, and without provenance.
+
+ * If ``ty2`` is an integer or floating-point type:
+
+ * If at least one bit of ``value`` is ``poison``, the result is
+ ``poison``.
+
+ * Otherwise, the result is the value encoded by the input bits, with any
+ pointer provenance stripped but not exposed.
+
+* If ``ty2`` is a vector type, the input bits get sliced into chunks
+ corresponding to lanes of the output. Each lane is then converted using the
+ rules for scalar types above.
+
Example:
""""""""
@@ -13064,6 +13146,17 @@ Example:
%Z = bitcast <2 x i32> %V to i64; ; yields i64: %V (depends on endianness)
%Z = bitcast <2 x i32*> %V to <2 x i64*> ; yields <2 x i64*>
+ ; considering %bi to hold an integer and %bp to hold a pointer,
+ %a = bitcast b64 %bi to i64 ; returns an integer, no-op cast
+ %b = bitcast b64 %bp to i64 ; reinterprets the pointer as an integer, returning its address without exposing provenance
+ %c = bitcast b64 %bp to ptr ; returns a pointer, no-op cast
+ %d = bitcast b64 %bi to ptr ; reinterprets the integer as a pointer, returning a pointer with no provenance
+
+ %e = bitcast <2 x b32> %v to i64 ; reinterprets the raw bytes as an integer
+ %f = bitcast <2 x b32> %v to ptr ; reinterprets the raw bytes as a pointer
+
+ %g = bitcast <2 x b32> %v to <4 x i16> ; reinterprets the raw bytes as integers
+
.. _i_addrspacecast:
'``addrspacecast .. to``' Instruction
@@ -13495,6 +13588,7 @@ instructions may yield different values.
While ``undef`` and ``poison`` pointers can be frozen, the result is a
non-dereferenceable pointer. See the
:ref:`Pointer Aliasing Rules <pointeraliasing>` section for more information.
+Values of the :ref:`byte type <t_byte>` are frozen on a per-bit basis.
If an aggregate value or vector is frozen, the operand is frozen element-wise.
The padding of an aggregate isn't considered, since it isn't visible
without storing it into memory and loading it with a different type.
@@ -13523,6 +13617,9 @@ Example:
%d = extractelement <2 x i32> %v.fr, i32 0 ; not undef
%add.f = add i32 %d, %d ; even number
+ %l = load b32, ptr %p ; may be uninitialized
+ %f = freeze b32 %l ; freezes on a per-bit basis
+
; branching on frozen value
%poison = add nsw i1 %k, undef ; poison
%c = freeze i1 %poison
@@ -18309,15 +18406,15 @@ support all types however.
declare i32 @llvm.lround.i32.f32(float %Val)
declare i32 @llvm.lround.i32.f64(double %Val)
- declare i32 @llvm.lround.i32.f80(float %Val)
- declare i32 @llvm.lround.i32.f128(double %Val)
- declare i32 @llvm.lround.i32.ppcf128(double %Val)
+ declare i32 @llvm.lround.i32.f80(x86_fp80 %Val)
+ declare i32 @llvm.lround.i32.f128(fp128 %Val)
+ declare i32 @llvm.lround.i32.ppcf128(ppc_fp128 %Val)
declare i64 @llvm.lround.i64.f32(float %Val)
declare i64 @llvm.lround.i64.f64(double %Val)
- declare i64 @llvm.lround.i64.f80(float %Val)
- declare i64 @llvm.lround.i64.f128(double %Val)
- declare i64 @llvm.lround.i64.ppcf128(double %Val)
+ declare i64 @llvm.lround.i64.f80(x86_fp80 %Val)
+ declare i64 @llvm.lround.i64.f128(fp128 %Val)
+ declare i64 @llvm.lround.i64.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
@@ -18353,9 +18450,9 @@ floating-point type. Not all targets support all types however.
declare i64 @llvm.llround.i64.f32(float %Val)
declare i64 @llvm.llround.i64.f64(double %Val)
- declare i64 @llvm.llround.i64.f80(float %Val)
- declare i64 @llvm.llround.i64.f128(double %Val)
- declare i64 @llvm.llround.i64.ppcf128(double %Val)
+ declare i64 @llvm.llround.i64.f80(x86_fp80 %Val)
+ declare i64 @llvm.llround.i64.f128(fp128 %Val)
+ declare i64 @llvm.llround.i64.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
@@ -18393,15 +18490,15 @@ support all types however.
declare i32 @llvm.lrint.i32.f32(float %Val)
declare i32 @llvm.lrint.i32.f64(double %Val)
- declare i32 @llvm.lrint.i32.f80(float %Val)
- declare i32 @llvm.lrint.i32.f128(double %Val)
- declare i32 @llvm.lrint.i32.ppcf128(double %Val)
+ declare i32 @llvm.lrint.i32.f80(x86_fp80 %Val)
+ declare i32 @llvm.lrint.i32.f128(fp128 %Val)
+ declare i32 @llvm.lrint.i32.ppcf128(ppc_fp128 %Val)
declare i64 @llvm.lrint.i64.f32(float %Val)
declare i64 @llvm.lrint.i64.f64(double %Val)
- declare i64 @llvm.lrint.i64.f80(float %Val)
- declare i64 @llvm.lrint.i64.f128(double %Val)
- declare i64 @llvm.lrint.i64.ppcf128(double %Val)
+ declare i64 @llvm.lrint.i64.f80(x86_fp80 %Val)
+ declare i64 @llvm.lrint.i64.f128(fp128 %Val)
+ declare i64 @llvm.lrint.i64.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
@@ -18440,9 +18537,9 @@ support all types however.
declare i64 @llvm.llrint.i64.f32(float %Val)
declare i64 @llvm.llrint.i64.f64(double %Val)
- declare i64 @llvm.llrint.i64.f80(float %Val)
- declare i64 @llvm.llrint.i64.f128(double %Val)
- declare i64 @llvm.llrint.i64.ppcf128(double %Val)
+ declare i64 @llvm.llrint.i64.f80(x86_fp80 %Val)
+ declare i64 @llvm.llrint.i64.f128(fp128 %Val)
+ declare i64 @llvm.llrint.i64.ppcf128(ppc_fp128 %Val)
Overview:
"""""""""
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 2053e087ba23e..e685660195bcd 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -86,6 +86,10 @@ Changes to LLVM infrastructure
* The ``Br`` opcode was split into two opcodes separating unconditional
(``UncondBr``) and conditional (``CondBr``) branches.
+* The operand order of ``CondBr`` instructions was adjusted to match the
+ successor order. This can cause subtle breakage when using ``getOperand`` or
+ ``setOperand`` to access successors.
+
Changes to building LLVM
------------------------
@@ -162,6 +166,7 @@ Changes to the RISC-V Backend
* `-mcpu=xt-c910v2` and `-mcpu=xt-c920v2` were added.
* Adds experimental assembler support for the 'Zvzip` (RISC-V Vector
Reordering Structured Data) extension.
+* `-mcpu=sifive-x160` and `-mcpu=sifive-x180` were added.
Changes to the WebAssembly Backend
----------------------------------
@@ -190,6 +195,10 @@ Changes to the C API
* Replaced opcode ``LLVMBr`` with ``LLVMUncondBr`` and ``LLVMCondBr``.
+* The operand order of ``CondBr`` instructions was adjusted to match the
+ successor order. This can cause subtle breakage when using ``LLVMGetOperand``
+ or ``LLVMSetOperand`` to access successors.
+
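Code that needs to survive this change can avoid raw operand indices entirely
and go through the successor accessors, which are unaffected. A hedged sketch
using the existing C API (``Term`` is assumed to be the terminator value of
some block):

  // Walk the destinations of a conditional branch without relying on the
  // operand order, which this release changes.
  if (LLVMGetNumSuccessors(Term) == 2) {
    LLVMBasicBlockRef TrueDest = LLVMGetSuccessor(Term, 0);
    LLVMBasicBlockRef FalseDest = LLVMGetSuccessor(Term, 1);
    // ... use TrueDest / FalseDest ...
  }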
Changes to the CodeGen infrastructure
-------------------------------------
diff --git a/llvm/docs/SPIRVUsage.rst b/llvm/docs/SPIRVUsage.rst
index 1159b4b908078..9a4478f57e802 100644
--- a/llvm/docs/SPIRVUsage.rst
+++ b/llvm/docs/SPIRVUsage.rst
@@ -633,6 +633,15 @@ LLVM IR representations.
+--------------------+---------------------------------------------------------+
| SPIR-V instruction | LLVM IR |
+====================+=========================================================+
+| OpMemoryModel | .. code-block:: llvm |
+| | |
+| | !spirv.MemoryModel = !{!0} |
+| | !0 = !{i32 0, i32 1} |
+| | ; Set addressing model to Logical (0) and memory |
+| | ; model to GLSL450 (1). Valid memory models: |
+| | ; Simple (0), GLSL450 (1), OpenCL (2), |
+| | ; VulkanKHR (3). |
++--------------------+---------------------------------------------------------+
| OpExecutionMode | .. code-block:: llvm |
| | |
| | !spirv.ExecutionMode = !{!0} |
diff --git a/llvm/docs/TestSuiteGuide.md b/llvm/docs/TestSuiteGuide.md
index 671d6c3b3372a..cdd5ae72204ab 100644
--- a/llvm/docs/TestSuiteGuide.md
+++ b/llvm/docs/TestSuiteGuide.md
@@ -92,7 +92,7 @@ MicroBenchmarks/XRay microbenchmarks, you need to add `compiler-rt` to your
with the above `llvm-lit` command. In this case, the `results.json` file will
contain compile time metrics only (code size, llvm stats and so on).
- This mode is enabled by settting `-DTEST_SUITE_RUN_BENCHMARKS=OFF`,
+ This mode is enabled by setting `-DTEST_SUITE_RUN_BENCHMARKS=OFF`,
more details [here](common_configuration_options).
```
diff --git a/llvm/include/llvm-c/Core.h b/llvm/include/llvm-c/Core.h
index 869031d9e1c73..9dba8a905753a 100644
--- a/llvm/include/llvm-c/Core.h
+++ b/llvm/include/llvm-c/Core.h
@@ -171,6 +171,7 @@ typedef enum {
LLVMBFloatTypeKind = 18, /**< 16 bit brain floating point type */
LLVMX86_AMXTypeKind = 19, /**< X86 AMX */
LLVMTargetExtTypeKind = 20, /**< Target extension type */
+ LLVMByteTypeKind = 21, /**< Arbitrary bit width bytes */
} LLVMTypeKind;
typedef enum {
@@ -280,6 +281,7 @@ typedef enum {
LLVMConstantDataArrayValueKind,
LLVMConstantDataVectorValueKind,
LLVMConstantIntValueKind,
+ LLVMConstantByteValueKind,
LLVMConstantFPValueKind,
LLVMConstantPointerNullValueKind,
LLVMConstantTokenNoneValueKind,
@@ -1347,6 +1349,7 @@ LLVM_C_ABI void LLVMSetModuleInlineAsm(LLVMModuleRef M, const char *Asm);
*
* types:
* integer type
+ * byte type
* real type
* function type
* sequence types:
@@ -1398,6 +1401,25 @@ LLVM_C_ABI void LLVMDumpType(LLVMTypeRef Val);
*/
LLVM_C_ABI char *LLVMPrintTypeToString(LLVMTypeRef Val);
+/**
+ * @}
+ */
+
+/**
+ * @defgroup LLVMCCoreTypeByte Byte Types
+ *
+ * Functions in this section operate on byte types.
+ *
+ * @{
+ */
+
+/**
+ * Obtain a byte type from a context with specified bit width.
+ */
+LLVM_C_ABI LLVMTypeRef LLVMByteTypeInContext(LLVMContextRef C,
+ unsigned NumBits);
+LLVM_C_ABI unsigned LLVMGetByteTypeWidth(LLVMTypeRef ByteTy);
+
/**
* @defgroup LLVMCCoreTypeInt Integer Types
*
@@ -2014,6 +2036,7 @@ LLVM_C_ABI unsigned LLVMGetTargetExtTypeIntParam(LLVMTypeRef TargetExtTy,
macro(ConstantExpr) \
macro(ConstantFP) \
macro(ConstantInt) \
+ macro(ConstantByte) \
macro(ConstantPointerNull) \
macro(ConstantStruct) \
macro(ConstantTokenNone) \
@@ -2427,6 +2450,36 @@ LLVM_C_ABI LLVMValueRef LLVMConstIntOfStringAndSize(LLVMTypeRef IntTy,
unsigned SLen,
uint8_t Radix);
+/**
+ * Obtain a constant value for a byte type.
+ *
+ * The returned value corresponds to a llvm::ConstantByte.
+ *
+ * @see llvm::ConstantByte::get()
+ *
+ * @param ByteTy Byte type to obtain value of.
+ * @param N The value the returned instance should refer to.
+ */
+LLVM_C_ABI LLVMValueRef LLVMConstByte(LLVMTypeRef ByteTy, unsigned long long N);
+
+/**
+ * Obtain a constant value for a byte of arbitrary precision.
+ *
+ * @see llvm::ConstantByte::get()
+ */
+LLVM_C_ABI LLVMValueRef LLVMConstByteOfArbitraryPrecision(
+ LLVMTypeRef ByteTy, unsigned NumWords, const uint64_t Words[]);
+
+/**
+ * Obtain a constant value for a byte parsed from a string with specified
+ * length.
+ * @see llvm::ConstantByte::get()
+ */
+LLVM_C_ABI LLVMValueRef LLVMConstByteOfStringAndSize(LLVMTypeRef ByteTy,
+ const char *Text,
+ size_t SLen,
+ uint8_t Radix);
+
/**
* Obtain a constant value referring to a double floating point value.
*/
@@ -2471,6 +2524,21 @@ LLVMConstIntGetZExtValue(LLVMValueRef ConstantVal);
*/
LLVM_C_ABI long long LLVMConstIntGetSExtValue(LLVMValueRef ConstantVal);
+/**
+ * Obtain the zero extended value for a byte constant value.
+ *
+ * @see llvm::ConstantByte::getZExtValue()
+ */
+LLVM_C_ABI unsigned long long
+LLVMConstByteGetZExtValue(LLVMValueRef ConstantVal);
+
+/**
+ * Obtain the sign extended value for a byte constant value.
+ *
+ * @see llvm::ConstantByte::getSExtValue()
+ */
+LLVM_C_ABI long long LLVMConstByteGetSExtValue(LLVMValueRef ConstantVal);
+
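A short usage sketch of the byte-constant entry points declared above; the
only functions here that do not already exist in the C API are the ones this
patch adds:

  // Build a 16-bit byte constant and read its value back.
  LLVMContextRef Ctx = LLVMContextCreate();
  LLVMTypeRef B16 = LLVMByteTypeInContext(Ctx, 16);
  LLVMValueRef C = LLVMConstByte(B16, 42);
  unsigned long long V = LLVMConstByteGetZExtValue(C); // V == 42
  LLVMContextDispose(Ctx);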
/**
* Obtain the double value for an floating point constant value.
* losesInfo indicates if some precision was lost in the conversion.
diff --git a/llvm/include/llvm/ADT/Repeated.h b/llvm/include/llvm/ADT/Repeated.h
new file mode 100644
index 0000000000000..f821f3f1f73ca
--- /dev/null
+++ b/llvm/include/llvm/ADT/Repeated.h
@@ -0,0 +1,120 @@
+//===- llvm/ADT/Repeated.h - Repeated value range ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the Repeated<T> class, a memory-efficient range representing N
+// copies of the same value.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_REPEATED_H
+#define LLVM_ADT_REPEATED_H
+
+#include "llvm/ADT/iterator.h"
+
+#include <cassert>
+#include <cstddef>
+#include <utility>
+
+namespace llvm {
+
+/// A random-access iterator that always dereferences to the same value.
+template <typename T>
+class RepeatedIterator
+ : public iterator_facade_base<RepeatedIterator<T>,
+ std::random_access_iterator_tag, T, ptrdiff_t,
+ const T *, const T &> {
+ const T *value = nullptr;
+ ptrdiff_t index = 0;
+
+public:
+ RepeatedIterator() = default;
+ RepeatedIterator(const T *value, ptrdiff_t index)
+ : value(value), index(index) {}
+
+ const T &operator*() const { return *value; }
+
+ bool operator==(const RepeatedIterator &rhs) const {
+ assert((!value || !rhs.value || value == rhs.value) &&
+ "comparing iterators from different Repeated ranges");
+ return index == rhs.index;
+ }
+
+ bool operator<(const RepeatedIterator &rhs) const {
+ assert((!value || !rhs.value || value == rhs.value) &&
+ "comparing iterators from different Repeated ranges");
+ return index < rhs.index;
+ }
+
+ ptrdiff_t operator-(const RepeatedIterator &rhs) const {
+ assert((!value || !rhs.value || value == rhs.value) &&
+ "subtracting iterators from different Repeated ranges");
+ return index - rhs.index;
+ }
+
+ RepeatedIterator &operator+=(ptrdiff_t n) {
+ index += n;
+ return *this;
+ }
+
+ RepeatedIterator &operator-=(ptrdiff_t n) {
+ index -= n;
+ return *this;
+ }
+};
+
+/// A memory-efficient immutable range with a single value repeated N times.
+/// The value is owned by the range.
+///
+/// `Repeated<T>` is also a proper random-access range: `begin()`/`end()`
+/// return iterators that always dereference to the same stored value.
+template <typename T> struct [[nodiscard]] Repeated {
+ /// Wrapper for the stored value used as a PointerUnion target in range
+ /// types (e.g., TypeRange, ValueRange).
+ struct Storage {
+ T value;
+ };
+
+ Storage storage;
+ size_t count;
+
+ /// Create a `value` repeated `count` times.
+ /// Uses the same argument order as standard library container constructors.
+ template <typename U>
+ Repeated(size_t count, U &&value)
+ : storage{std::forward<U>(value)}, count(count) {}
+
+ using iterator = RepeatedIterator<T>;
+ using const_iterator = iterator;
+ using reverse_iterator = std::reverse_iterator<iterator>;
+ using const_reverse_iterator = reverse_iterator;
+ using value_type = T;
+ using size_type = size_t;
+
+ iterator begin() const { return {&storage.value, 0}; }
+ iterator end() const {
+ return {&storage.value, static_cast<ptrdiff_t>(count)};
+ }
+ reverse_iterator rbegin() const { return reverse_iterator(end()); }
+ reverse_iterator rend() const { return reverse_iterator(begin()); }
+
+ size_t size() const { return count; }
+ bool empty() const { return count == 0; }
+
+ const T &value() const { return storage.value; }
+ const T &operator[](size_t idx) const {
+ assert(idx < size() && "Out of bounds");
+ (void)idx;
+ return storage.value;
+ }
+};
+
+template <typename U> Repeated(size_t, U &&) -> Repeated<std::decay_t<U>>;
+
+} // namespace llvm
+
+#endif // LLVM_ADT_REPEATED_H
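For reference, the new range can be consumed like any other random-access
range; a quick usage sketch that relies only on what the header above declares:

  #include "llvm/ADT/Repeated.h"

  #include <cassert>
  #include <string>

  int main() {
    // Five logical copies of the same string, stored exactly once.
    llvm::Repeated<std::string> Fives(5, std::string("x"));
    assert(Fives.size() == 5 && Fives[3] == "x");
    for (const std::string &S : Fives)
      (void)S; // every iteration dereferences the single stored value
    return 0;
  }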
diff --git a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h
index cf57f2c95b55b..4941ce722e7ff 100644
--- a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h
+++ b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h
@@ -13,13 +13,9 @@
#ifndef LLVM_ANALYSIS_BRANCHPROBABILITYINFO_H
#define LLVM_ANALYSIS_BRANCHPROBABILITYINFO_H
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/ADT/DenseSet.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/Compiler.h"
@@ -121,32 +117,9 @@ class BranchProbabilityInfo {
calculate(F, LI, TLI, DT, PDT);
}
- BranchProbabilityInfo(BranchProbabilityInfo &&Arg)
- : Handles(std::move(Arg.Handles)), Probs(std::move(Arg.Probs)),
- LastF(Arg.LastF),
- EstimatedBlockWeight(std::move(Arg.EstimatedBlockWeight)) {
- for (auto &Handle : Handles)
- Handle.setBPI(this);
- }
-
- BranchProbabilityInfo(const BranchProbabilityInfo &) = delete;
- BranchProbabilityInfo &operator=(const BranchProbabilityInfo &) = delete;
-
- BranchProbabilityInfo &operator=(BranchProbabilityInfo &&RHS) {
- releaseMemory();
- Handles = std::move(RHS.Handles);
- Probs = std::move(RHS.Probs);
- EstimatedBlockWeight = std::move(RHS.EstimatedBlockWeight);
- for (auto &Handle : Handles)
- Handle.setBPI(this);
- return *this;
- }
-
LLVM_ABI bool invalidate(Function &, const PreservedAnalyses &PA,
FunctionAnalysisManager::Invalidator &);
- LLVM_ABI void releaseMemory();
-
LLVM_ABI void print(raw_ostream &OS) const;
/// Get an edge's probability, relative to other out-edges of the Src.
@@ -182,7 +155,6 @@ class BranchProbabilityInfo {
const BasicBlock *Src,
const BasicBlock *Dst) const;
-public:
/// Set the raw probabilities for all edges from the given block.
///
/// This allows a pass to explicitly set edge probabilities for a block. It
@@ -213,214 +185,18 @@ class BranchProbabilityInfo {
/// Forget analysis results for the given basic block.
LLVM_ABI void eraseBlock(const BasicBlock *BB);
- // Data structure to track SCCs for handling irreducible loops.
- class SccInfo {
- // Enum of types to classify basic blocks in SCC. Basic block belonging to
- // SCC is 'Inner' until it is either 'Header' or 'Exiting'. Note that a
- // basic block can be 'Header' and 'Exiting' at the same time.
- enum SccBlockType {
- Inner = 0x0,
- Header = 0x1,
- Exiting = 0x2,
- };
- // Map of basic blocks to SCC IDs they belong to. If basic block doesn't
- // belong to any SCC it is not in the map.
- using SccMap = DenseMap<const BasicBlock *, int>;
- // Each basic block in SCC is attributed with one or several types from
- // SccBlockType. Map value has uint32_t type (instead of SccBlockType)
- // since basic block may be for example "Header" and "Exiting" at the same
- // time and we need to be able to keep more than one value from
- // SccBlockType.
- using SccBlockTypeMap = DenseMap<const BasicBlock *, uint32_t>;
- // Vector containing classification of basic blocks for all SCCs where i'th
- // vector element corresponds to SCC with ID equal to i.
- using SccBlockTypeMaps = std::vector<SccBlockTypeMap>;
-
- SccMap SccNums;
- SccBlockTypeMaps SccBlocks;
-
- public:
- LLVM_ABI explicit SccInfo(const Function &F);
-
- /// If \p BB belongs to some SCC then ID of that SCC is returned, otherwise
- /// -1 is returned. If \p BB belongs to more than one SCC at the same time
- /// result is undefined.
- LLVM_ABI int getSCCNum(const BasicBlock *BB) const;
- /// Returns true if \p BB is a 'header' block in SCC with \p SccNum ID,
- /// false otherwise.
- bool isSCCHeader(const BasicBlock *BB, int SccNum) const {
- return getSccBlockType(BB, SccNum) & Header;
- }
- /// Returns true if \p BB is an 'exiting' block in SCC with \p SccNum ID,
- /// false otherwise.
- bool isSCCExitingBlock(const BasicBlock *BB, int SccNum) const {
- return getSccBlockType(BB, SccNum) & Exiting;
- }
- /// Fills in \p Enters vector with all such blocks that don't belong to
- /// SCC with \p SccNum ID but there is an edge to a block belonging to the
- /// SCC.
- LLVM_ABI void
- getSccEnterBlocks(int SccNum, SmallVectorImpl<BasicBlock *> &Enters) const;
- /// Fills in \p Exits vector with all such blocks that don't belong to
- /// SCC with \p SccNum ID but there is an edge from a block belonging to the
- /// SCC.
- LLVM_ABI void getSccExitBlocks(int SccNum,
- SmallVectorImpl<BasicBlock *> &Exits) const;
-
- private:
- /// Returns \p BB's type according to classification given by SccBlockType
- /// enum. Please note that \p BB must belong to SSC with \p SccNum ID.
- LLVM_ABI uint32_t getSccBlockType(const BasicBlock *BB, int SccNum) const;
- /// Calculates \p BB's type and stores it in internal data structures for
- /// future use. Please note that \p BB must belong to SSC with \p SccNum ID.
- void calculateSccBlockType(const BasicBlock *BB, int SccNum);
- };
-
private:
- // We need to store CallbackVH's in order to correctly handle basic block
- // removal.
- class BasicBlockCallbackVH final : public CallbackVH {
- BranchProbabilityInfo *BPI;
-
- void deleted() override {
- assert(BPI != nullptr);
- BPI->eraseBlock(cast<BasicBlock>(getValPtr()));
- }
-
- public:
- void setBPI(BranchProbabilityInfo *BPI) { this->BPI = BPI; }
-
- BasicBlockCallbackVH(const Value *V, BranchProbabilityInfo *BPI = nullptr)
- : CallbackVH(const_cast<Value *>(V)), BPI(BPI) {}
- };
-
- /// Pair of Loop and SCC ID number. Used to unify handling of normal and
- /// SCC based loop representations.
- using LoopData = std::pair<Loop *, int>;
- /// Helper class to keep basic block along with its loop data information.
- class LoopBlock {
- public:
- LLVM_ABI explicit LoopBlock(const BasicBlock *BB, const LoopInfo &LI,
- const SccInfo &SccI);
-
- const BasicBlock *getBlock() const { return BB; }
- BasicBlock *getBlock() { return const_cast<BasicBlock *>(BB); }
- LoopData getLoopData() const { return LD; }
- Loop *getLoop() const { return LD.first; }
- int getSccNum() const { return LD.second; }
-
- bool belongsToLoop() const { return getLoop() || getSccNum() != -1; }
- bool belongsToSameLoop(const LoopBlock &LB) const {
- return (LB.getLoop() && getLoop() == LB.getLoop()) ||
- (LB.getSccNum() != -1 && getSccNum() == LB.getSccNum());
- }
-
- private:
- const BasicBlock *const BB = nullptr;
- LoopData LD = {nullptr, -1};
- };
-
- // Pair of LoopBlocks representing an edge from first to second block.
- using LoopEdge = std::pair<const LoopBlock &, const LoopBlock &>;
-
- DenseSet<BasicBlockCallbackVH, DenseMapInfo<Value*>> Handles;
-
- // Since we allow duplicate edges from one basic block to another, we use
- // a pair (PredBlock and an index in the successors) to specify an edge.
- using Edge = std::pair<const BasicBlock *, unsigned>;
-
- DenseMap<Edge, BranchProbability> Probs;
+ MutableArrayRef<BranchProbability> allocEdges(const BasicBlock *BB);
+ ArrayRef<BranchProbability> getEdges(const BasicBlock *BB) const;
+
+ // Storage for branch probabilities.
+ SmallVector<BranchProbability> Probs;
+ // Map from block number to first edge.
+ SmallVector<unsigned> EdgeStarts;
/// Track the last function we run over for printing.
const Function *LastF = nullptr;
-
- const LoopInfo *LI = nullptr;
-
- /// Keeps information about all SCCs in a function.
- std::unique_ptr<const SccInfo> SccI;
-
- /// Keeps mapping of a basic block to its estimated weight.
- SmallDenseMap<const BasicBlock *, uint32_t> EstimatedBlockWeight;
-
- /// Keeps mapping of a loop to estimated weight to enter the loop.
- SmallDenseMap<LoopData, uint32_t> EstimatedLoopWeight;
-
- /// Helper to construct LoopBlock for \p BB.
- LoopBlock getLoopBlock(const BasicBlock *BB) const {
- return LoopBlock(BB, *LI, *SccI);
- }
-
- /// Returns true if destination block belongs to some loop and source block is
- /// either doesn't belong to any loop or belongs to a loop which is not inner
- /// relative to the destination block.
- bool isLoopEnteringEdge(const LoopEdge &Edge) const;
- /// Returns true if source block belongs to some loop and destination block is
- /// either doesn't belong to any loop or belongs to a loop which is not inner
- /// relative to the source block.
- bool isLoopExitingEdge(const LoopEdge &Edge) const;
- /// Returns true if \p Edge is either enters to or exits from some loop, false
- /// in all other cases.
- bool isLoopEnteringExitingEdge(const LoopEdge &Edge) const;
- /// Returns true if source and destination blocks belongs to the same loop and
- /// destination block is loop header.
- bool isLoopBackEdge(const LoopEdge &Edge) const;
- // Fills in \p Enters vector with all "enter" blocks to a loop \LB belongs to.
- void getLoopEnterBlocks(const LoopBlock &LB,
- SmallVectorImpl<BasicBlock *> &Enters) const;
- // Fills in \p Exits vector with all "exit" blocks from a loop \LB belongs to.
- void getLoopExitBlocks(const LoopBlock &LB,
- SmallVectorImpl<BasicBlock *> &Exits) const;
-
- /// Returns estimated weight for \p BB. std::nullopt if \p BB has no estimated
- /// weight.
- std::optional<uint32_t> getEstimatedBlockWeight(const BasicBlock *BB) const;
-
- /// Returns estimated weight to enter \p L. In other words it is weight of
- /// loop's header block not scaled by trip count. Returns std::nullopt if \p L
- /// has no no estimated weight.
- std::optional<uint32_t> getEstimatedLoopWeight(const LoopData &L) const;
-
- /// Return estimated weight for \p Edge. Returns std::nullopt if estimated
- /// weight is unknown.
- std::optional<uint32_t> getEstimatedEdgeWeight(const LoopEdge &Edge) const;
-
- /// Iterates over all edges leading from \p SrcBB to \p Successors and
- /// returns maximum of all estimated weights. If at least one edge has unknown
- /// estimated weight std::nullopt is returned.
- template <class IterT>
- std::optional<uint32_t>
- getMaxEstimatedEdgeWeight(const LoopBlock &SrcBB,
- iterator_range<IterT> Successors) const;
-
- /// If \p LoopBB has no estimated weight then set it to \p BBWeight and
- /// return true. Otherwise \p BB's weight remains unchanged and false is
- /// returned. In addition all blocks/loops that might need their weight to be
- /// re-estimated are put into BlockWorkList/LoopWorkList.
- bool updateEstimatedBlockWeight(LoopBlock &LoopBB, uint32_t BBWeight,
- SmallVectorImpl<BasicBlock *> &BlockWorkList,
- SmallVectorImpl<LoopBlock> &LoopWorkList);
-
- /// Starting from \p LoopBB (including \p LoopBB itself) propagate \p BBWeight
- /// up the domination tree.
- void propagateEstimatedBlockWeight(const LoopBlock &LoopBB, DominatorTree *DT,
- PostDominatorTree *PDT, uint32_t BBWeight,
- SmallVectorImpl<BasicBlock *> &WorkList,
- SmallVectorImpl<LoopBlock> &LoopWorkList);
-
- /// Returns block's weight encoded in the IR.
- std::optional<uint32_t> getInitialEstimatedBlockWeight(const BasicBlock *BB);
-
- // Computes estimated weights for all blocks in \p F.
- void estimateBlockWeights(const Function &F, DominatorTree *DT,
- PostDominatorTree *PDT);
-
- /// Based on computed weights by \p computeEstimatedBlockWeight set
- /// probabilities on branches.
- bool calcEstimatedHeuristics(const BasicBlock *BB);
- bool calcMetadataWeights(const BasicBlock *BB);
- bool calcPointerHeuristics(const BasicBlock *BB);
- bool calcZeroHeuristics(const BasicBlock *BB, const TargetLibraryInfo *TLI);
- bool calcFloatingPointHeuristics(const BasicBlock *BB);
+ unsigned BlockNumberEpoch;
};
/// Analysis pass which computes \c BranchProbabilityInfo.
@@ -465,7 +241,6 @@ class LLVM_ABI BranchProbabilityInfoWrapperPass : public FunctionPass {
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnFunction(Function &F) override;
- void releaseMemory() override;
void print(raw_ostream &OS, const Module *M = nullptr) const override;
};
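The flattened storage above (``Probs`` indexed through ``EdgeStarts``) replaces
the old ``DenseMap`` keyed on (block, successor index) pairs. How ``getEdges``
slices that storage is not shown in this header; the following is a speculative
sketch only, and the use of ``BasicBlock::getNumber()`` and ``succ_size`` is an
assumption rather than something taken from the patch:

  // Illustrative only: one contiguous run of probabilities per block,
  // starting at EdgeStarts[block number], one entry per successor.
  ArrayRef<BranchProbability>
  BranchProbabilityInfo::getEdges(const BasicBlock *BB) const {
    unsigned First = EdgeStarts[BB->getNumber()];
    return ArrayRef<BranchProbability>(Probs).slice(First, succ_size(BB));
  }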
diff --git a/llvm/include/llvm/Analysis/CFGPrinter.h b/llvm/include/llvm/Analysis/CFGPrinter.h
index 5ff6fe10c77db..c48d2e5887919 100644
--- a/llvm/include/llvm/Analysis/CFGPrinter.h
+++ b/llvm/include/llvm/Analysis/CFGPrinter.h
@@ -240,7 +240,7 @@ struct DOTGraphTraits<DOTFuncInfo *> : public DefaultDOTGraphTraits {
// Label source of switch edges with the associated value.
if (const SwitchInst *SI = dyn_cast<SwitchInst>(Node->getTerminator())) {
- unsigned SuccNo = I.getSuccessorIndex();
+ unsigned SuccNo = std::distance(succ_begin(SI), I);
if (SuccNo == 0)
return "def";
@@ -272,8 +272,8 @@ struct DOTGraphTraits<DOTFuncInfo *> : public DefaultDOTGraphTraits {
if (!CFGInfo->showEdgeWeights())
return "";
- unsigned OpNo = I.getSuccessorIndex();
const Instruction *TI = Node->getTerminator();
+ unsigned OpNo = std::distance(succ_begin(TI), I);
BasicBlock *SuccBB = TI->getSuccessor(OpNo);
auto BranchProb = CFGInfo->getBPI()->getEdgeProbability(Node, SuccBB);
double WeightPercent = ((double)BranchProb.getNumerator()) /
@@ -310,7 +310,7 @@ struct DOTGraphTraits<DOTFuncInfo *> : public DefaultDOTGraphTraits {
if (!WeightsNode)
return TTAttr;
- OpNo = I.getSuccessorIndex() + 1;
+ OpNo += 1;
if (OpNo >= WeightsNode->getNumOperands())
return TTAttr;
ConstantInt *Weight =
diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h
index f38de81d76a2c..dc54dfa64b539 100644
--- a/llvm/include/llvm/Analysis/DXILResource.h
+++ b/llvm/include/llvm/Analysis/DXILResource.h
@@ -384,13 +384,18 @@ class ResourceInfo {
return !(*this == RHS);
}
bool operator<(const ResourceBinding &RHS) const {
- return std::tie(RecordID, Space, LowerBound, Size) <
- std::tie(RHS.RecordID, RHS.Space, RHS.LowerBound, RHS.Size);
+ // A size of 0 indicates an unbounded range; comparing the unbounded flag
+ // before the raw size keeps the ordering well defined.
+ const bool LHSIsUnbounded = Size == 0;
+ const bool RHSIsUnbounded = RHS.Size == 0;
+ return std::tie(RecordID, Space, LowerBound, LHSIsUnbounded, Size) <
+ std::tie(RHS.RecordID, RHS.Space, RHS.LowerBound, RHSIsUnbounded,
+ RHS.Size);
}
bool overlapsWith(const ResourceBinding &RHS) const {
if (Space != RHS.Space)
return false;
- if (Size == UINT32_MAX)
+ if (Size == 0)
return LowerBound < RHS.LowerBound;
return LowerBound + Size - 1 >= RHS.LowerBound;
}
diff --git a/llvm/include/llvm/Analysis/DependenceAnalysis.h b/llvm/include/llvm/Analysis/DependenceAnalysis.h
index 26589571a65e2..cab9872652073 100644
--- a/llvm/include/llvm/Analysis/DependenceAnalysis.h
+++ b/llvm/include/llvm/Analysis/DependenceAnalysis.h
@@ -541,9 +541,7 @@ class DependenceInfo {
/// Returns true if any possible dependence is disproved.
/// If there might be a dependence, returns false.
/// Sets appropriate direction entry.
- bool exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
- const SCEV *SrcConst, const SCEV *DstConst,
- const Loop *CurrentSrcLoop, const Loop *CurrentDstLoop,
+ bool exactSIVtest(const SCEVAddRecExpr *Src, const SCEVAddRecExpr *Dst,
unsigned Level, FullDependence &Result) const;
/// weakZeroSrcSIVtest - Tests the weak-zero SIV subscript pair
diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h
index 922c65d5c8340..968873c918393 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -292,6 +292,7 @@ class Vocabulary {
VectorTy,
TokenTy,
IntegerTy,
+ ByteTy,
FunctionTy,
PointerTy,
StructTy,
@@ -465,9 +466,9 @@ class Vocabulary {
/// String mappings for CanonicalTypeID values
static constexpr StringLiteral CanonicalTypeNames[] = {
- "FloatTy", "VoidTy", "LabelTy", "MetadataTy",
- "VectorTy", "TokenTy", "IntegerTy", "FunctionTy",
- "PointerTy", "StructTy", "ArrayTy", "UnknownTy"};
+ "FloatTy", "VoidTy", "LabelTy", "MetadataTy", "VectorTy",
+ "TokenTy", "IntegerTy", "ByteTy", "FunctionTy", "PointerTy",
+ "StructTy", "ArrayTy", "UnknownTy"};
static_assert(std::size(CanonicalTypeNames) ==
static_cast<unsigned>(CanonicalTypeID::MaxCanonicalType),
"CanonicalTypeNames array size must match MaxCanonicalType");
@@ -495,6 +496,7 @@ class Vocabulary {
CanonicalTypeID::VectorTy, // X86_AMXTyID
CanonicalTypeID::TokenTy, // TokenTyID
CanonicalTypeID::IntegerTy, // IntegerTyID
+ CanonicalTypeID::ByteTy, // ByteTyID
CanonicalTypeID::FunctionTy, // FunctionTyID
CanonicalTypeID::PointerTy, // PointerTyID
CanonicalTypeID::StructTy, // StructTyID
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index bcf596a0d79b2..4ac3029175d0d 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -177,6 +177,8 @@ enum TypeCodes {
TYPE_CODE_OPAQUE_POINTER = 25, // OPAQUE_POINTER: [addrspace]
TYPE_CODE_TARGET_TYPE = 26, // TARGET_TYPE
+
+ TYPE_CODE_BYTE = 27, // BYTE: [width]
};
enum OperandBundleTagCode {
@@ -440,6 +442,8 @@ enum ConstantsCodes {
CST_CODE_PTRAUTH = 33, // [ptr, key, disc, addrdisc]
CST_CODE_PTRAUTH2 = 34, // [ptr, key, disc, addrdisc,
// deactivation_symbol]
+ CST_CODE_BYTE = 35, // BYTE: [intval]
+ CST_CODE_WIDE_BYTE = 36, // WIDE_BYTE: [n x intval]
};
/// CastOpcodes - These are values used in the bitcode files to encode which
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index a846aad90bc2b..fa578f733d4e8 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1577,6 +1577,13 @@ enum NodeType {
/// Output: Output Chain
EXPERIMENTAL_VECTOR_HISTOGRAM,
+ /// Returns the number of trailing (least significant) zero elements
+ /// in a vector. Has a single i1 vector operand. The result is poison if the
+ /// return type isn't wide enough to hold the maximum number of elements in
+ /// the input vector.
+ CTTZ_ELTS,
+ CTTZ_ELTS_ZERO_POISON,
+
/// Finds the index of the last active mask element
/// Operands: Mask
VECTOR_FIND_LAST_ACTIVE,
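
As a rough illustration of the node above (a sketch, not code from this patch; a SelectionDAG `DAG`, an SDLoc `DL`, and a hypothetical i1 vector value `Mask` are assumed): with element 0 as the least significant lane, a mask of <i1 0, 0, 1, 0> yields 2.

  // Count trailing zero elements of an i1 vector mask; the result type must
  // be wide enough for the maximum element count or the node is poison.
  SDValue NumTrailingZeroElts =
      DAG.getNode(ISD::CTTZ_ELTS, DL, MVT::i64, Mask);
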
diff --git a/llvm/include/llvm/CodeGen/MachineInstrBundle.h b/llvm/include/llvm/CodeGen/MachineInstrBundle.h
index 65eb5c4cf7c25..6c5063fb1f063 100644
--- a/llvm/include/llvm/CodeGen/MachineInstrBundle.h
+++ b/llvm/include/llvm/CodeGen/MachineInstrBundle.h
@@ -301,6 +301,20 @@ class FinalizeBundleTestPass : public PassInfoMixin<FinalizeBundleTestPass> {
MachineFunctionAnalysisManager &MFAM);
};
+class UnpackMachineBundlesPass
+ : public PassInfoMixin<UnpackMachineBundlesPass> {
+
+public:
+ UnpackMachineBundlesPass(
+ std::function<bool(const MachineFunction &)> Ftor = nullptr)
+ : PredicateFtor(std::move(Ftor)) {}
+ PreservedAnalyses LLVM_ABI run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM);
+
+private:
+ std::function<bool(const MachineFunction &)> PredicateFtor;
+};
+
} // End llvm namespace
#endif
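
A minimal usage sketch under the new pass manager, assuming a MachineFunctionPassManager `MFPM` is being populated; the predicate shown is a hypothetical filter, not required.

  // Unpack bundles in every machine function.
  MFPM.addPass(UnpackMachineBundlesPass());
  // Or only where the optional predicate returns true.
  MFPM.addPass(UnpackMachineBundlesPass(
      [](const MachineFunction &MF) { return !MF.empty(); }));
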
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 6f7c3bd177cb0..396c40d360eef 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -454,8 +454,8 @@ LLVM_ABI extern char &FinalizeISelID;
/// UnpackMachineBundles - This pass unpacks machine instruction bundles.
LLVM_ABI extern char &UnpackMachineBundlesID;
-LLVM_ABI FunctionPass *
-createUnpackMachineBundles(std::function<bool(const MachineFunction &)> Ftor);
+LLVM_ABI FunctionPass *createUnpackMachineBundlesLegacy(
+ std::function<bool(const MachineFunction &)> Ftor);
/// StackMapLiveness - This pass analyses the register live-out set of
/// stackmap/patchpoint intrinsics and attaches the calculated information to
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index ce83de8e4cba9..94fde52de11b3 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -242,6 +242,9 @@
/* Define if ICU library is available */
#cmakedefine01 HAVE_ICU
+/* Define if Windows vendored ICU is available */
+#cmakedefine01 HAVE_WINDOWS_ICU
+
/* Define if iconv library is available */
#cmakedefine01 HAVE_ICONV
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index cb688959f7519..d1dddf76152ec 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -453,7 +453,7 @@ def OMPC_Partial: Clause<[Spelling<"partial">]> {
}
def OMPC_Permutation: Clause<[Spelling<"permutation">]> {
let clangClass = "OMPPermutationClause";
- let flangClass = "ScalarIntExpr";
+ let flangClass = "ScalarIntConstantExpr";
let isValueList = true;
}
def OMPC_Priority : Clause<[Spelling<"priority">]> {
@@ -829,6 +829,7 @@ def OMP_DeclareTarget : Directive<[Spelling<"declare target", 1, 52>,
VersionedClause<OMPC_Enter, 52>,
VersionedClause<OMPC_Indirect, 51>,
VersionedClause<OMPC_Link>,
+ VersionedClause<OMPC_Local, 60>,
VersionedClause<OMPC_To>,
];
let allowedOnceClauses = [
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 9885ffc8b2065..042c3c75e9cb8 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -14,6 +14,7 @@
#ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
#define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
+#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Frontend/Atomic/Atomic.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
@@ -1510,6 +1511,16 @@ class OpenMPIRBuilder {
: DepKind(DepKind), DepValueType(DepValueType), DepVal(DepVal) {}
};
+ /// Return the LLVM struct type matching runtime `kmp_task_affinity_info_t`.
+ /// `{ kmp_intptr_t base_addr; size_t len; flags (bitfield storage as i32) }`
+ LLVM_ABI llvm::StructType *getKmpTaskAffinityInfoTy();
+
+ /// A struct to pack the relevant information for an OpenMP affinity clause.
+ struct AffinityData {
+ Value *Count; // number of kmp_task_affinity_info_t entries
+ Value *Info; // kmp_task_affinity_info_t
+ };
+
/// Generator for `#omp taskloop`
///
/// \param Loc The location where the taskloop construct was encountered.
@@ -1568,17 +1579,21 @@ class OpenMPIRBuilder {
/// cannot be resumed until execution of the structured
/// block that is associated with the generated task is
/// completed.
+ /// \param Dependencies Vector of DependData objects holding information of
+ /// dependencies as specified by the 'depend' clause.
+ /// \param Affinities AffinityData object holding information of accumulated
+ /// affinities as specified by the 'affinity' clause.
/// \param EventHandle If present, signifies the event handle as part of
/// the detach clause
/// \param Mergeable If the given task is `mergeable`
/// \param Priority `priority-value' specifies the execution order of the
/// tasks that are generated by the construct
- LLVM_ABI InsertPointOrErrorTy
- createTask(const LocationDescription &Loc, InsertPointTy AllocaIP,
- BodyGenCallbackTy BodyGenCB, bool Tied = true,
- Value *Final = nullptr, Value *IfCondition = nullptr,
- SmallVector<DependData> Dependencies = {}, bool Mergeable = false,
- Value *EventHandle = nullptr, Value *Priority = nullptr);
+ LLVM_ABI InsertPointOrErrorTy createTask(
+ const LocationDescription &Loc, InsertPointTy AllocaIP,
+ BodyGenCallbackTy BodyGenCB, bool Tied = true, Value *Final = nullptr,
+ Value *IfCondition = nullptr, SmallVector<DependData> Dependencies = {},
+ AffinityData Affinities = {}, bool Mergeable = false,
+ Value *EventHandle = nullptr, Value *Priority = nullptr);
/// Generator for the taskgroup construct
///
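
A caller-side sketch of the updated signature (illustrative only; `OMPBuilder`, `Loc`, `AllocaIP`, `BodyGenCB`, `NumAffin` and `AffinArray` are assumed to exist from earlier lowering of the construct):

  OpenMPIRBuilder::AffinityData Affinities{/*Count=*/NumAffin,
                                           /*Info=*/AffinArray};
  OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createTask(
      Loc, AllocaIP, BodyGenCB, /*Tied=*/true, /*Final=*/nullptr,
      /*IfCondition=*/nullptr, /*Dependencies=*/{}, Affinities,
      /*Mergeable=*/false, /*EventHandle=*/nullptr, /*Priority=*/nullptr);
  // Propagate or handle the error carried by AfterIP as usual.
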
@@ -3253,7 +3268,7 @@ class OpenMPIRBuilder {
/// The `omp target` interface
///
/// For more information about the usage of this interface,
- /// \see openmp/libomptarget/deviceRTLs/common/include/target.h
+ /// \see openmp/device/include/Interface.h
///
///{
@@ -3926,6 +3941,109 @@ class OpenMPIRBuilder {
LLVM_ABI GlobalVariable *
getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
std::optional<unsigned> AddressSpace = {});
+
+ using IteratorBodyGenTy = llvm::function_ref<llvm::Error(
+ InsertPointTy BodyIP, llvm::Value *LinearIV)>;
+
+ /// Create a canonical iterator loop at the current insertion point.
+ ///
+ /// This helper splits the current block and builds a canonical loop
+ /// using createLoopSkeleton(). The resulting control flow looks like:
+ ///
+ /// CurBB -> Preheader -> Header -> Body -> Latch -> After -> ContBB
+ ///
+ /// The body of the loop is produced by calling \p BodyGen with the insertion
+ /// point for the loop body and the induction variable.
+ /// Unlike createCanonicalLoop(), this function is intended for \p BodyGen
+ /// that may perform region lowering (e.g., translating MLIR regions) and are
+ /// not guaranteed to preserve the canonical skeleton's body terminator. In
+ /// particular:
+ ///
+ /// - The skeleton’s unconditional branch from the loop body is removed
+ /// before invoking \p BodyGen.
+ /// - \p BodyGen may freely emit instructions and temporarily introduce
+ /// control flow.
+ /// - If the loop body does not end with a terminator after \p BodyGen
+ /// returns, a branch to the latch is inserted to restore canonical form.
+ ///
+ /// \param Loc The location where the iterator modifier was encountered.
+ /// \param TripCount Number of loop iterations.
+ /// \param BodyGen Callback to generate the loop body.
+ /// \param Name Base name used for creating the loop
+ /// \returns The insertion position *after* the iterator loop
+ LLVM_ABI InsertPointOrErrorTy createIteratorLoop(
+ LocationDescription Loc, llvm::Value *TripCount,
+ IteratorBodyGenTy BodyGen, llvm::StringRef Name = "iterator");
+
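
For illustration, a caller might drive createIteratorLoop like this (a sketch only; `OMPBuilder`, `Builder`, `Loc` and a precomputed `TripCount` are assumed, and the loop name is made up):

  auto BodyGen = [&](OpenMPIRBuilder::InsertPointTy BodyIP,
                     llvm::Value *LinearIV) -> llvm::Error {
    Builder.restoreIP(BodyIP);
    // ... emit the per-iteration body using the linear induction value ...
    return llvm::Error::success();
  };
  OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
      OMPBuilder.createIteratorLoop(Loc, TripCount, BodyGen, "affinity.iter");
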
+ /// Kind of parameter in a function with 'declare simd' directive.
+ enum class DeclareSimdKindTy {
+ Linear,
+ LinearRef,
+ LinearUVal,
+ LinearVal,
+ Uniform,
+ Vector,
+ };
+
+ /// Attribute set of the `declare simd` parameter.
+ struct DeclareSimdAttrTy {
+ DeclareSimdKindTy Kind = DeclareSimdKindTy::Vector;
+ llvm::APSInt StrideOrArg;
+ llvm::APSInt Alignment;
+ bool HasVarStride = false;
+ };
+
+ enum class DeclareSimdBranch {
+ Undefined,
+ Inbranch,
+ Notinbranch,
+ };
+
+ /// Emit x86 vector-function ABI attributes for a `declare simd` function.
+ ///
+ /// Generates and attaches `_ZGV*` vector function ABI attributes to \p Fn
+ /// following the x86 vector ABI used by OpenMP `declare simd`. For each
+ /// supported ISA (SSE, AVX, AVX2, AVX512) and masking variant, this
+ /// constructs the appropriate mangled vector-function name and adds it as a
+ /// function attribute.
+ ///
+ /// \param Fn The scalar function to which vector-function attributes
+ /// are attached.
+ /// \param NumElements Number of elements used to derive the vector length
+ /// when \p VLENVal is not specified.
+ /// \param VLENVal User provided vector length.
+ /// \param ParamAttrs Array of attribute sets of the `declare simd` parameters.
+ /// \param Branch `undefined`, `inbranch` or `notinbranch` clause.
+ LLVM_ABI void emitX86DeclareSimdFunction(
+ llvm::Function *Fn, unsigned NumElements, const llvm::APSInt &VLENVal,
+ llvm::ArrayRef<DeclareSimdAttrTy> ParamAttrs, DeclareSimdBranch Branch);
+
+ /// Emit AArch64 vector-function ABI attributes for a `declare simd` function.
+ ///
+ /// Generates and attaches `_ZGV*` vector function ABI attributes to \p Fn
+ /// following the AArch64 vector-function ABI. The emitted names depend on the
+ /// selected ISA, user-specified vector length, parameter attribute mangling,
+ /// and the declare simd branch clause.
+ ///
+ /// \param Fn The scalar function to which vector-function
+ /// attributes are attached.
+ /// \param VLENVal User provided vector length.
+ /// \param ParamAttrs Array of attribute sets of the `declare simd`
+ /// parameters.
+ /// \param Branch `undefined`, `inbranch` or `notinbranch`
+ /// clause.
+ /// \param ISA `'n'` for Advanced SIMD or `'s'` for SVE.
+ /// \param NarrowestDataSize Narrowest data size in bits used to infer the
+ /// default vector length when \p VLENVal is
+ /// absent.
+ /// \param OutputBecomesInput Whether result values are represented as input
+ /// parameters in the emitted vector-function ABI
+ /// name.
+ LLVM_ABI void emitAArch64DeclareSimdFunction(
+ llvm::Function *Fn, unsigned VLENVal,
+ llvm::ArrayRef<DeclareSimdAttrTy> ParamAttrs, DeclareSimdBranch Branch,
+ char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput);
};
/// Class to represent the control flow structure of an OpenMP canonical loop.
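
To illustrate the declare simd helpers (a sketch with made-up values; `OMPBuilder` and a two-parameter scalar function `Fn` are assumed, and an empty APSInt is used here to stand in for "no user-specified simdlen"):

  SmallVector<OpenMPIRBuilder::DeclareSimdAttrTy, 2> ParamAttrs(2);
  ParamAttrs[0].Kind = OpenMPIRBuilder::DeclareSimdKindTy::Uniform;
  ParamAttrs[1].Kind = OpenMPIRBuilder::DeclareSimdKindTy::Linear;
  ParamAttrs[1].StrideOrArg = llvm::APSInt::get(1); // linear step of 1
  OMPBuilder.emitX86DeclareSimdFunction(
      Fn, /*NumElements=*/4, /*VLENVal=*/llvm::APSInt(), ParamAttrs,
      OpenMPIRBuilder::DeclareSimdBranch::Undefined);
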
diff --git a/llvm/include/llvm/IR/CFG.h b/llvm/include/llvm/IR/CFG.h
index 96d3b2fbb5b0b..c68c2f2e53d81 100644
--- a/llvm/include/llvm/IR/CFG.h
+++ b/llvm/include/llvm/IR/CFG.h
@@ -135,124 +135,20 @@ inline const_pred_range predecessors(const BasicBlock *BB) {
// Instruction and BasicBlock succ_iterator helpers
//===----------------------------------------------------------------------===//
-template <class InstructionT, class BlockT>
-class SuccIterator
- : public iterator_facade_base<SuccIterator<InstructionT, BlockT>,
- std::random_access_iterator_tag, BlockT, int,
- BlockT *, BlockT *> {
-public:
- using value_type = BlockT *;
- using difference_type = std::ptrdiff_t;
- using pointer = BlockT *;
- using reference = BlockT *;
-
-private:
- InstructionT *Inst;
- int Idx;
- using Self = SuccIterator<InstructionT, BlockT>;
-
- inline bool index_is_valid(int Idx) {
- // Note that we specially support the index of zero being valid even in the
- // face of a null instruction.
- return Idx >= 0 && (Idx == 0 || Idx <= (int)Inst->getNumSuccessors());
- }
-
- /// Proxy object to allow write access in operator[]
- class SuccessorProxy {
- Self It;
-
- public:
- explicit SuccessorProxy(const Self &It) : It(It) {}
-
- SuccessorProxy(const SuccessorProxy &) = default;
-
- SuccessorProxy &operator=(SuccessorProxy RHS) {
- *this = reference(RHS);
- return *this;
- }
-
- SuccessorProxy &operator=(reference RHS) {
- It.Inst->setSuccessor(It.Idx, RHS);
- return *this;
- }
-
- operator reference() const { return *It; }
- };
-
-public:
- // begin iterator
- explicit inline SuccIterator(InstructionT *Inst) : Inst(Inst), Idx(0) {}
- // end iterator
- inline SuccIterator(InstructionT *Inst, bool) : Inst(Inst) {
- if (Inst)
- Idx = Inst->getNumSuccessors();
- else
- // Inst == NULL happens, if a basic block is not fully constructed and
- // consequently getTerminator() returns NULL. In this case we construct
- // a SuccIterator which describes a basic block that has zero
- // successors.
- // Defining SuccIterator for incomplete and malformed CFGs is especially
- // useful for debugging.
- Idx = 0;
- }
-
- /// This is used to interface between code that wants to
- /// operate on terminator instructions directly.
- int getSuccessorIndex() const { return Idx; }
-
- inline bool operator==(const Self &x) const { return Idx == x.Idx; }
-
- inline BlockT *operator*() const { return Inst->getSuccessor(Idx); }
-
- // We use the basic block pointer directly for operator->.
- inline BlockT *operator->() const { return operator*(); }
-
- inline bool operator<(const Self &RHS) const {
- assert(Inst == RHS.Inst && "Cannot compare iterators of different blocks!");
- return Idx < RHS.Idx;
- }
-
- int operator-(const Self &RHS) const {
- assert(Inst == RHS.Inst && "Cannot compare iterators of different blocks!");
- return Idx - RHS.Idx;
- }
-
- inline Self &operator+=(int RHS) {
- int NewIdx = Idx + RHS;
- assert(index_is_valid(NewIdx) && "Iterator index out of bound");
- Idx = NewIdx;
- return *this;
- }
-
- inline Self &operator-=(int RHS) { return operator+=(-RHS); }
-
- // Specially implement the [] operation using a proxy object to support
- // assignment.
- inline SuccessorProxy operator[](int Offset) {
- Self TmpIt = *this;
- TmpIt += Offset;
- return SuccessorProxy(TmpIt);
- }
-
- /// Get the source BlockT of this iterator.
- inline BlockT *getSource() {
- assert(Inst && "Source not available, if basic block was malformed");
- return Inst->getParent();
- }
-};
-
-using succ_iterator = SuccIterator<Instruction, BasicBlock>;
-using const_succ_iterator = SuccIterator<const Instruction, const BasicBlock>;
+using succ_iterator = Instruction::succ_iterator;
+using const_succ_iterator = Instruction::const_succ_iterator;
using succ_range = iterator_range<succ_iterator>;
using const_succ_range = iterator_range<const_succ_iterator>;
-inline succ_iterator succ_begin(Instruction *I) { return succ_iterator(I); }
+inline succ_iterator succ_begin(Instruction *I) {
+ return I->successors().begin();
+}
inline const_succ_iterator succ_begin(const Instruction *I) {
- return const_succ_iterator(I);
+ return I->successors().begin();
}
-inline succ_iterator succ_end(Instruction *I) { return succ_iterator(I, true); }
+inline succ_iterator succ_end(Instruction *I) { return I->successors().end(); }
inline const_succ_iterator succ_end(const Instruction *I) {
- return const_succ_iterator(I, true);
+ return I->successors().end();
}
inline bool succ_empty(const Instruction *I) {
return succ_begin(I) == succ_end(I);
@@ -260,24 +156,22 @@ inline bool succ_empty(const Instruction *I) {
inline unsigned succ_size(const Instruction *I) {
return std::distance(succ_begin(I), succ_end(I));
}
-inline succ_range successors(Instruction *I) {
- return succ_range(succ_begin(I), succ_end(I));
-}
+inline succ_range successors(Instruction *I) { return I->successors(); }
inline const_succ_range successors(const Instruction *I) {
- return const_succ_range(succ_begin(I), succ_end(I));
+ return I->successors();
}
inline succ_iterator succ_begin(BasicBlock *BB) {
- return succ_iterator(BB->getTerminator());
+ return succ_begin(BB->getTerminator());
}
inline const_succ_iterator succ_begin(const BasicBlock *BB) {
- return const_succ_iterator(BB->getTerminator());
+ return succ_begin(BB->getTerminator());
}
inline succ_iterator succ_end(BasicBlock *BB) {
- return succ_iterator(BB->getTerminator(), true);
+ return succ_end(BB->getTerminator());
}
inline const_succ_iterator succ_end(const BasicBlock *BB) {
- return const_succ_iterator(BB->getTerminator(), true);
+ return succ_end(BB->getTerminator());
}
inline bool succ_empty(const BasicBlock *BB) {
return succ_begin(BB) == succ_end(BB);
@@ -286,10 +180,10 @@ inline unsigned succ_size(const BasicBlock *BB) {
return std::distance(succ_begin(BB), succ_end(BB));
}
inline succ_range successors(BasicBlock *BB) {
- return succ_range(succ_begin(BB), succ_end(BB));
+ return successors(BB->getTerminator());
}
inline const_succ_range successors(const BasicBlock *BB) {
- return const_succ_range(succ_begin(BB), succ_end(BB));
+ return successors(BB->getTerminator());
}
//===--------------------------------------------------------------------===//
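
Call sites keep their usual shape after this change; a typical traversal, assuming a BasicBlock `BB` and a worklist of blocks:

  // Both the free functions and the new member range behave the same way.
  for (BasicBlock *Succ : successors(BB))
    Worklist.push_back(Succ);
  for (BasicBlock *Succ : BB->getTerminator()->successors())
    Worklist.push_back(Succ);
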
diff --git a/llvm/include/llvm/IR/Constants.h b/llvm/include/llvm/IR/Constants.h
index a1f667353b26e..2b3f4119e50e2 100644
--- a/llvm/include/llvm/IR/Constants.h
+++ b/llvm/include/llvm/IR/Constants.h
@@ -276,6 +276,144 @@ class ConstantInt final : public ConstantData {
}
};
+//===----------------------------------------------------------------------===//
+/// Class for constant bytes.
+class ConstantByte final : public ConstantData {
+ friend class Constant;
+ friend class ConstantVector;
+
+ APInt Val;
+
+ ConstantByte(Type *Ty, const APInt &V);
+
+ void destroyConstantImpl();
+
+ /// Return a ConstantByte with the specified value and an implied Type. The
+ /// type is the vector type whose byte element type corresponds to the bit
+ /// width of the value.
+ static ConstantByte *get(LLVMContext &Context, ElementCount EC,
+ const APInt &V);
+
+public:
+ ConstantByte(const ConstantByte &) = delete;
+
+ /// If Ty is a vector type, return a Constant with a splat of the given
+ /// value. Otherwise return a ConstantByte for the given value.
+ /// \param ImplicitTrunc Whether to allow implicit truncation of the value.
+ LLVM_ABI static Constant *get(Type *Ty, uint64_t V, bool isSigned = false,
+ bool ImplicitTrunc = false);
+
+ /// Return a ConstantByte with the specified byte value for the specified
+ /// type. If the type is wider than 64 bits, the value will be zero-extended
+ /// to fit the type, unless IsSigned is true, in which case the value will
+ /// be interpreted as a 64-bit signed byte and sign-extended to fit
+ /// the type.
+ /// \param ImplicitTrunc Whether to allow implicit truncation of the value.
+ LLVM_ABI static ConstantByte *get(ByteType *Ty, uint64_t V,
+ bool isSigned = false,
+ bool ImplicitTrunc = false);
+
+ /// Return a ConstantByte with the specified value for the specified type. The
+ /// value V will be canonicalized to an unsigned APInt. Accessing it with
+ /// either getSExtValue() or getZExtValue() will yield a correctly sized and
+ /// signed value for the type Ty.
+ /// Get a ConstantByte for a specific signed value.
+ /// \param ImplicitTrunc Whether to allow implicit truncation of the value.
+ static ConstantByte *getSigned(ByteType *Ty, int64_t V,
+ bool ImplicitTrunc = false) {
+ return get(Ty, V, /*IsSigned=*/true, ImplicitTrunc);
+ }
+ static Constant *getSigned(Type *Ty, int64_t V, bool ImplicitTrunc = false) {
+ return get(Ty, V, /*IsSigned=*/true, ImplicitTrunc);
+ }
+
+ /// Return a ConstantByte with the specified value and an implied Type. The
+ /// type is the byte type that corresponds to the bit width of the value.
+ LLVM_ABI static ConstantByte *get(LLVMContext &Context, const APInt &V);
+
+ /// Return a ConstantByte constructed from the string Str with the given
+ /// radix.
+ LLVM_ABI static ConstantByte *get(ByteType *Ty, StringRef Str, uint8_t Radix);
+
+ /// If Ty is a vector type, return a Constant with a splat of the given
+ /// value. Otherwise return a ConstantByte for the given value.
+ LLVM_ABI static Constant *get(Type *Ty, const APInt &V);
+
+ /// Return the constant as an APInt value reference. This allows clients to
+ /// obtain a full-precision copy of the value.
+ /// Return the constant's value.
+ inline const APInt &getValue() const { return Val; }
+
+ /// getBitWidth - Return the scalar bitwidth of this constant.
+ unsigned getBitWidth() const { return Val.getBitWidth(); }
+
+ /// Return the constant as a 64-bit byte value after it
+ /// has been zero extended as appropriate for the type of this constant. Note
+ /// that this method can assert if the value does not fit in 64 bits.
+ /// Return the zero extended value.
+ inline uint64_t getZExtValue() const { return Val.getZExtValue(); }
+
+ /// Return the constant as a 64-bit byte value after it has been sign
+ /// extended as appropriate for the type of this constant. Note that
+ /// this method can assert if the value does not fit in 64 bits.
+ /// Return the sign extended value.
+ inline int64_t getSExtValue() const { return Val.getSExtValue(); }
+
+ /// Variant of the getType() method to always return a ByteType, which
+ /// reduces the amount of casting needed in parts of the compiler.
+ inline ByteType *getByteType() const {
+ return cast<ByteType>(Value::getType());
+ }
+
+ bool isNegative() const { return Val.isNegative(); }
+
+ /// This is just a convenience method to make client code smaller for a
+ /// common case. It also correctly performs the comparison without the
+ /// potential for an assertion from getZExtValue().
+ bool isZero() const { return Val.isZero(); }
+
+ /// This is just a convenience method to make client code smaller for a
+ /// common case. It also correctly performs the comparison without the
+ /// potential for an assertion from getZExtValue().
+ /// Determine if the value is one.
+ bool isOne() const { return Val.isOne(); }
+
+ /// This function will return true iff every bit in this constant is set
+ /// to true.
+ /// @returns true iff this constant's bits are all set to true.
+ /// Determine if the value is all ones.
+ bool isMinusOne() const { return Val.isAllOnes(); }
+
+ /// This function will return true iff this constant represents the largest
+ /// value that may be represented by the constant's type.
+ /// @returns true iff this is the largest value that may be represented
+ /// by this type.
+ /// Determine if the value is maximal.
+ bool isMaxValue(bool IsSigned) const {
+ if (IsSigned)
+ return Val.isMaxSignedValue();
+ else
+ return Val.isMaxValue();
+ }
+
+ /// This function will return true iff this constant represents the smallest
+ /// value that may be represented by this constant's type.
+ /// @returns true if this is the smallest value that may be represented by
+ /// this type.
+ /// Determine if the value is minimal.
+ bool isMinValue(bool IsSigned) const {
+ if (IsSigned)
+ return Val.isMinSignedValue();
+ else
+ return Val.isMinValue();
+ }
+
+ /// Methods to support type inquiry through isa, cast, and dyn_cast.
+ static bool classof(const Value *V) {
+ return V->getValueID() == ConstantByteVal;
+ }
+};
+
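
A small sketch of building byte constants with the interface above, assuming an LLVMContext `Ctx`:

  ByteType *B8 = ByteType::get(Ctx, 8);
  ConstantByte *CB = ConstantByte::get(B8, 0xAB);          // scalar b8 constant
  Constant *Splat =
      ConstantByte::get(FixedVectorType::get(B8, 4), 0xAB); // <4 x b8> splat
  assert(CB->getZExtValue() == 0xAB && CB->getBitWidth() == 8);
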
//===----------------------------------------------------------------------===//
/// ConstantFP - Floating Point Values [float, double]
///
@@ -588,10 +726,10 @@ class ConstantPointerNull final : public ConstantData {
//===----------------------------------------------------------------------===//
/// ConstantDataSequential - A vector or array constant whose element type is a
-/// simple 1/2/4/8-byte integer or half/bfloat/float/double, and whose elements
-/// are just simple data values (i.e. ConstantInt/ConstantFP). This Constant
-/// node has no operands because it stores all of the elements of the constant
-/// as densely packed data, instead of as Value*'s.
+/// simple 1/2/4/8-byte integer/byte or half/bfloat/float/double, and whose
+/// elements are just simple data values (i.e. ConstantInt/ConstantByte/
+/// ConstantFP). This Constant node has no operands because it stores all of
+/// the elements of the constant as densely packed data, instead of as Value*'s.
///
/// This is the common base class of ConstantDataArray and ConstantDataVector.
///
@@ -661,7 +799,8 @@ class ConstantDataSequential : public ConstantData {
/// The size of the elements is known to be a multiple of one byte.
LLVM_ABI uint64_t getElementByteSize() const;
- /// This method returns true if this is an array of \p CharSize integers.
+ /// This method returns true if this is an array of \p CharSize integers or
+ /// bytes.
LLVM_ABI bool isString(unsigned CharSize = 8) const;
/// This method returns true if the array "isString", ends with a null byte,
@@ -699,8 +838,8 @@ class ConstantDataSequential : public ConstantData {
};
//===----------------------------------------------------------------------===//
-/// An array constant whose element type is a simple 1/2/4/8-byte integer or
-/// float/double, and whose elements are just simple data values
+/// An array constant whose element type is a simple 1/2/4/8-byte integer/byte
+/// or float/double, and whose elements are just simple data values
/// (i.e. ConstantInt/ConstantFP). This Constant node has no operands because it
/// stores all of the elements of the constant as densely packed data, instead
/// of as Value*'s.
@@ -733,9 +872,9 @@ class ConstantDataArray final : public ConstantDataSequential {
/// getRaw() constructor - Return a constant with array type with an element
/// count and element type matching the NumElements and ElementTy parameters
/// passed in. Note that this can return a ConstantAggregateZero object.
- /// ElementTy must be one of i8/i16/i32/i64/half/bfloat/float/double. Data is
- /// the buffer containing the elements. Be careful to make sure Data uses the
- /// right endianness, the buffer will be used as-is.
+ /// ElementTy must be one of i8/i16/i32/i64/b8/b16/b32/b64/half/bfloat/float/
+ /// double. Data is the buffer containing the elements. Be careful to make
+ /// sure Data uses the right endianness, the buffer will be used as-is.
static Constant *getRaw(StringRef Data, uint64_t NumElements,
Type *ElementTy) {
Type *Ty = ArrayType::get(ElementTy, NumElements);
@@ -752,13 +891,25 @@ class ConstantDataArray final : public ConstantDataSequential {
LLVM_ABI static Constant *getFP(Type *ElementType, ArrayRef<uint32_t> Elts);
LLVM_ABI static Constant *getFP(Type *ElementType, ArrayRef<uint64_t> Elts);
+ /// getByte() constructors - Return a constant of array type with a byte
+ /// element type taken from argument `ElementType', and count taken from
+ /// argument `Elts'. The amount of bits of the contained type must match the
+ /// number of bits of the type contained in the passed in ArrayRef.
+ /// Note that this can return a ConstantAggregateZero object.
+ LLVM_ABI static Constant *getByte(Type *ElementType, ArrayRef<uint8_t> Elts);
+ LLVM_ABI static Constant *getByte(Type *ElementType, ArrayRef<uint16_t> Elts);
+ LLVM_ABI static Constant *getByte(Type *ElementType, ArrayRef<uint32_t> Elts);
+ LLVM_ABI static Constant *getByte(Type *ElementType, ArrayRef<uint64_t> Elts);
+
/// This method constructs a CDS and initializes it with a text string.
/// The default behavior (AddNull==true) causes a null terminator to
/// be placed at the end of the array (increasing the length of the string by
/// one more than the StringRef would normally indicate). Pass AddNull=false
/// to disable this behavior.
- LLVM_ABI static Constant *
- getString(LLVMContext &Context, StringRef Initializer, bool AddNull = true);
+ LLVM_ABI static Constant *getString(LLVMContext &Context,
+ StringRef Initializer,
+ bool AddNull = true,
+ bool ByteString = false);
/// Specialize the getType() method to always return an ArrayType,
/// which reduces the amount of casting needed in parts of the compiler.
@@ -805,15 +956,24 @@ class ConstantDataVector final : public ConstantDataSequential {
/// getRaw() constructor - Return a constant with vector type with an element
/// count and element type matching the NumElements and ElementTy parameters
/// passed in. Note that this can return a ConstantAggregateZero object.
- /// ElementTy must be one of i8/i16/i32/i64/half/bfloat/float/double. Data is
- /// the buffer containing the elements. Be careful to make sure Data uses the
- /// right endianness, the buffer will be used as-is.
+ /// ElementTy must be one of i8/i16/i32/i64/b8/b16/b32/b64/half/bfloat/float/
+ /// double. Data is the buffer containing the elements. Be careful to make
+ /// sure Data uses the right endianness, the buffer will be used as-is.
static Constant *getRaw(StringRef Data, uint64_t NumElements,
Type *ElementTy) {
Type *Ty = VectorType::get(ElementTy, ElementCount::getFixed(NumElements));
return getImpl(Data, Ty);
}
+ /// getByte() constructors - Return a constant of vector type with a byte
+ /// element type taken from argument `ElementType', and count taken from
+ /// argument `Elts'. The amount of bits of the contained type must match the
+ /// number of bits of the type contained in the passed in ArrayRef.
+ LLVM_ABI static Constant *getByte(Type *ElementType, ArrayRef<uint8_t> Elts);
+ LLVM_ABI static Constant *getByte(Type *ElementType, ArrayRef<uint16_t> Elts);
+ LLVM_ABI static Constant *getByte(Type *ElementType, ArrayRef<uint32_t> Elts);
+ LLVM_ABI static Constant *getByte(Type *ElementType, ArrayRef<uint64_t> Elts);
+
/// getFP() constructors - Return a constant of vector type with a float
/// element type taken from argument `ElementType', and count taken from
/// argument `Elts'. The amount of bits of the contained type must match the
@@ -826,7 +986,8 @@ class ConstantDataVector final : public ConstantDataSequential {
/// Return a ConstantVector with the specified constant in each element.
/// The specified constant has to be a of a compatible type (i8/i16/
- /// i32/i64/half/bfloat/float/double) and must be a ConstantFP or ConstantInt.
+ /// i32/i64/b8/b16/b32/b64/half/bfloat/float/double) and must be a ConstantFP,
+ /// ConstantByte or ConstantInt.
LLVM_ABI static Constant *getSplat(unsigned NumElts, Constant *Elt);
/// Returns true if this is a splat constant, meaning that all elements have
diff --git a/llvm/include/llvm/IR/DataLayout.h b/llvm/include/llvm/IR/DataLayout.h
index 583963ab12e26..9c0ee38635b8e 100644
--- a/llvm/include/llvm/IR/DataLayout.h
+++ b/llvm/include/llvm/IR/DataLayout.h
@@ -641,6 +641,11 @@ class DataLayout {
/// This is always at least as good as the ABI alignment.
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const;
+ /// Returns a byte type with the same size as a pointer in the given address
+ /// space.
+ LLVM_ABI ByteType *getBytePtrType(LLVMContext &C,
+ unsigned AddressSpace = 0) const;
+
/// Returns an integer type with size at least as big as that of a
/// pointer in the given address space.
LLVM_ABI IntegerType *getIntPtrType(LLVMContext &C,
@@ -650,6 +655,10 @@ class DataLayout {
/// big as that of a pointer of the given pointer (vector of pointer) type.
LLVM_ABI Type *getIntPtrType(Type *) const;
+ /// Returns a byte (vector of byte) type with the same size as a pointer of
+ /// the given pointer (vector of pointer) type.
+ LLVM_ABI Type *getBytePtrType(Type *) const;
+
/// Returns the smallest integer type with size at least as big as
/// Width bits.
LLVM_ABI Type *getSmallestLegalIntType(LLVMContext &C,
@@ -788,6 +797,8 @@ inline TypeSize DataLayout::getTypeSizeInBits(Type *Ty) const {
case Type::StructTyID:
// Get the layout annotation... which is lazily created on demand.
return getStructLayout(cast<StructType>(Ty))->getSizeInBits();
+ case Type::ByteTyID:
+ return TypeSize::getFixed(Ty->getByteBitWidth());
case Type::IntegerTyID:
return TypeSize::getFixed(Ty->getIntegerBitWidth());
case Type::HalfTyID:
diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index b22177c9e6c2d..c272f091e31a4 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -100,6 +100,50 @@ unsigned Type::getIntegerBitWidth() const {
return cast<IntegerType>(this)->getBitWidth();
}
+/// Class to represent byte types.
+class ByteType : public Type {
+ friend class LLVMContextImpl;
+
+protected:
+ explicit ByteType(LLVMContext &C, unsigned NumBits) : Type(C, ByteTyID) {
+ setSubclassData(NumBits);
+ }
+
+public:
+ /// This enum is just used to hold constants we need for ByteType.
+ enum {
+ MIN_BYTE_BITS = 1, ///< Minimum number of bits that can be specified
+ MAX_BYTE_BITS =
+ (1 << 23) ///< Maximum number of bits that can be specified
+ ///< Note that bit width is stored in the Type class's
+ ///< SubclassData field which has 24 bits. SelectionDAG type
+ ///< legalization can require a power of 2 ByteType, so limit
+ ///< to the largest representable power of 2, 8388608.
+ };
+
+ /// This static method is the primary way of constructing a ByteType.
+ /// If a ByteType with the same NumBits value was previously instantiated,
+ /// that instance will be returned. Otherwise a new one will be created. Only
+ /// one instance with a given NumBits value is ever created.
+ /// Get or create a ByteType instance.
+ LLVM_ABI static ByteType *get(LLVMContext &C, unsigned NumBits);
+
+ /// Get the number of bits in this ByteType
+ unsigned getBitWidth() const { return getSubclassData(); }
+
+ /// For example, this is 0xFF for an 8 bit byte, 0xFFFF for b16, etc.
+ /// @returns a bit mask with ones set for all the bits of this type.
+ /// Get a bit mask for this type.
+ LLVM_ABI APInt getMask() const;
+
+ /// Methods for support type inquiry through isa, cast, and dyn_cast.
+ static bool classof(const Type *T) { return T->getTypeID() == ByteTyID; }
+};
+
+unsigned Type::getByteBitWidth() const {
+ return cast<ByteType>(this)->getBitWidth();
+}
+
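
For example (assuming a context `Ctx`), a 16-bit byte type reports its width and mask as documented above:

  ByteType *B16 = Type::getByte16Ty(Ctx);
  assert(B16->getBitWidth() == 16);
  assert(B16->getMask() == APInt::getAllOnes(16)); // 0xFFFF
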
/// Class to represent function types
///
class FunctionType : public Type {
diff --git a/llvm/include/llvm/IR/Dominators.h b/llvm/include/llvm/IR/Dominators.h
index 1209def5ac0bd..d860c1453bd73 100644
--- a/llvm/include/llvm/IR/Dominators.h
+++ b/llvm/include/llvm/IR/Dominators.h
@@ -111,12 +111,7 @@ class BasicBlockEdge {
return Start;
}
- const BasicBlock *getEnd() const {
- return End;
- }
-
- /// Check if this is the only edge between Start and End.
- LLVM_ABI bool isSingleEdge() const;
+ const BasicBlock *getEnd() const { return End; }
};
template <> struct DenseMapInfo<BasicBlockEdge> {
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index b0bf40304fc5d..4ed3d73c4a057 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -542,6 +542,24 @@ class IRBuilderBase {
// Type creation methods
//===--------------------------------------------------------------------===//
+ /// Fetch the type representing an 8-bit byte.
+ ByteType *getByte8Ty() { return Type::getByte8Ty(Context); }
+
+ /// Fetch the type representing a 16-bit byte.
+ ByteType *getByte16Ty() { return Type::getByte16Ty(Context); }
+
+ /// Fetch the type representing a 32-bit byte.
+ ByteType *getByte32Ty() { return Type::getByte32Ty(Context); }
+
+ /// Fetch the type representing a 64-bit byte.
+ ByteType *getByte64Ty() { return Type::getByte64Ty(Context); }
+
+ /// Fetch the type representing a 128-bit byte.
+ ByteType *getByte128Ty() { return Type::getByte128Ty(Context); }
+
+ /// Fetch the type representing an N-bit byte.
+ ByteType *getByteNTy(unsigned N) { return Type::getByteNTy(Context, N); }
+
/// Fetch the type representing a single bit
IntegerType *getInt1Ty() {
return Type::getInt1Ty(Context);
@@ -605,6 +623,12 @@ class IRBuilderBase {
return PointerType::get(Context, AddrSpace);
}
+ /// Fetch the byte type with the same size as a pointer in the given
+ /// address space.
+ ByteType *getBytePtrTy(const DataLayout &DL, unsigned AddrSpace = 0) {
+ return DL.getBytePtrType(Context, AddrSpace);
+ }
+
/// Fetch the type of an integer with size at least as big as that of a
/// pointer in the given address space.
IntegerType *getIntPtrTy(const DataLayout &DL, unsigned AddrSpace = 0) {
@@ -1176,17 +1200,16 @@ class IRBuilderBase {
return Insert(ReturnInst::Create(Context, V));
}
- /// Create a sequence of N insertvalue instructions,
- /// with one Value from the retVals array each, that build a aggregate
- /// return value one value at a time, and a ret instruction to return
- /// the resulting aggregate value.
+ /// Create a sequence of N insertvalue instructions, with one Value from the
+ /// RetVals array each, that build an aggregate return value one value at a
+ /// time, and a ret instruction to return the resulting aggregate value.
///
/// This is a convenience function for code that uses aggregate return values
/// as a vehicle for having multiple return values.
- ReturnInst *CreateAggregateRet(Value *const *retVals, unsigned N) {
+ ReturnInst *CreateAggregateRet(ArrayRef<Value *> RetVals) {
Value *V = PoisonValue::get(getCurrentFunctionReturnType());
- for (unsigned i = 0; i != N; ++i)
- V = CreateInsertValue(V, retVals[i], i, "mrv");
+ for (size_t i = 0, N = RetVals.size(); i != N; ++i)
+ V = CreateInsertValue(V, RetVals[i], i, "mrv");
return Insert(ReturnInst::Create(Context, V));
}
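
A short call-site sketch for the ArrayRef-based overload (assuming an IRBuilder<> `Builder` in a function whose return type is a two-element struct, with values `X` and `Y` already computed):

  // Byte-type helpers mirror the integer ones.
  ByteType *B32 = Builder.getByte32Ty();
  // Aggregate return now takes an ArrayRef instead of pointer + count.
  Value *RetVals[] = {X, Y};
  Builder.CreateAggregateRet(RetVals); // previously CreateAggregateRet(RetVals, 2)
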
diff --git a/llvm/include/llvm/IR/InlineAsm.h b/llvm/include/llvm/IR/InlineAsm.h
index fed0ccc5818f8..564f2e7df2dd3 100644
--- a/llvm/include/llvm/IR/InlineAsm.h
+++ b/llvm/include/llvm/IR/InlineAsm.h
@@ -444,6 +444,10 @@ class InlineAsm final : public Value {
}
};
+ static AsmDialect getDialect(unsigned ExtraInfo) {
+ return ExtraInfo & Extra_AsmDialect ? AD_Intel : AD_ATT;
+ }
+
static std::vector<StringRef> getExtraInfoNames(unsigned ExtraInfo) {
std::vector<StringRef> Result;
if (ExtraInfo & InlineAsm::Extra_HasSideEffects)
@@ -459,9 +463,7 @@ class InlineAsm final : public Value {
if (ExtraInfo & InlineAsm::Extra_MayUnwind)
Result.push_back("unwind");
- AsmDialect Dialect =
- InlineAsm::AsmDialect((ExtraInfo & InlineAsm::Extra_AsmDialect));
-
+ AsmDialect Dialect = getDialect(ExtraInfo);
if (Dialect == InlineAsm::AD_ATT)
Result.push_back("attdialect");
if (Dialect == InlineAsm::AD_Intel)
diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
index 7aea805e0b86b..61dc5ebef1b1d 100644
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -951,11 +951,15 @@ class CmpInst : public Instruction {
/// @returns true if the predicate is unsigned, false otherwise.
/// Determine if the predicate is an unsigned operation.
- LLVM_ABI static bool isUnsigned(Predicate predicate);
+ static bool isUnsigned(Predicate Pred) {
+ return Pred >= ICMP_UGT && Pred <= ICMP_ULE;
+ }
/// @returns true if the predicate is signed, false otherwise.
/// Determine if the predicate is an signed operation.
- LLVM_ABI static bool isSigned(Predicate predicate);
+ static bool isSigned(Predicate Pred) {
+ return Pred >= ICMP_SGT && Pred <= ICMP_SLE;
+ }
/// Determine if the predicate is an ordered operation.
LLVM_ABI static bool isOrdered(Predicate predicate);
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 11385666e7ff8..00b63f1e1b060 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -71,6 +71,36 @@ class Instruction : public User,
using InstListType = SymbolTableList<Instruction, ilist_iterator_bits<true>,
ilist_parent<BasicBlock>>;
+ /// Iterator type that casts an operand to a basic block.
+ ///
+ /// All terminators store successors as adjacent operands.
+ struct succ_iterator
+ : iterator_adaptor_base<succ_iterator, op_iterator,
+ std::random_access_iterator_tag, BasicBlock *,
+ ptrdiff_t, BasicBlock *, BasicBlock *> {
+ explicit succ_iterator(op_iterator I) : iterator_adaptor_base(I) {}
+
+ BasicBlock *operator*() const { return cast<BasicBlock>(*I); }
+ BasicBlock *operator->() const { return operator*(); }
+
+ op_iterator getUse() const { return I; }
+ };
+
+ /// The const version of `succ_iterator`.
+ struct const_succ_iterator
+ : iterator_adaptor_base<const_succ_iterator, const_op_iterator,
+ std::random_access_iterator_tag,
+ const BasicBlock *, ptrdiff_t, const BasicBlock *,
+ const BasicBlock *> {
+ explicit const_succ_iterator(const_op_iterator I)
+ : iterator_adaptor_base(I) {}
+
+ const BasicBlock *operator*() const { return cast<BasicBlock>(*I); }
+ const BasicBlock *operator->() const { return operator*(); }
+
+ const_op_iterator getUse() const { return I; }
+ };
+
private:
DebugLoc DbgLoc; // 'dbg' Metadata cache.
@@ -975,6 +1005,14 @@ class Instruction : public User,
/// instruction must be a terminator.
LLVM_ABI void setSuccessor(unsigned Idx, BasicBlock *BB);
+ LLVM_ABI iterator_range<const_succ_iterator> successors() const LLVM_READONLY;
+ LLVM_ABI iterator_range<succ_iterator> successors() {
+ auto Ops = static_cast<const Instruction *>(this)->successors();
+ Use *Begin = const_cast<Use *>(Ops.begin().getUse());
+ Use *End = const_cast<Use *>(Ops.end().getUse());
+ return make_range(succ_iterator(Begin), succ_iterator(End));
+ }
+
/// Replace specified successor OldBB to point at the provided block.
/// This instruction must be a terminator.
LLVM_ABI void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB);
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index f4d7ee9ecb52d..170b2899bcc4d 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -3023,6 +3023,13 @@ class ReturnInst : public Instruction {
return getNumOperands() != 0 ? getOperand(0) : nullptr;
}
+ iterator_range<succ_iterator> successors() {
+ return {succ_iterator(op_end()), succ_iterator(op_end())};
+ }
+ iterator_range<const_succ_iterator> successors() const {
+ return {const_succ_iterator(op_end()), const_succ_iterator(op_end())};
+ }
+
unsigned getNumSuccessors() const { return 0; }
// Methods for support type inquiry through isa, cast, and dyn_cast:
@@ -3062,33 +3069,6 @@ class BranchInst : public Instruction {
: Instruction(Ty, Opcode, AllocInfo, InsertBefore) {}
public:
- /// Iterator type that casts an operand to a basic block.
- ///
- /// This only makes sense because the successors are stored as adjacent
- /// operands for branch instructions.
- struct succ_op_iterator
- : iterator_adaptor_base<succ_op_iterator, value_op_iterator,
- std::random_access_iterator_tag, BasicBlock *,
- ptrdiff_t, BasicBlock *, BasicBlock *> {
- explicit succ_op_iterator(value_op_iterator I) : iterator_adaptor_base(I) {}
-
- BasicBlock *operator*() const { return cast<BasicBlock>(*I); }
- BasicBlock *operator->() const { return operator*(); }
- };
-
- /// The const version of `succ_op_iterator`.
- struct const_succ_op_iterator
- : iterator_adaptor_base<const_succ_op_iterator, const_value_op_iterator,
- std::random_access_iterator_tag,
- const BasicBlock *, ptrdiff_t, const BasicBlock *,
- const BasicBlock *> {
- explicit const_succ_op_iterator(const_value_op_iterator I)
- : iterator_adaptor_base(I) {}
-
- const BasicBlock *operator*() const { return cast<BasicBlock>(*I); }
- const BasicBlock *operator->() const { return operator*(); }
- };
-
static BranchInst *Create(BasicBlock *IfTrue,
InsertPosition InsertBefore = nullptr);
@@ -3105,18 +3085,6 @@ class BranchInst : public Instruction {
Value *getCondition() const;
void setCondition(Value *V);
- unsigned getNumSuccessors() const { return 1+isConditional(); }
-
- BasicBlock *getSuccessor(unsigned i) const {
- assert(i < getNumSuccessors() && "Successor # out of range for Branch!");
- return cast_or_null<BasicBlock>((&Op<-1>() - i)->get());
- }
-
- void setSuccessor(unsigned idx, BasicBlock *NewSucc) {
- assert(idx < getNumSuccessors() && "Successor # out of range for Branch!");
- *(&Op<-1>() - idx) = NewSucc;
- }
-
/// Swap the successors of this branch instruction.
///
/// Swaps the successors of the branch instruction. This also swaps any
@@ -3124,18 +3092,6 @@ class BranchInst : public Instruction {
/// continues to map correctly to each operand.
void swapSuccessors();
- iterator_range<succ_op_iterator> successors() {
- return make_range(
- succ_op_iterator(std::next(value_op_begin(), isConditional() ? 1 : 0)),
- succ_op_iterator(value_op_end()));
- }
-
- iterator_range<const_succ_op_iterator> successors() const {
- return make_range(const_succ_op_iterator(
- std::next(value_op_begin(), isConditional() ? 1 : 0)),
- const_succ_op_iterator(value_op_end()));
- }
-
// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const Instruction *I) {
return (I->getOpcode() == Instruction::UncondBr ||
@@ -3202,14 +3158,13 @@ class UncondBrInst : public BranchInst {
Op<-1>() = NewSucc;
}
- iterator_range<succ_op_iterator> successors() {
- return make_range(succ_op_iterator(value_op_begin()),
- succ_op_iterator(value_op_end()));
+ iterator_range<succ_iterator> successors() {
+ return make_range(succ_iterator(op_begin()), succ_iterator(op_end()));
}
- iterator_range<const_succ_op_iterator> successors() const {
- return make_range(const_succ_op_iterator(value_op_begin()),
- const_succ_op_iterator(value_op_end()));
+ iterator_range<const_succ_iterator> successors() const {
+ return make_range(const_succ_iterator(op_begin()),
+ const_succ_iterator(op_end()));
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
@@ -3271,12 +3226,12 @@ class CondBrInst : public BranchInst {
BasicBlock *getSuccessor(unsigned i) const {
assert(i < getNumSuccessors() && "Successor # out of range for Branch!");
- return cast_or_null<BasicBlock>((&Op<-1>() - i)->get());
+ return cast_or_null<BasicBlock>((&Op<-2>() + i)->get());
}
void setSuccessor(unsigned idx, BasicBlock *NewSucc) {
assert(idx < getNumSuccessors() && "Successor # out of range for Branch!");
- *(&Op<-1>() - idx) = NewSucc;
+ *(&Op<-2>() + idx) = NewSucc;
}
/// Swap the successors of this branch instruction.
@@ -3286,14 +3241,14 @@ class CondBrInst : public BranchInst {
/// continues to map correctly to each operand.
LLVM_ABI void swapSuccessors();
- iterator_range<succ_op_iterator> successors() {
- return make_range(succ_op_iterator(std::next(value_op_begin())),
- succ_op_iterator(value_op_end()));
+ iterator_range<succ_iterator> successors() {
+ return make_range(succ_iterator(std::next(op_begin())),
+ succ_iterator(op_end()));
}
- iterator_range<const_succ_op_iterator> successors() const {
- return make_range(const_succ_op_iterator(std::next(value_op_begin())),
- const_succ_op_iterator(value_op_end()));
+ iterator_range<const_succ_iterator> successors() const {
+ return make_range(const_succ_iterator(std::next(op_begin())),
+ const_succ_iterator(op_end()));
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
@@ -3675,6 +3630,13 @@ class SwitchInst : public Instruction {
/// case.
LLVM_ABI CaseIt removeCase(CaseIt I);
+ iterator_range<succ_iterator> successors() {
+ return {succ_iterator(std::next(op_begin())), succ_iterator(op_end())};
+ }
+ iterator_range<const_succ_iterator> successors() const {
+ return {const_succ_iterator(std::next(op_begin())),
+ const_succ_iterator(op_end())};
+ }
+
unsigned getNumSuccessors() const { return getNumOperands() - 1; }
BasicBlock *getSuccessor(unsigned idx) const {
assert(idx < getNumSuccessors() &&"Successor idx out of range for switch!");
@@ -3783,33 +3745,6 @@ class IndirectBrInst : public Instruction {
public:
void operator delete(void *Ptr) { User::operator delete(Ptr, AllocMarker); }
- /// Iterator type that casts an operand to a basic block.
- ///
- /// This only makes sense because the successors are stored as adjacent
- /// operands for indirectbr instructions.
- struct succ_op_iterator
- : iterator_adaptor_base<succ_op_iterator, value_op_iterator,
- std::random_access_iterator_tag, BasicBlock *,
- ptrdiff_t, BasicBlock *, BasicBlock *> {
- explicit succ_op_iterator(value_op_iterator I) : iterator_adaptor_base(I) {}
-
- BasicBlock *operator*() const { return cast<BasicBlock>(*I); }
- BasicBlock *operator->() const { return operator*(); }
- };
-
- /// The const version of `succ_op_iterator`.
- struct const_succ_op_iterator
- : iterator_adaptor_base<const_succ_op_iterator, const_value_op_iterator,
- std::random_access_iterator_tag,
- const BasicBlock *, ptrdiff_t, const BasicBlock *,
- const BasicBlock *> {
- explicit const_succ_op_iterator(const_value_op_iterator I)
- : iterator_adaptor_base(I) {}
-
- const BasicBlock *operator*() const { return cast<BasicBlock>(*I); }
- const BasicBlock *operator->() const { return operator*(); }
- };
-
static IndirectBrInst *Create(Value *Address, unsigned NumDests,
InsertPosition InsertBefore = nullptr) {
return new IndirectBrInst(Address, NumDests, InsertBefore);
@@ -3847,14 +3782,14 @@ class IndirectBrInst : public Instruction {
setOperand(i + 1, NewSucc);
}
- iterator_range<succ_op_iterator> successors() {
- return make_range(succ_op_iterator(std::next(value_op_begin())),
- succ_op_iterator(value_op_end()));
+ iterator_range<succ_iterator> successors() {
+ return make_range(succ_iterator(std::next(op_begin())),
+ succ_iterator(op_end()));
}
- iterator_range<const_succ_op_iterator> successors() const {
- return make_range(const_succ_op_iterator(std::next(value_op_begin())),
- const_succ_op_iterator(value_op_end()));
+ iterator_range<const_succ_iterator> successors() const {
+ return make_range(const_succ_iterator(std::next(op_begin())),
+ const_succ_iterator(op_end()));
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
@@ -4002,6 +3937,15 @@ class InvokeInst : public CallBase {
unsigned getNumSuccessors() const { return 2; }
+ iterator_range<succ_iterator> successors() {
+ Use *First = &Op<NormalDestOpEndIdx>();
+ return {succ_iterator(First), succ_iterator(First + 2)};
+ }
+ iterator_range<const_succ_iterator> successors() const {
+ const Use *First = &Op<NormalDestOpEndIdx>();
+ return {const_succ_iterator(First), const_succ_iterator(First + 2)};
+ }
+
/// Updates profile metadata by scaling it by \p S / \p T.
LLVM_ABI void updateProfWeight(uint64_t S, uint64_t T);
@@ -4179,6 +4123,15 @@ class CallBrInst : public CallBase {
unsigned getNumSuccessors() const { return getNumIndirectDests() + 1; }
+ iterator_range<succ_iterator> successors() {
+ Use *First = &Op<-1>() - getNumIndirectDests() - 1;
+ return {succ_iterator(First), succ_iterator(&Op<-1>())};
+ }
+ iterator_range<const_succ_iterator> successors() const {
+ const Use *First = &Op<-1>() - getNumIndirectDests() - 1;
+ return {const_succ_iterator(First), const_succ_iterator(&Op<-1>())};
+ }
+
// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const Instruction *I) {
return (I->getOpcode() == Instruction::CallBr);
@@ -4256,6 +4209,13 @@ class ResumeInst : public Instruction {
void setSuccessor(unsigned idx, BasicBlock *NewSucc) {
llvm_unreachable("ResumeInst has no successors!");
}
+
+ iterator_range<succ_iterator> successors() {
+ return {succ_iterator(op_end()), succ_iterator(op_end())};
+ }
+ iterator_range<const_succ_iterator> successors() const {
+ return {const_succ_iterator(op_end()), const_succ_iterator(op_end())};
+ }
};
template <>
@@ -4418,6 +4378,14 @@ class CatchSwitchInst : public Instruction {
setOperand(Idx + 1, NewSucc);
}
+ iterator_range<succ_iterator> successors() {
+ return {succ_iterator(std::next(op_begin())), succ_iterator(op_end())};
+ }
+ iterator_range<const_succ_iterator> successors() const {
+ return {const_succ_iterator(std::next(op_begin())),
+ const_succ_iterator(op_end())};
+ }
+
// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const Instruction *I) {
return I->getOpcode() == Instruction::CatchSwitch;
@@ -4567,6 +4535,14 @@ class CatchReturnInst : public Instruction {
assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!");
setSuccessor(B);
}
+
+ iterator_range<succ_iterator> successors() {
+ return {succ_iterator(std::next(op_begin())), succ_iterator(op_end())};
+ }
+ iterator_range<const_succ_iterator> successors() const {
+ return {const_succ_iterator(std::next(op_begin())),
+ const_succ_iterator(op_end())};
+ }
};
template <>
@@ -4654,6 +4630,14 @@ class CleanupReturnInst : public Instruction {
setUnwindDest(B);
}
+ iterator_range<succ_iterator> successors() {
+ return {succ_iterator(std::next(op_begin())), succ_iterator(op_end())};
+ }
+ iterator_range<const_succ_iterator> successors() const {
+ return {const_succ_iterator(std::next(op_begin())),
+ const_succ_iterator(op_end())};
+ }
+
// Shadow Instruction::setInstructionSubclassData with a private forwarding
// method so that subclasses cannot accidentally use it.
template <typename Bitfield>
@@ -4716,6 +4700,13 @@ class UnreachableInst : public Instruction {
void setSuccessor(unsigned idx, BasicBlock *B) {
llvm_unreachable("UnreachableInst has no successors!");
}
+
+ iterator_range<succ_iterator> successors() {
+ return {succ_iterator(op_end()), succ_iterator(op_end())};
+ }
+ iterator_range<const_succ_iterator> successors() const {
+ return {const_succ_iterator(op_end()), const_succ_iterator(op_end())};
+ }
};
//===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
index 6574f76dd85eb..facea14cea9b1 100644
--- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td
+++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td
@@ -23,7 +23,7 @@ let TargetPrefix = "spv" in {
def int_spv_init_global : Intrinsic<[], [llvm_any_ty, llvm_any_ty]>;
def int_spv_unref_global : Intrinsic<[], [llvm_any_ty]>;
- def int_spv_gep : Intrinsic<[llvm_anyptr_ty], [llvm_i1_ty, llvm_any_ty, llvm_vararg_ty], [ImmArg<ArgIndex<0>>]>;
+ def int_spv_gep : Intrinsic<[llvm_any_ty], [llvm_i1_ty, llvm_any_ty, llvm_vararg_ty], [ImmArg<ArgIndex<0>>]>;
def int_spv_load : Intrinsic<[llvm_i32_ty], [llvm_anyptr_ty, llvm_i16_ty, llvm_i32_ty], [ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
def int_spv_store : Intrinsic<[], [llvm_any_ty, llvm_anyptr_ty, llvm_i16_ty, llvm_i32_ty], [ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
def int_spv_extractv : Intrinsic<[llvm_any_ty], [llvm_i32_ty, llvm_vararg_ty]>;
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 44defe2245152..226ae622add70 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -154,43 +154,16 @@ inline class_match<IntrinsicInst> m_AnyIntrinsic() {
}
struct undef_match {
+private:
+ static bool checkAggregate(const ConstantAggregate *CA);
+
+public:
static bool check(const Value *V) {
if (isa<UndefValue>(V))
return true;
-
- const auto *CA = dyn_cast<ConstantAggregate>(V);
- if (!CA)
- return false;
-
- SmallPtrSet<const ConstantAggregate *, 8> Seen;
- SmallVector<const ConstantAggregate *, 8> Worklist;
-
- // Either UndefValue, PoisonValue, or an aggregate that only contains
- // these is accepted by matcher.
- // CheckValue returns false if CA cannot satisfy this constraint.
- auto CheckValue = [&](const ConstantAggregate *CA) {
- for (const Value *Op : CA->operand_values()) {
- if (isa<UndefValue>(Op))
- continue;
-
- const auto *CA = dyn_cast<ConstantAggregate>(Op);
- if (!CA)
- return false;
- if (Seen.insert(CA).second)
- Worklist.emplace_back(CA);
- }
-
- return true;
- };
-
- if (!CheckValue(CA))
- return false;
-
- while (!Worklist.empty()) {
- if (!CheckValue(Worklist.pop_back_val()))
- return false;
- }
- return true;
+ if (const auto *CA = dyn_cast<ConstantAggregate>(V))
+ return checkAggregate(CA);
+ return false;
}
template <typename ITy> bool match(ITy *V) const { return check(V); }
};
@@ -402,38 +375,44 @@ template <int64_t Val> inline constantint_match<Val> m_ConstantInt() {
/// is true.
template <typename Predicate, typename ConstantVal, bool AllowPoison>
struct cstval_pred_ty : public Predicate {
+private:
+ bool matchVector(const Value *V) const {
+ if (const auto *C = dyn_cast<Constant>(V)) {
+ if (const auto *CV = dyn_cast_or_null<ConstantVal>(C->getSplatValue()))
+ return this->isValue(CV->getValue());
+
+ // Number of elements of a scalable vector unknown at compile time
+ auto *FVTy = dyn_cast<FixedVectorType>(V->getType());
+ if (!FVTy)
+ return false;
+
+ // Non-splat vector constant: check each element for a match.
+ unsigned NumElts = FVTy->getNumElements();
+ assert(NumElts != 0 && "Constant vector with no elements?");
+ bool HasNonPoisonElements = false;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = C->getAggregateElement(i);
+ if (!Elt)
+ return false;
+ if (AllowPoison && isa<PoisonValue>(Elt))
+ continue;
+ auto *CV = dyn_cast<ConstantVal>(Elt);
+ if (!CV || !this->isValue(CV->getValue()))
+ return false;
+ HasNonPoisonElements = true;
+ }
+ return HasNonPoisonElements;
+ }
+ return false;
+ }
+
+public:
const Constant **Res = nullptr;
template <typename ITy> bool match_impl(ITy *V) const {
if (const auto *CV = dyn_cast<ConstantVal>(V))
return this->isValue(CV->getValue());
- if (const auto *VTy = dyn_cast<VectorType>(V->getType())) {
- if (const auto *C = dyn_cast<Constant>(V)) {
- if (const auto *CV = dyn_cast_or_null<ConstantVal>(C->getSplatValue()))
- return this->isValue(CV->getValue());
-
- // Number of elements of a scalable vector unknown at compile time
- auto *FVTy = dyn_cast<FixedVectorType>(VTy);
- if (!FVTy)
- return false;
-
- // Non-splat vector constant: check each element for a match.
- unsigned NumElts = FVTy->getNumElements();
- assert(NumElts != 0 && "Constant vector with no elements?");
- bool HasNonPoisonElements = false;
- for (unsigned i = 0; i != NumElts; ++i) {
- Constant *Elt = C->getAggregateElement(i);
- if (!Elt)
- return false;
- if (AllowPoison && isa<PoisonValue>(Elt))
- continue;
- auto *CV = dyn_cast<ConstantVal>(Elt);
- if (!CV || !this->isValue(CV->getValue()))
- return false;
- HasNonPoisonElements = true;
- }
- return HasNonPoisonElements;
- }
- }
+ if (isa<VectorType>(V->getType()))
+ return matchVector(V);
return false;
}
diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h
index 44db4ef163a64..4217d797cdf28 100644
--- a/llvm/include/llvm/IR/Type.h
+++ b/llvm/include/llvm/IR/Type.h
@@ -26,6 +26,7 @@
namespace llvm {
+class ByteType;
class IntegerType;
struct fltSemantics;
class LLVMContext;
@@ -68,6 +69,7 @@ class Type {
// Derived types... see DerivedTypes.h file.
IntegerTyID, ///< Arbitrary bit width integers
+ ByteTyID, ///< Arbitrary bit width bytes
FunctionTyID, ///< Functions
PointerTyID, ///< Pointers
StructTyID, ///< Structures
@@ -236,6 +238,21 @@ class Type {
/// Returns true if this is 'token' or a token-like target type.
LLVM_ABI bool isTokenLikeTy() const;
+ /// True if this is an instance of ByteType.
+ bool isByteTy() const { return getTypeID() == ByteTyID; }
+
+ /// Return true if this is a ByteType of the given width.
+ LLVM_ABI bool isByteTy(unsigned BitWidth) const;
+
+ /// Return true if this is a byte type or a vector of byte types.
+ bool isByteOrByteVectorTy() const { return getScalarType()->isByteTy(); }
+
+ /// Return true if this is a byte type or a vector of byte types of
+ /// the given width.
+ bool isByteOrByteVectorTy(unsigned BitWidth) const {
+ return getScalarType()->isByteTy(BitWidth);
+ }
+
/// True if this is an instance of IntegerType.
bool isIntegerTy() const { return getTypeID() == IntegerTyID; }
@@ -295,7 +312,7 @@ class Type {
/// includes all first-class types except struct and array types.
bool isSingleValueType() const {
return isFloatingPointTy() || isIntegerTy() || isPointerTy() ||
- isVectorTy() || isX86_AMXTy() || isTargetExtTy();
+ isVectorTy() || isX86_AMXTy() || isTargetExtTy() || isByteTy();
}
/// Return true if the type is an aggregate type. This means it is valid as
@@ -311,7 +328,8 @@ class Type {
bool isSized(SmallPtrSetImpl<Type*> *Visited = nullptr) const {
// If it's a primitive, it is always sized.
if (getTypeID() == IntegerTyID || isFloatingPointTy() ||
- getTypeID() == PointerTyID || getTypeID() == X86_AMXTyID)
+ getTypeID() == PointerTyID || getTypeID() == X86_AMXTyID ||
+ getTypeID() == ByteTyID)
return true;
// If it is not something that can have a size (e.g. a function or label),
// it doesn't have a size.
@@ -394,6 +412,7 @@ class Type {
// methods should not be added here.
LLVM_ABI inline unsigned getIntegerBitWidth() const;
+ LLVM_ABI inline unsigned getByteBitWidth() const;
LLVM_ABI inline Type *getFunctionParamType(unsigned i) const;
LLVM_ABI inline unsigned getFunctionNumParams() const;
@@ -451,6 +470,13 @@ class Type {
LLVM_ABI static Type *getPPC_FP128Ty(LLVMContext &C);
LLVM_ABI static Type *getX86_AMXTy(LLVMContext &C);
LLVM_ABI static Type *getTokenTy(LLVMContext &C);
+ LLVM_ABI static ByteType *getByteNTy(LLVMContext &C, unsigned N);
+ LLVM_ABI static ByteType *getByte1Ty(LLVMContext &C);
+ LLVM_ABI static ByteType *getByte8Ty(LLVMContext &C);
+ LLVM_ABI static ByteType *getByte16Ty(LLVMContext &C);
+ LLVM_ABI static ByteType *getByte32Ty(LLVMContext &C);
+ LLVM_ABI static ByteType *getByte64Ty(LLVMContext &C);
+ LLVM_ABI static ByteType *getByte128Ty(LLVMContext &C);
LLVM_ABI static IntegerType *getIntNTy(LLVMContext &C, unsigned N);
LLVM_ABI static IntegerType *getInt1Ty(LLVMContext &C);
LLVM_ABI static IntegerType *getInt8Ty(LLVMContext &C);
@@ -475,6 +501,17 @@ class Type {
LLVM_ABI static Type *getFloatingPointTy(LLVMContext &C,
const fltSemantics &S);
+ //===--------------------------------------------------------------------===//
+ // Convenience methods for getting byte/integer types.
+ //
+ /// Returns an integer (vector of integer) type with the same size of a byte
+ /// of the given byte (vector of byte) type.
+ LLVM_ABI static Type *getIntFromByteType(Type *);
+
+ /// Returns a byte (vector of byte) type with the same size of an integer of
+ /// the given integer (vector of integer) type.
+ LLVM_ABI static Type *getByteFromIntType(Type *);
+
//===--------------------------------------------------------------------===//
// Convenience methods for getting pointer types.
//
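
A minimal usage sketch of the byte-type helpers declared in this hunk; it assumes the ByteType class added elsewhere in the patch and is only meant to show how the new queries compose, not a final API:

  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Type.h"
  #include <cassert>
  using namespace llvm;

  void byteTypeSketch(LLVMContext &Ctx) {
    Type *B32 = Type::getByte32Ty(Ctx);          // b32
    assert(B32->isByteTy(32) && B32->isByteOrByteVectorTy());
    Type *I32 = Type::getIntFromByteType(B32);   // integer type of the same width
    Type *Back = Type::getByteFromIntType(I32);  // round-trips to a byte type
    assert(Back->isByteTy());
  }
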
diff --git a/llvm/include/llvm/IR/Value.def b/llvm/include/llvm/IR/Value.def
index 34b8d4967b28a..96160fbe3bcb4 100644
--- a/llvm/include/llvm/IR/Value.def
+++ b/llvm/include/llvm/IR/Value.def
@@ -82,6 +82,7 @@ HANDLE_CONSTANT(ConstantAggregateZero)
HANDLE_CONSTANT(ConstantDataArray)
HANDLE_CONSTANT(ConstantDataVector)
HANDLE_CONSTANT(ConstantInt)
+HANDLE_CONSTANT(ConstantByte)
HANDLE_CONSTANT(ConstantFP)
HANDLE_CONSTANT(ConstantTargetNone)
HANDLE_CONSTANT(ConstantPointerNull)
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index b46fabb14a04d..48e4ecd8ee2af 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -337,7 +337,7 @@ LLVM_ABI void initializeTypePromotionLegacyPass(PassRegistry &);
LLVM_ABI void initializeInitUndefLegacyPass(PassRegistry &);
LLVM_ABI void initializeUniformityInfoWrapperPassPass(PassRegistry &);
LLVM_ABI void initializeUnifyLoopExitsLegacyPassPass(PassRegistry &);
-LLVM_ABI void initializeUnpackMachineBundlesPass(PassRegistry &);
+LLVM_ABI void initializeUnpackMachineBundlesLegacyPass(PassRegistry &);
LLVM_ABI void initializeUnreachableBlockElimLegacyPassPass(PassRegistry &);
LLVM_ABI void initializeUnreachableMachineBlockElimLegacyPass(PassRegistry &);
LLVM_ABI void initializeVerifierLegacyPassPass(PassRegistry &);
diff --git a/llvm/include/llvm/MC/MCPseudoProbe.h b/llvm/include/llvm/MC/MCPseudoProbe.h
index fc722378b586a..3799d2d84631d 100644
--- a/llvm/include/llvm/MC/MCPseudoProbe.h
+++ b/llvm/include/llvm/MC/MCPseudoProbe.h
@@ -430,6 +430,14 @@ class MCPseudoProbeDecoder {
ErrorOr<StringRef> readString(uint32_t Size);
public:
+  // MCPseudoProbeDecoder cannot be copied or moved because of an address
+  // dependence on its DummyInlineRoot member.
+ MCPseudoProbeDecoder() = default;
+ MCPseudoProbeDecoder(const MCPseudoProbeDecoder &) = delete;
+ MCPseudoProbeDecoder(MCPseudoProbeDecoder &&) = delete;
+ MCPseudoProbeDecoder &operator=(const MCPseudoProbeDecoder &) = delete;
+ MCPseudoProbeDecoder &operator=(MCPseudoProbeDecoder &&) = delete;
+
using Uint64Set = DenseSet<uint64_t>;
using Uint64Map = DenseMap<uint64_t, uint64_t>;
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 82ca701aad36c..98fa48604e7a2 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -172,6 +172,7 @@ MACHINE_FUNCTION_PASS("stack-slot-coloring", StackSlotColoringPass())
MACHINE_FUNCTION_PASS("tailduplication", TailDuplicatePass())
MACHINE_FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass())
MACHINE_FUNCTION_PASS("two-address-instruction", TwoAddressInstructionPass())
+MACHINE_FUNCTION_PASS("unpack-mi-bundles", UnpackMachineBundlesPass())
MACHINE_FUNCTION_PASS("unreachable-mbb-elimination",
UnreachableMachineBlockElimPass())
MACHINE_FUNCTION_PASS("verify", MachineVerifierPass())
@@ -295,5 +296,4 @@ DUMMY_MACHINE_FUNCTION_PASS("regallocscoringpass", RegAllocScoringPass)
DUMMY_MACHINE_FUNCTION_PASS("regbankselect", RegBankSelectPass)
DUMMY_MACHINE_FUNCTION_PASS("reset-machine-function", ResetMachineFunctionPass)
DUMMY_MACHINE_FUNCTION_PASS("stackmap-liveness", StackMapLivenessPass)
-DUMMY_MACHINE_FUNCTION_PASS("unpack-mi-bundles", UnpackMachineBundlesPass)
#undef DUMMY_MACHINE_FUNCTION_PASS
diff --git a/llvm/include/llvm/SandboxIR/Instruction.h b/llvm/include/llvm/SandboxIR/Instruction.h
index 95e59a49125c5..1c347b49bbb4c 100644
--- a/llvm/include/llvm/SandboxIR/Instruction.h
+++ b/llvm/include/llvm/SandboxIR/Instruction.h
@@ -1061,9 +1061,9 @@ class BranchInst : public SingleLLVMInstructionImpl<llvm::BranchInst> {
public:
using sb_succ_op_iterator =
- mapped_iterator<llvm::BranchInst::succ_op_iterator, LLVMBBToSBBB>;
+ mapped_iterator<llvm::BranchInst::succ_iterator, LLVMBBToSBBB>;
iterator_range<sb_succ_op_iterator> successors() {
- iterator_range<llvm::BranchInst::succ_op_iterator> LLVMRange =
+ iterator_range<llvm::BranchInst::succ_iterator> LLVMRange =
cast<llvm::BranchInst>(Val)->successors();
LLVMBBToSBBB BBMap(Ctx);
sb_succ_op_iterator MappedBegin = map_iterator(LLVMRange.begin(), BBMap);
@@ -1072,10 +1072,9 @@ class BranchInst : public SingleLLVMInstructionImpl<llvm::BranchInst> {
}
using const_sb_succ_op_iterator =
- mapped_iterator<llvm::BranchInst::const_succ_op_iterator,
- ConstLLVMBBToSBBB>;
+ mapped_iterator<llvm::BranchInst::const_succ_iterator, ConstLLVMBBToSBBB>;
iterator_range<const_sb_succ_op_iterator> successors() const {
- iterator_range<llvm::BranchInst::const_succ_op_iterator> ConstLLVMRange =
+ iterator_range<llvm::BranchInst::const_succ_iterator> ConstLLVMRange =
static_cast<const llvm::BranchInst *>(cast<llvm::BranchInst>(Val))
->successors();
ConstLLVMBBToSBBB ConstBBMap(Ctx);
diff --git a/llvm/include/llvm/Support/GenericDomTree.h b/llvm/include/llvm/Support/GenericDomTree.h
index 832b6e02daf58..920983e7bd800 100644
--- a/llvm/include/llvm/Support/GenericDomTree.h
+++ b/llvm/include/llvm/Support/GenericDomTree.h
@@ -384,11 +384,15 @@ class DominatorTreeBase {
private:
std::optional<unsigned> getNodeIndex(const NodeT *BB) const {
if constexpr (GraphHasNodeNumbers<NodeT *>) {
- // BB can be nullptr, map nullptr to index 0.
assert(BlockNumberEpoch ==
GraphTraits<ParentPtr>::getNumberEpoch(Parent) &&
"dominator tree used with outdated block numbers");
- return BB ? GraphTraits<const NodeT *>::getNumber(BB) + 1 : 0;
+ if constexpr (IsPostDom) {
+ if (!BB)
+ return 0; // BB may be nullptr for post-dominator tree, map to 0.
+ } else
+ assert(BB && "dominator tree block must be non-null");
+ return GraphTraits<const NodeT *>::getNumber(BB) + 1;
} else {
if (auto It = NodeNumberMap.find(BB); It != NodeNumberMap.end())
return It->second;
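
The intent of the index scheme above, restated as a tiny standalone sketch (plain integers instead of GraphTraits, purely illustrative): block numbers are shifted by one so that index 0 stays reserved for the virtual nullptr root that only post-dominator trees use.

  #include <cassert>
  #include <optional>

  std::optional<unsigned> nodeIndexSketch(const unsigned *BlockNumber,
                                          bool IsPostDom) {
    if (!BlockNumber) {
      assert(IsPostDom && "only post-dominator trees map nullptr");
      (void)IsPostDom;
      return 0u;                // virtual exit root
    }
    return *BlockNumber + 1;    // real blocks start at index 1
  }
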
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index bc05452400458..db03fc855df5a 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -268,6 +268,7 @@ X86_FEATURE (PPX, "ppx")
X86_FEATURE (NDD, "ndd")
X86_FEATURE (EGPR, "egpr")
X86_FEATURE (ZU, "zu")
+X86_FEATURE (JMPABS, "jmpabs")
// These features aren't really CPU features, but the frontend can set them.
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk")
diff --git a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
index 5a682e8c7b5eb..d70d281ce7267 100644
--- a/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
+++ b/llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h
@@ -198,7 +198,7 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
Cost estimateBasicBlocks(SmallVectorImpl<BasicBlock *> &WorkList);
Cost estimateSwitchInst(SwitchInst &I);
- Cost estimateBranchInst(BranchInst &I);
+ Cost estimateCondBrInst(CondBrInst &I);
// Transitively Incoming Values (TIV) is a set of Values that can "feed" a
// value to the initial PHI-node. It is defined like this:
diff --git a/llvm/include/llvm/Transforms/IPO/IROutliner.h b/llvm/include/llvm/Transforms/IPO/IROutliner.h
index ed2754fabbd05..dccbe5a2038e0 100644
--- a/llvm/include/llvm/Transforms/IPO/IROutliner.h
+++ b/llvm/include/llvm/Transforms/IPO/IROutliner.h
@@ -370,7 +370,8 @@ class IROutliner {
struct InstructionAllowed : public InstVisitor<InstructionAllowed, bool> {
InstructionAllowed() = default;
- bool visitBranchInst(BranchInst &BI) { return EnableBranches; }
+ bool visitUncondBrInst(UncondBrInst &BI) { return EnableBranches; }
+ bool visitCondBrInst(CondBrInst &BI) { return EnableBranches; }
bool visitPHINode(PHINode &PN) { return EnableBranches; }
// TODO: Handle allocas.
bool visitAllocaInst(AllocaInst &AI) { return false; }
diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h
index bc0f108ac8260..f79896e993d9b 100644
--- a/llvm/include/llvm/Transforms/Scalar/GVN.h
+++ b/llvm/include/llvm/Transforms/Scalar/GVN.h
@@ -37,8 +37,8 @@ class AAResults;
class AssumeInst;
class AssumptionCache;
class BasicBlock;
-class BranchInst;
class CallInst;
+class CondBrInst;
class ExtractValueInst;
class Function;
class FunctionPass;
@@ -401,7 +401,7 @@ class GVNPass : public PassInfoMixin<GVNPass> {
bool
propagateEquality(Value *LHS, Value *RHS,
const std::variant<BasicBlockEdge, Instruction *> &Root);
- bool processFoldableCondBr(BranchInst *BI);
+ bool processFoldableCondBr(CondBrInst *BI);
void addDeadBlock(BasicBlock *BB);
void assignValNumForDeadCode();
void assignBlockRPONumber(Function &F);
diff --git a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
index 1a19eb94e60ea..348cefb4360e4 100644
--- a/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
+++ b/llvm/include/llvm/Transforms/Scalar/JumpThreading.h
@@ -31,7 +31,7 @@ namespace llvm {
class AAResults;
class BasicBlock;
class BinaryOperator;
-class BranchInst;
+class CondBrInst;
class CmpInst;
class Constant;
class Function;
@@ -173,7 +173,7 @@ class JumpThreadingPass : public PassInfoMixin<JumpThreadingPass> {
LLVM_ABI bool processGuards(BasicBlock *BB);
LLVM_ABI bool threadGuard(BasicBlock *BB, IntrinsicInst *Guard,
- BranchInst *BI);
+ CondBrInst *BI);
private:
BasicBlock *splitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
index 8a80e7f1d8e9b..460dcdd8a2c00 100644
--- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h
@@ -26,7 +26,7 @@
#include <cassert>
namespace llvm {
-class BranchInst;
+class CondBrInst;
class LandingPadInst;
class Loop;
class PHINode;
@@ -603,7 +603,7 @@ LLVM_ABI void SplitBlockAndInsertForEachLane(
///
/// This does no checking to see if the true/false blocks have large or unsavory
/// instructions in them.
-LLVM_ABI BranchInst *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
+LLVM_ABI CondBrInst *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
BasicBlock *&IfFalse);
// Split critical edges where the source of the edge is an indirectbr
@@ -634,7 +634,7 @@ LLVM_ABI bool SplitIndirectBrCriticalEdges(Function &F,
// Utility function for inverting branch condition and for swapping its
// successors
-LLVM_ABI void InvertBranch(BranchInst *PBI, IRBuilderBase &Builder);
+LLVM_ABI void InvertBranch(CondBrInst *PBI, IRBuilderBase &Builder);
// Check whether the function only has simple terminator:
// br/brcond/unreachable/ret
diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h
index cfa06a5be79fd..4f594bb456356 100644
--- a/llvm/include/llvm/Transforms/Utils/Cloning.h
+++ b/llvm/include/llvm/Transforms/Utils/Cloning.h
@@ -268,15 +268,9 @@ class InlineFunctionInfo {
/// the caller.
SmallVector<AllocaInst *, 4> StaticAllocas;
- /// InlineFunction fills this in with callsites that were inlined from the
- /// callee. This is only filled in if CG is non-null.
- SmallVector<WeakTrackingVH, 8> InlinedCalls;
-
/// All of the new call sites inlined into the caller.
///
- /// 'InlineFunction' fills this in by scanning the inlined instructions, and
- /// only if CG is null. If CG is non-null, instead the value handle
- /// `InlinedCalls` above is used.
+ /// 'InlineFunction' fills this in by scanning the inlined instructions.
SmallVector<CallBase *, 8> InlinedCallSites;
Value *ConvergenceControlToken = nullptr;
@@ -288,7 +282,6 @@ class InlineFunctionInfo {
void reset() {
StaticAllocas.clear();
- InlinedCalls.clear();
InlinedCallSites.clear();
ConvergenceControlToken = nullptr;
CallSiteEHPad = nullptr;
diff --git a/llvm/include/llvm/Transforms/Utils/GuardUtils.h b/llvm/include/llvm/Transforms/Utils/GuardUtils.h
index 7ab5d9ef4f238..f2a9ee1b9260b 100644
--- a/llvm/include/llvm/Transforms/Utils/GuardUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/GuardUtils.h
@@ -14,7 +14,7 @@
namespace llvm {
-class BranchInst;
+class CondBrInst;
class CallInst;
class Function;
class Value;
@@ -32,12 +32,12 @@ void makeGuardControlFlowExplicit(Function *DeoptIntrinsic, CallInst *Guard,
/// Given a branch we know is widenable (defined per Analysis/GuardUtils.h),
/// widen it such that condition 'NewCond' is also known to hold on the taken
/// path. Branch remains widenable after transform.
-void widenWidenableBranch(BranchInst *WidenableBR, Value *NewCond);
+void widenWidenableBranch(CondBrInst *WidenableBR, Value *NewCond);
/// Given a branch we know is widenable (defined per Analysis/GuardUtils.h),
/// *set* it's condition such that (only) 'Cond' is known to hold on the taken
/// path and that the branch remains widenable after transform.
-void setWidenableBranchCond(BranchInst *WidenableBR, Value *Cond);
+void setWidenableBranchCond(CondBrInst *WidenableBR, Value *Cond);
} // llvm
diff --git a/llvm/include/llvm/Transforms/Utils/Instrumentation.h b/llvm/include/llvm/Transforms/Utils/Instrumentation.h
index 93ab8c693607f..95a985ba3f0c4 100644
--- a/llvm/include/llvm/Transforms/Utils/Instrumentation.h
+++ b/llvm/include/llvm/Transforms/Utils/Instrumentation.h
@@ -154,6 +154,7 @@ struct SanitizerCoverageOptions {
bool TraceGep = false;
bool Use8bitCounters = false;
bool TracePC = false;
+ bool TracePCEntryExit = false;
bool TracePCGuard = false;
bool Inline8bitCounters = false;
bool InlineBoolFlag = false;
diff --git a/llvm/include/llvm/Transforms/Utils/Local.h b/llvm/include/llvm/Transforms/Utils/Local.h
index 9acfd872e574b..8456d986957b2 100644
--- a/llvm/include/llvm/Transforms/Utils/Local.h
+++ b/llvm/include/llvm/Transforms/Utils/Local.h
@@ -33,9 +33,9 @@ class AAResults;
class AllocaInst;
class AssumptionCache;
class BasicBlock;
-class BranchInst;
class CallBase;
class CallInst;
+class CondBrInst;
class DIBuilder;
class DomTreeUpdater;
class Function;
@@ -202,7 +202,7 @@ LLVM_ABI bool FlattenCFG(BasicBlock *BB, AAResults *AA = nullptr);
/// If this basic block is ONLY a setcc and a branch, and if a predecessor
/// branches to us and one of our successors, fold the setcc into the
/// predecessor and use logical operations to pick the right destination.
-LLVM_ABI bool foldBranchToCommonDest(BranchInst *BI,
+LLVM_ABI bool foldBranchToCommonDest(CondBrInst *BI,
llvm::DomTreeUpdater *DTU = nullptr,
MemorySSAUpdater *MSSAU = nullptr,
const TargetTransformInfo *TTI = nullptr,
diff --git a/llvm/include/llvm/Transforms/Utils/LoopConstrainer.h b/llvm/include/llvm/Transforms/Utils/LoopConstrainer.h
index 64db907e9a0f4..ca3a928b92c11 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopConstrainer.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopConstrainer.h
@@ -16,7 +16,7 @@
namespace llvm {
class BasicBlock;
-class BranchInst;
+class CondBrInst;
class DominatorTree;
class IntegerType;
class Loop;
@@ -39,7 +39,7 @@ struct LoopStructure {
// `Latch's terminator instruction is `LatchBr', and it's `LatchBrExitIdx'th
// successor is `LatchExit', the exit block of the loop.
- BranchInst *LatchBr = nullptr;
+ CondBrInst *LatchBr = nullptr;
BasicBlock *LatchExit = nullptr;
unsigned LatchBrExitIdx = std::numeric_limits<unsigned>::max();
@@ -67,7 +67,7 @@ struct LoopStructure {
Result.Tag = Tag;
Result.Header = cast<BasicBlock>(Map(Header));
Result.Latch = cast<BasicBlock>(Map(Latch));
- Result.LatchBr = cast<BranchInst>(Map(LatchBr));
+ Result.LatchBr = cast<CondBrInst>(Map(LatchBr));
Result.LatchExit = cast<BasicBlock>(Map(LatchExit));
Result.LatchBrExitIdx = LatchBrExitIdx;
Result.IndVarBase = Map(IndVarBase);
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 1e48eeca72952..ccba9ee16885b 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -403,7 +403,7 @@ bool setLoopProbability(Loop *L, BranchProbability P);
/// - The probability \c P that control flows from \p B to its first target
/// label such that `1 - P` is the probability of control flowing to its
/// second target label, or vice-versa if \p ForFirstTarget is false.
-BranchProbability getBranchProbability(BranchInst *B, bool ForFirstTarget);
+BranchProbability getBranchProbability(CondBrInst *B, bool ForFirstTarget);
/// Calculates the edge probability from Src to Dst.
/// Dst has to be a successor to Src.
@@ -416,11 +416,8 @@ BranchProbability getBranchProbability(BasicBlock *Src, BasicBlock *Dst);
/// Set branch weight metadata for \p B to indicate that \p P and `1 - P` are
/// the probabilities of control flowing to its first and second target labels,
-/// respectively, or vice-versa if \p ForFirstTarget is false. Return false if
-/// the implementation cannot set the probability (e.g., \p B must have exactly
-/// two target labels, so it must be a conditional branch). Otherwise, return
-/// true.
-bool setBranchProbability(BranchInst *B, BranchProbability P,
+/// respectively, or vice-versa if \p ForFirstTarget is false.
+void setBranchProbability(CondBrInst *B, BranchProbability P,
bool ForFirstTarget);
/// Check inner loop (L) backedge count is known to be invariant on all
diff --git a/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/llvm/lib/Analysis/BranchProbabilityInfo.cpp
index 49c6bc8191c86..d98cb2b0d1dcb 100644
--- a/llvm/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/llvm/lib/Analysis/BranchProbabilityInfo.cpp
@@ -210,7 +210,199 @@ enum class BlockExecWeight : std::uint32_t {
DEFAULT = 0xfffff
};
-BranchProbabilityInfo::SccInfo::SccInfo(const Function &F) {
+namespace {
+class BPIConstruction {
+public:
+ BPIConstruction(BranchProbabilityInfo &BPI) : BPI(BPI) {}
+ void calculate(const Function &F, const LoopInfo &LI,
+ const TargetLibraryInfo *TLI, DominatorTree *DT,
+ PostDominatorTree *PDT);
+
+private:
+ // Data structure to track SCCs for handling irreducible loops.
+ class SccInfo {
+ // Enum of types to classify basic blocks in SCC. Basic block belonging to
+ // SCC is 'Inner' until it is either 'Header' or 'Exiting'. Note that a
+ // basic block can be 'Header' and 'Exiting' at the same time.
+ enum SccBlockType {
+ Inner = 0x0,
+ Header = 0x1,
+ Exiting = 0x2,
+ };
+ // Map of basic blocks to SCC IDs they belong to. If basic block doesn't
+ // belong to any SCC it is not in the map.
+ using SccMap = DenseMap<const BasicBlock *, int>;
+ // Each basic block in SCC is attributed with one or several types from
+ // SccBlockType. Map value has uint32_t type (instead of SccBlockType)
+ // since basic block may be for example "Header" and "Exiting" at the same
+ // time and we need to be able to keep more than one value from
+ // SccBlockType.
+ using SccBlockTypeMap = DenseMap<const BasicBlock *, uint32_t>;
+ // Vector containing classification of basic blocks for all SCCs where i'th
+ // vector element corresponds to SCC with ID equal to i.
+ using SccBlockTypeMaps = std::vector<SccBlockTypeMap>;
+
+ SccMap SccNums;
+ SccBlockTypeMaps SccBlocks;
+
+ public:
+ LLVM_ABI explicit SccInfo(const Function &F);
+
+ /// If \p BB belongs to some SCC then ID of that SCC is returned, otherwise
+ /// -1 is returned. If \p BB belongs to more than one SCC at the same time
+ /// result is undefined.
+ LLVM_ABI int getSCCNum(const BasicBlock *BB) const;
+ /// Returns true if \p BB is a 'header' block in SCC with \p SccNum ID,
+ /// false otherwise.
+ bool isSCCHeader(const BasicBlock *BB, int SccNum) const {
+ return getSccBlockType(BB, SccNum) & Header;
+ }
+ /// Returns true if \p BB is an 'exiting' block in SCC with \p SccNum ID,
+ /// false otherwise.
+ bool isSCCExitingBlock(const BasicBlock *BB, int SccNum) const {
+ return getSccBlockType(BB, SccNum) & Exiting;
+ }
+ /// Fills in \p Enters vector with all such blocks that don't belong to
+ /// SCC with \p SccNum ID but there is an edge to a block belonging to the
+ /// SCC.
+ LLVM_ABI void
+ getSccEnterBlocks(int SccNum, SmallVectorImpl<BasicBlock *> &Enters) const;
+ /// Fills in \p Exits vector with all such blocks that don't belong to
+ /// SCC with \p SccNum ID but there is an edge from a block belonging to the
+ /// SCC.
+ LLVM_ABI void getSccExitBlocks(int SccNum,
+ SmallVectorImpl<BasicBlock *> &Exits) const;
+
+ private:
+ /// Returns \p BB's type according to classification given by SccBlockType
+    /// enum. Please note that \p BB must belong to SCC with \p SccNum ID.
+ LLVM_ABI uint32_t getSccBlockType(const BasicBlock *BB, int SccNum) const;
+ /// Calculates \p BB's type and stores it in internal data structures for
+    /// future use. Please note that \p BB must belong to SCC with \p SccNum ID.
+ void calculateSccBlockType(const BasicBlock *BB, int SccNum);
+ };
+
+ /// Pair of Loop and SCC ID number. Used to unify handling of normal and
+ /// SCC based loop representations.
+ using LoopData = std::pair<Loop *, int>;
+ /// Helper class to keep basic block along with its loop data information.
+ class LoopBlock {
+ public:
+ LLVM_ABI explicit LoopBlock(const BasicBlock *BB, const LoopInfo &LI,
+ const SccInfo &SccI);
+
+ const BasicBlock *getBlock() const { return BB; }
+ BasicBlock *getBlock() { return const_cast<BasicBlock *>(BB); }
+ LoopData getLoopData() const { return LD; }
+ Loop *getLoop() const { return LD.first; }
+ int getSccNum() const { return LD.second; }
+
+ bool belongsToLoop() const { return getLoop() || getSccNum() != -1; }
+ bool belongsToSameLoop(const LoopBlock &LB) const {
+ return (LB.getLoop() && getLoop() == LB.getLoop()) ||
+ (LB.getSccNum() != -1 && getSccNum() == LB.getSccNum());
+ }
+
+ private:
+ const BasicBlock *const BB = nullptr;
+ LoopData LD = {nullptr, -1};
+ };
+
+ // Pair of LoopBlocks representing an edge from first to second block.
+ using LoopEdge = std::pair<const LoopBlock &, const LoopBlock &>;
+
+ /// Helper to construct LoopBlock for \p BB.
+ LoopBlock getLoopBlock(const BasicBlock *BB) const {
+ return LoopBlock(BB, *LI, *SccI);
+ }
+
+  /// Returns true if the destination block belongs to some loop and the source
+  /// block either doesn't belong to any loop or belongs to a loop which is not
+  /// inner relative to the destination block.
+ bool isLoopEnteringEdge(const LoopEdge &Edge) const;
+  /// Returns true if the source block belongs to some loop and the destination
+  /// block either doesn't belong to any loop or belongs to a loop which is not
+  /// inner relative to the source block.
+ bool isLoopExitingEdge(const LoopEdge &Edge) const;
+  /// Returns true if \p Edge either enters or exits some loop, false in all
+  /// other cases.
+ bool isLoopEnteringExitingEdge(const LoopEdge &Edge) const;
+  /// Returns true if the source and destination blocks belong to the same loop
+  /// and the destination block is the loop header.
+ bool isLoopBackEdge(const LoopEdge &Edge) const;
+  // Fills in \p Enters vector with all "enter" blocks to a loop \p LB belongs to.
+ void getLoopEnterBlocks(const LoopBlock &LB,
+ SmallVectorImpl<BasicBlock *> &Enters) const;
+  // Fills in \p Exits vector with all "exit" blocks from a loop \p LB belongs to.
+ void getLoopExitBlocks(const LoopBlock &LB,
+ SmallVectorImpl<BasicBlock *> &Exits) const;
+
+ /// Returns estimated weight for \p BB. std::nullopt if \p BB has no estimated
+ /// weight.
+ std::optional<uint32_t> getEstimatedBlockWeight(const BasicBlock *BB) const;
+
+  /// Returns the estimated weight to enter \p L. In other words, it is the
+  /// weight of the loop's header block not scaled by trip count. Returns
+  /// std::nullopt if \p L has no estimated weight.
+ std::optional<uint32_t> getEstimatedLoopWeight(const LoopData &L) const;
+
+ /// Return estimated weight for \p Edge. Returns std::nullopt if estimated
+ /// weight is unknown.
+ std::optional<uint32_t> getEstimatedEdgeWeight(const LoopEdge &Edge) const;
+
+  /// Iterates over all edges leading from \p SrcBB to \p Successors and returns
+  /// the maximum of all estimated weights. If at least one edge has an unknown
+  /// estimated weight, std::nullopt is returned.
+ template <class IterT>
+ std::optional<uint32_t>
+ getMaxEstimatedEdgeWeight(const LoopBlock &SrcBB,
+ iterator_range<IterT> Successors) const;
+
+ /// If \p LoopBB has no estimated weight then set it to \p BBWeight and
+ /// return true. Otherwise \p BB's weight remains unchanged and false is
+ /// returned. In addition all blocks/loops that might need their weight to be
+ /// re-estimated are put into BlockWorkList/LoopWorkList.
+ bool updateEstimatedBlockWeight(LoopBlock &LoopBB, uint32_t BBWeight,
+ SmallVectorImpl<BasicBlock *> &BlockWorkList,
+ SmallVectorImpl<LoopBlock> &LoopWorkList);
+
+ /// Starting from \p LoopBB (including \p LoopBB itself) propagate \p BBWeight
+ /// up the domination tree.
+ void propagateEstimatedBlockWeight(const LoopBlock &LoopBB, DominatorTree *DT,
+ PostDominatorTree *PDT, uint32_t BBWeight,
+ SmallVectorImpl<BasicBlock *> &WorkList,
+ SmallVectorImpl<LoopBlock> &LoopWorkList);
+
+ /// Returns block's weight encoded in the IR.
+ std::optional<uint32_t> getInitialEstimatedBlockWeight(const BasicBlock *BB);
+
+ // Computes estimated weights for all blocks in \p F.
+ void estimateBlockWeights(const Function &F, DominatorTree *DT,
+ PostDominatorTree *PDT);
+
+ /// Based on computed weights by \p computeEstimatedBlockWeight set
+ /// probabilities on branches.
+ bool calcEstimatedHeuristics(const BasicBlock *BB);
+ bool calcMetadataWeights(const BasicBlock *BB);
+ bool calcPointerHeuristics(const BasicBlock *BB);
+ bool calcZeroHeuristics(const BasicBlock *BB, const TargetLibraryInfo *TLI);
+ bool calcFloatingPointHeuristics(const BasicBlock *BB);
+
+ BranchProbabilityInfo &BPI;
+
+ const LoopInfo *LI = nullptr;
+
+ /// Keeps information about all SCCs in a function.
+ std::unique_ptr<const SccInfo> SccI;
+
+ /// Keeps mapping of a basic block to its estimated weight.
+ SmallDenseMap<const BasicBlock *, uint32_t> EstimatedBlockWeight;
+
+ /// Keeps mapping of a loop to estimated weight to enter the loop.
+ SmallDenseMap<LoopData, uint32_t> EstimatedLoopWeight;
+};
+
+BPIConstruction::SccInfo::SccInfo(const Function &F) {
// Record SCC numbers of blocks in the CFG to identify irreducible loops.
// FIXME: We could only calculate this if the CFG is known to be irreducible
// (perhaps cache this info in LoopInfo if we can easily calculate it there?).
@@ -233,14 +425,14 @@ BranchProbabilityInfo::SccInfo::SccInfo(const Function &F) {
}
}
-int BranchProbabilityInfo::SccInfo::getSCCNum(const BasicBlock *BB) const {
+int BPIConstruction::SccInfo::getSCCNum(const BasicBlock *BB) const {
auto SccIt = SccNums.find(BB);
if (SccIt == SccNums.end())
return -1;
return SccIt->second;
}
-void BranchProbabilityInfo::SccInfo::getSccEnterBlocks(
+void BPIConstruction::SccInfo::getSccEnterBlocks(
int SccNum, SmallVectorImpl<BasicBlock *> &Enters) const {
for (auto MapIt : SccBlocks[SccNum]) {
@@ -252,7 +444,7 @@ void BranchProbabilityInfo::SccInfo::getSccEnterBlocks(
}
}
-void BranchProbabilityInfo::SccInfo::getSccExitBlocks(
+void BPIConstruction::SccInfo::getSccExitBlocks(
int SccNum, SmallVectorImpl<BasicBlock *> &Exits) const {
for (auto MapIt : SccBlocks[SccNum]) {
const auto *BB = MapIt.first;
@@ -263,8 +455,8 @@ void BranchProbabilityInfo::SccInfo::getSccExitBlocks(
}
}
-uint32_t BranchProbabilityInfo::SccInfo::getSccBlockType(const BasicBlock *BB,
- int SccNum) const {
+uint32_t BPIConstruction::SccInfo::getSccBlockType(const BasicBlock *BB,
+ int SccNum) const {
assert(getSCCNum(BB) == SccNum);
assert(SccBlocks.size() > static_cast<unsigned>(SccNum) && "Unknown SCC");
@@ -277,8 +469,8 @@ uint32_t BranchProbabilityInfo::SccInfo::getSccBlockType(const BasicBlock *BB,
return Inner;
}
-void BranchProbabilityInfo::SccInfo::calculateSccBlockType(const BasicBlock *BB,
- int SccNum) {
+void BPIConstruction::SccInfo::calculateSccBlockType(const BasicBlock *BB,
+ int SccNum) {
assert(getSCCNum(BB) == SccNum);
uint32_t BlockType = Inner;
@@ -308,9 +500,8 @@ void BranchProbabilityInfo::SccInfo::calculateSccBlockType(const BasicBlock *BB,
}
}
-BranchProbabilityInfo::LoopBlock::LoopBlock(const BasicBlock *BB,
- const LoopInfo &LI,
- const SccInfo &SccI)
+BPIConstruction::LoopBlock::LoopBlock(const BasicBlock *BB, const LoopInfo &LI,
+ const SccInfo &SccI)
: BB(BB) {
LD.first = LI.getLoopFor(BB);
if (!LD.first) {
@@ -318,7 +509,7 @@ BranchProbabilityInfo::LoopBlock::LoopBlock(const BasicBlock *BB,
}
}
-bool BranchProbabilityInfo::isLoopEnteringEdge(const LoopEdge &Edge) const {
+bool BPIConstruction::isLoopEnteringEdge(const LoopEdge &Edge) const {
const auto &SrcBlock = Edge.first;
const auto &DstBlock = Edge.second;
return (DstBlock.getLoop() &&
@@ -328,16 +519,15 @@ bool BranchProbabilityInfo::isLoopEnteringEdge(const LoopEdge &Edge) const {
SrcBlock.getSccNum() != DstBlock.getSccNum());
}
-bool BranchProbabilityInfo::isLoopExitingEdge(const LoopEdge &Edge) const {
+bool BPIConstruction::isLoopExitingEdge(const LoopEdge &Edge) const {
return isLoopEnteringEdge({Edge.second, Edge.first});
}
-bool BranchProbabilityInfo::isLoopEnteringExitingEdge(
- const LoopEdge &Edge) const {
+bool BPIConstruction::isLoopEnteringExitingEdge(const LoopEdge &Edge) const {
return isLoopEnteringEdge(Edge) || isLoopExitingEdge(Edge);
}
-bool BranchProbabilityInfo::isLoopBackEdge(const LoopEdge &Edge) const {
+bool BPIConstruction::isLoopBackEdge(const LoopEdge &Edge) const {
const auto &SrcBlock = Edge.first;
const auto &DstBlock = Edge.second;
return SrcBlock.belongsToSameLoop(DstBlock) &&
@@ -347,7 +537,7 @@ bool BranchProbabilityInfo::isLoopBackEdge(const LoopEdge &Edge) const {
SccI->isSCCHeader(DstBlock.getBlock(), DstBlock.getSccNum())));
}
-void BranchProbabilityInfo::getLoopEnterBlocks(
+void BPIConstruction::getLoopEnterBlocks(
const LoopBlock &LB, SmallVectorImpl<BasicBlock *> &Enters) const {
if (LB.getLoop()) {
auto *Header = LB.getLoop()->getHeader();
@@ -358,7 +548,7 @@ void BranchProbabilityInfo::getLoopEnterBlocks(
}
}
-void BranchProbabilityInfo::getLoopExitBlocks(
+void BPIConstruction::getLoopExitBlocks(
const LoopBlock &LB, SmallVectorImpl<BasicBlock *> &Exits) const {
if (LB.getLoop()) {
LB.getLoop()->getExitBlocks(Exits);
@@ -372,7 +562,7 @@ void BranchProbabilityInfo::getLoopExitBlocks(
// 'expect' intrinsic processing. Examine metadata against unreachable
// heuristic. The probability of the edge coming to unreachable block is
// set to min of metadata and unreachable heuristic.
-bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
+bool BPIConstruction::calcMetadataWeights(const BasicBlock *BB) {
const Instruction *TI = BB->getTerminator();
assert(TI->getNumSuccessors() > 1 && "expected more than one successor!");
if (!(isa<CondBrInst>(TI) || isa<SwitchInst>(TI) || isa<IndirectBrInst>(TI) ||
@@ -395,10 +585,11 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
SmallVector<unsigned, 2> ReachableIdxs;
extractBranchWeights(WeightsNode, Weights);
+ auto Succs = succ_begin(TI);
for (unsigned I = 0, E = Weights.size(); I != E; ++I) {
WeightSum += Weights[I];
const LoopBlock SrcLoopBB = getLoopBlock(BB);
- const LoopBlock DstLoopBB = getLoopBlock(TI->getSuccessor(I));
+ const LoopBlock DstLoopBB = getLoopBlock(*Succs++);
auto EstimatedWeight = getEstimatedEdgeWeight({SrcLoopBB, DstLoopBB});
if (EstimatedWeight &&
*EstimatedWeight <= static_cast<uint32_t>(BlockExecWeight::UNREACHABLE))
@@ -437,7 +628,7 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
// Examine the metadata against unreachable heuristic.
// If the unreachable heuristic is more strong then we use it for this edge.
if (UnreachableIdxs.size() == 0 || ReachableIdxs.size() == 0) {
- setEdgeProbability(BB, BP);
+ BPI.setEdgeProbability(BB, BP);
return true;
}
@@ -501,14 +692,14 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
}
}
- setEdgeProbability(BB, BP);
+ BPI.setEdgeProbability(BB, BP);
return true;
}
// Calculate Edge Weights using "Pointer Heuristics". Predict a comparison
// between two pointer or pointer and NULL will fail.
-bool BranchProbabilityInfo::calcPointerHeuristics(const BasicBlock *BB) {
+bool BPIConstruction::calcPointerHeuristics(const BasicBlock *BB) {
const CondBrInst *BI = dyn_cast<CondBrInst>(BB->getTerminator());
if (!BI)
return false;
@@ -528,7 +719,7 @@ bool BranchProbabilityInfo::calcPointerHeuristics(const BasicBlock *BB) {
auto Search = PointerTable.find(CI->getPredicate());
if (Search == PointerTable.end())
return false;
- setEdgeProbability(BB, Search->second);
+ BPI.setEdgeProbability(BB, Search->second);
return true;
}
@@ -638,7 +829,7 @@ computeUnlikelySuccessors(const BasicBlock *BB, Loop *L,
}
std::optional<uint32_t>
-BranchProbabilityInfo::getEstimatedBlockWeight(const BasicBlock *BB) const {
+BPIConstruction::getEstimatedBlockWeight(const BasicBlock *BB) const {
auto WeightIt = EstimatedBlockWeight.find(BB);
if (WeightIt == EstimatedBlockWeight.end())
return std::nullopt;
@@ -646,7 +837,7 @@ BranchProbabilityInfo::getEstimatedBlockWeight(const BasicBlock *BB) const {
}
std::optional<uint32_t>
-BranchProbabilityInfo::getEstimatedLoopWeight(const LoopData &L) const {
+BPIConstruction::getEstimatedLoopWeight(const LoopData &L) const {
auto WeightIt = EstimatedLoopWeight.find(L);
if (WeightIt == EstimatedLoopWeight.end())
return std::nullopt;
@@ -654,7 +845,7 @@ BranchProbabilityInfo::getEstimatedLoopWeight(const LoopData &L) const {
}
std::optional<uint32_t>
-BranchProbabilityInfo::getEstimatedEdgeWeight(const LoopEdge &Edge) const {
+BPIConstruction::getEstimatedEdgeWeight(const LoopEdge &Edge) const {
// For edges entering a loop take weight of a loop rather than an individual
// block in the loop.
return isLoopEnteringEdge(Edge)
@@ -663,7 +854,7 @@ BranchProbabilityInfo::getEstimatedEdgeWeight(const LoopEdge &Edge) const {
}
template <class IterT>
-std::optional<uint32_t> BranchProbabilityInfo::getMaxEstimatedEdgeWeight(
+std::optional<uint32_t> BPIConstruction::getMaxEstimatedEdgeWeight(
const LoopBlock &SrcLoopBB, iterator_range<IterT> Successors) const {
std::optional<uint32_t> MaxWeight;
for (const BasicBlock *DstBB : Successors) {
@@ -685,7 +876,7 @@ std::optional<uint32_t> BranchProbabilityInfo::getMaxEstimatedEdgeWeight(
//
// Please note by the algorithm the weight is not expected to change once set
// thus 'false' status is used to track visited blocks.
-bool BranchProbabilityInfo::updateEstimatedBlockWeight(
+bool BPIConstruction::updateEstimatedBlockWeight(
LoopBlock &LoopBB, uint32_t BBWeight,
SmallVectorImpl<BasicBlock *> &BlockWorkList,
SmallVectorImpl<LoopBlock> &LoopWorkList) {
@@ -723,7 +914,7 @@ bool BranchProbabilityInfo::updateEstimatedBlockWeight(
//
// In addition, \p WorkList is populated with basic blocks if at least one
// successor has updated estimated weight.
-void BranchProbabilityInfo::propagateEstimatedBlockWeight(
+void BPIConstruction::propagateEstimatedBlockWeight(
const LoopBlock &LoopBB, DominatorTree *DT, PostDominatorTree *PDT,
uint32_t BBWeight, SmallVectorImpl<BasicBlock *> &BlockWorkList,
SmallVectorImpl<LoopBlock> &LoopWorkList) {
@@ -757,7 +948,7 @@ void BranchProbabilityInfo::propagateEstimatedBlockWeight(
}
std::optional<uint32_t>
-BranchProbabilityInfo::getInitialEstimatedBlockWeight(const BasicBlock *BB) {
+BPIConstruction::getInitialEstimatedBlockWeight(const BasicBlock *BB) {
// Returns true if \p BB has call marked with "NoReturn" attribute.
auto hasNoReturn = [&](const BasicBlock *BB) {
for (const auto &I : reverse(*BB))
@@ -797,9 +988,8 @@ BranchProbabilityInfo::getInitialEstimatedBlockWeight(const BasicBlock *BB) {
// Does RPO traversal over all blocks in \p F and assigns weights to
// 'unreachable', 'noreturn', 'cold', 'unwind' blocks. In addition it does its
// best to propagate the weight to up/down the IR.
-void BranchProbabilityInfo::estimateBlockWeights(const Function &F,
- DominatorTree *DT,
- PostDominatorTree *PDT) {
+void BPIConstruction::estimateBlockWeights(const Function &F, DominatorTree *DT,
+ PostDominatorTree *PDT) {
SmallVector<BasicBlock *, 8> BlockWorkList;
SmallVector<LoopBlock, 8> LoopWorkList;
SmallDenseMap<LoopData, SmallVector<BasicBlock *, 4>> LoopExitBlocks;
@@ -868,7 +1058,7 @@ void BranchProbabilityInfo::estimateBlockWeights(const Function &F,
// Calculate edge probabilities based on block's estimated weight.
// Note that gathered weights were not scaled for loops. Thus edges entering
// and exiting loops requires special processing.
-bool BranchProbabilityInfo::calcEstimatedHeuristics(const BasicBlock *BB) {
+bool BPIConstruction::calcEstimatedHeuristics(const BasicBlock *BB) {
assert(BB->getTerminator()->getNumSuccessors() > 1 &&
"expected more than one successor!");
@@ -951,12 +1141,12 @@ bool BranchProbabilityInfo::calcEstimatedHeuristics(const BasicBlock *BB) {
EdgeProbabilities[Idx] =
BranchProbability(SuccWeights[Idx], (uint32_t)TotalWeight);
}
- setEdgeProbability(BB, EdgeProbabilities);
+ BPI.setEdgeProbability(BB, EdgeProbabilities);
return true;
}
-bool BranchProbabilityInfo::calcZeroHeuristics(const BasicBlock *BB,
- const TargetLibraryInfo *TLI) {
+bool BPIConstruction::calcZeroHeuristics(const BasicBlock *BB,
+ const TargetLibraryInfo *TLI) {
const CondBrInst *BI = dyn_cast<CondBrInst>(BB->getTerminator());
if (!BI)
return false;
@@ -1018,11 +1208,11 @@ bool BranchProbabilityInfo::calcZeroHeuristics(const BasicBlock *BB,
return false;
}
- setEdgeProbability(BB, Search->second);
+ BPI.setEdgeProbability(BB, Search->second);
return true;
}
-bool BranchProbabilityInfo::calcFloatingPointHeuristics(const BasicBlock *BB) {
+bool BPIConstruction::calcFloatingPointHeuristics(const BasicBlock *BB) {
const CondBrInst *BI = dyn_cast<CondBrInst>(BB->getTerminator());
if (!BI)
return false;
@@ -1046,13 +1236,86 @@ bool BranchProbabilityInfo::calcFloatingPointHeuristics(const BasicBlock *BB) {
ProbList = Search->second;
}
- setEdgeProbability(BB, ProbList);
+ BPI.setEdgeProbability(BB, ProbList);
return true;
}
+void BPIConstruction::calculate(const Function &F, const LoopInfo &LoopI,
+ const TargetLibraryInfo *TLI, DominatorTree *DT,
+ PostDominatorTree *PDT) {
+ LI = &LoopI;
-void BranchProbabilityInfo::releaseMemory() {
- Probs.clear();
- Handles.clear();
+ SccI = std::make_unique<SccInfo>(F);
+
+ std::unique_ptr<DominatorTree> DTPtr;
+ std::unique_ptr<PostDominatorTree> PDTPtr;
+
+ if (!DT) {
+ DTPtr = std::make_unique<DominatorTree>(const_cast<Function &>(F));
+ DT = DTPtr.get();
+ }
+
+ if (!PDT) {
+ PDTPtr = std::make_unique<PostDominatorTree>(const_cast<Function &>(F));
+ PDT = PDTPtr.get();
+ }
+
+ estimateBlockWeights(F, DT, PDT);
+
+ // Walk the basic blocks in post-order so that we can build up state about
+ // the successors of a block iteratively.
+ for (const auto *BB : post_order(&F.getEntryBlock())) {
+ LLVM_DEBUG(dbgs() << "Computing probabilities for " << BB->getName()
+ << "\n");
+    // If there are fewer than two successors, there is no point in setting probabilities.
+ if (BB->getTerminator()->getNumSuccessors() < 2)
+ continue;
+ if (calcMetadataWeights(BB))
+ continue;
+ if (calcEstimatedHeuristics(BB))
+ continue;
+ if (calcPointerHeuristics(BB))
+ continue;
+ if (calcZeroHeuristics(BB, TLI))
+ continue;
+ if (calcFloatingPointHeuristics(BB))
+ continue;
+ }
+}
+
+} // end anonymous namespace
+
+MutableArrayRef<BranchProbability>
+BranchProbabilityInfo::allocEdges(const BasicBlock *BB) {
+ assert(BB->getParent() == LastF);
+ assert(BlockNumberEpoch == LastF->getBlockNumberEpoch());
+ unsigned NumSuccs = succ_size(BB);
+ if (NumSuccs == 0) {
+ eraseBlock(BB);
+ return {};
+ }
+ if (EdgeStarts.size() <= BB->getNumber())
+ EdgeStarts.resize(LastF->getMaxBlockNumber(), 0);
+ unsigned EdgeStart = Probs.size();
+ EdgeStarts[BB->getNumber()] = EdgeStart + 1; // 0 = no edges.
+ Probs.append(NumSuccs, {});
+ return MutableArrayRef(&Probs[EdgeStart], NumSuccs);
+}
+
+ArrayRef<BranchProbability>
+BranchProbabilityInfo::getEdges(const BasicBlock *BB) const {
+ assert(BB->getParent() == LastF);
+ assert(BlockNumberEpoch == LastF->getBlockNumberEpoch());
+ if (EdgeStarts.size() <= BB->getNumber())
+ return {};
+ if (unsigned EdgeStart = EdgeStarts[BB->getNumber()]) {
+ const BranchProbability *Start = &Probs[EdgeStart - 1]; // 0 = no edges.
+ size_t Count = SIZE_MAX; // Avoid querying num successors in release builds.
+#ifndef NDEBUG
+ Count = succ_size(BB);
+#endif
+ return ArrayRef(Start, Count);
+ }
+ return {};
}
bool BranchProbabilityInfo::invalidate(Function &, const PreservedAnalyses &PA,
@@ -1089,22 +1352,15 @@ isEdgeHot(const BasicBlock *Src, const BasicBlock *Dst) const {
BranchProbability
BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src,
unsigned IndexInSuccessors) const {
- auto I = Probs.find(std::make_pair(Src, IndexInSuccessors));
- assert((Probs.end() == Probs.find(std::make_pair(Src, 0))) ==
- (Probs.end() == I) &&
- "Probability for I-th successor must always be defined along with the "
- "probability for the first successor");
-
- if (I != Probs.end())
- return I->second;
-
+ if (ArrayRef<BranchProbability> P = getEdges(Src); !P.empty())
+ return P[IndexInSuccessors];
return {1, static_cast<uint32_t>(succ_size(Src))};
}
BranchProbability
BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src,
const_succ_iterator Dst) const {
- return getEdgeProbability(Src, Dst.getSuccessorIndex());
+ return getEdgeProbability(Src, std::distance(succ_begin(Src), Dst));
}
/// Get the raw edge probability calculated for the block pair. This returns the
@@ -1112,13 +1368,14 @@ BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src,
BranchProbability
BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src,
const BasicBlock *Dst) const {
- if (!Probs.count(std::make_pair(Src, 0)))
+ ArrayRef<BranchProbability> P = getEdges(Src);
+ if (P.empty())
return BranchProbability(llvm::count(successors(Src), Dst), succ_size(Src));
auto Prob = BranchProbability::getZero();
- for (const_succ_iterator I = succ_begin(Src), E = succ_end(Src); I != E; ++I)
- if (*I == Dst)
- Prob += Probs.find(std::make_pair(Src, I.getSuccessorIndex()))->second;
+ for (auto It : enumerate(successors(Src)))
+ if (It.value() == Dst)
+ Prob += P[It.index()];
return Prob;
}
@@ -1127,14 +1384,10 @@ BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src,
void BranchProbabilityInfo::setEdgeProbability(
const BasicBlock *Src, const SmallVectorImpl<BranchProbability> &Probs) {
assert(Src->getTerminator()->getNumSuccessors() == Probs.size());
- eraseBlock(Src); // Erase stale data if any.
- if (Probs.size() == 0)
- return; // Nothing to set.
-
- Handles.insert(BasicBlockCallbackVH(Src, this));
+ MutableArrayRef<BranchProbability> P = allocEdges(Src);
uint64_t TotalNumerator = 0;
for (unsigned SuccIdx = 0; SuccIdx < Probs.size(); ++SuccIdx) {
- this->Probs[std::make_pair(Src, SuccIdx)] = Probs[SuccIdx];
+ P[SuccIdx] = Probs[SuccIdx];
LLVM_DEBUG(dbgs() << "set edge " << Src->getName() << " -> " << SuccIdx
<< " successor probability to " << Probs[SuccIdx]
<< "\n");
@@ -1146,6 +1399,8 @@ void BranchProbabilityInfo::setEdgeProbability(
// Instead, every single probability in Probs must be as accurate as possible.
// This results in error 1/denominator at most, thus the total absolute error
// should be within Probs.size / BranchProbability::getDenominator.
+ if (P.empty())
+ return; // If we store no probabilities, TotalNumerator is zero.
assert(TotalNumerator <= BranchProbability::getDenominator() + Probs.size());
assert(TotalNumerator >= BranchProbability::getDenominator() - Probs.size());
(void)TotalNumerator;
@@ -1153,31 +1408,30 @@ void BranchProbabilityInfo::setEdgeProbability(
void BranchProbabilityInfo::copyEdgeProbabilities(BasicBlock *Src,
BasicBlock *Dst) {
- eraseBlock(Dst); // Erase stale data if any.
- unsigned NumSuccessors = Src->getTerminator()->getNumSuccessors();
- assert(NumSuccessors == Dst->getTerminator()->getNumSuccessors());
- if (NumSuccessors == 0)
- return; // Nothing to set.
- if (!this->Probs.contains(std::make_pair(Src, 0)))
- return; // No probability is set for edges from Src. Keep the same for Dst.
-
- Handles.insert(BasicBlockCallbackVH(Dst, this));
- for (unsigned SuccIdx = 0; SuccIdx < NumSuccessors; ++SuccIdx) {
- auto Prob = this->Probs[std::make_pair(Src, SuccIdx)];
- this->Probs[std::make_pair(Dst, SuccIdx)] = Prob;
- LLVM_DEBUG(dbgs() << "set edge " << Dst->getName() << " -> " << SuccIdx
- << " successor probability to " << Prob << "\n");
+ assert(succ_size(Src) == succ_size(Dst));
+ // allocEdges can reallocate and must be called first.
+ MutableArrayRef<BranchProbability> DstP = allocEdges(Dst);
+ ArrayRef<BranchProbability> SrcP = getEdges(Src);
+ if (SrcP.empty()) {
+ // Nothing to copy from, erase again.
+ eraseBlock(Dst);
+ return;
+ }
+ for (unsigned i = 0; i != DstP.size(); ++i) {
+ DstP[i] = SrcP[i];
+ LLVM_DEBUG(dbgs() << "set edge " << Dst->getName() << " -> " << i
+ << " successor probability to " << SrcP[i] << "\n");
}
}
void BranchProbabilityInfo::swapSuccEdgesProbabilities(const BasicBlock *Src) {
assert(Src->getTerminator()->getNumSuccessors() == 2);
- auto It0 = Probs.find(std::make_pair(Src, 0));
- if (It0 == Probs.end())
- return; // No probability is set for edges from Src
- auto It1 = Probs.find(std::make_pair(Src, 1));
- assert(It1 != Probs.end());
- std::swap(It0->second, It1->second);
+ ArrayRef<BranchProbability> P = getEdges(Src);
+ if (P.empty())
+ return;
+ MutableArrayRef<BranchProbability> MP(
+ const_cast<BranchProbability *>(P.data()), P.size());
+ std::swap(MP[0], MP[1]);
}
raw_ostream &
@@ -1197,24 +1451,10 @@ BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS,
void BranchProbabilityInfo::eraseBlock(const BasicBlock *BB) {
LLVM_DEBUG(dbgs() << "eraseBlock " << BB->getName() << "\n");
-
- // Note that we cannot use successors of BB because the terminator of BB may
- // have changed when eraseBlock is called as a BasicBlockCallbackVH callback.
- // Instead we remove prob data for the block by iterating successors by their
- // indices from 0 till the last which exists. There could not be prob data for
- // a pair (BB, N) if there is no data for (BB, N-1) because the data is always
- // set for all successors from 0 to M at once by the method
- // setEdgeProbability().
- Handles.erase(BasicBlockCallbackVH(BB, this));
- for (unsigned I = 0;; ++I) {
- auto MapI = Probs.find(std::make_pair(BB, I));
- if (MapI == Probs.end()) {
- assert(Probs.count(std::make_pair(BB, I + 1)) == 0 &&
- "Must be no more successors");
- return;
- }
- Probs.erase(MapI);
- }
+ assert(BB->getParent() == LastF);
+ assert(BlockNumberEpoch == LastF->getBlockNumberEpoch());
+ if (EdgeStarts.size() > BB->getNumber())
+ EdgeStarts[BB->getNumber()] = 0;
}
void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LoopI,
@@ -1224,51 +1464,10 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LoopI,
LLVM_DEBUG(dbgs() << "---- Branch Probability Info : " << F.getName()
<< " ----\n\n");
LastF = &F; // Store the last function we ran on for printing.
- LI = &LoopI;
-
- SccI = std::make_unique<SccInfo>(F);
-
- assert(EstimatedBlockWeight.empty());
- assert(EstimatedLoopWeight.empty());
-
- std::unique_ptr<DominatorTree> DTPtr;
- std::unique_ptr<PostDominatorTree> PDTPtr;
-
- if (!DT) {
- DTPtr = std::make_unique<DominatorTree>(const_cast<Function &>(F));
- DT = DTPtr.get();
- }
-
- if (!PDT) {
- PDTPtr = std::make_unique<PostDominatorTree>(const_cast<Function &>(F));
- PDT = PDTPtr.get();
- }
-
- estimateBlockWeights(F, DT, PDT);
-
- // Walk the basic blocks in post-order so that we can build up state about
- // the successors of a block iteratively.
- for (const auto *BB : post_order(&F.getEntryBlock())) {
- LLVM_DEBUG(dbgs() << "Computing probabilities for " << BB->getName()
- << "\n");
- // If there is no at least two successors, no sense to set probability.
- if (BB->getTerminator()->getNumSuccessors() < 2)
- continue;
- if (calcMetadataWeights(BB))
- continue;
- if (calcEstimatedHeuristics(BB))
- continue;
- if (calcPointerHeuristics(BB))
- continue;
- if (calcZeroHeuristics(BB, TLI))
- continue;
- if (calcFloatingPointHeuristics(BB))
- continue;
- }
-
- EstimatedLoopWeight.clear();
- EstimatedBlockWeight.clear();
- SccI.reset();
+ BlockNumberEpoch = F.getBlockNumberEpoch();
+ Probs.clear();
+ EdgeStarts.clear();
+ BPIConstruction(*this).calculate(F, LoopI, TLI, DT, PDT);
if (PrintBranchProb && (PrintBranchProbFuncName.empty() ||
F.getName() == PrintBranchProbFuncName)) {
@@ -1300,8 +1499,6 @@ bool BranchProbabilityInfoWrapperPass::runOnFunction(Function &F) {
return false;
}
-void BranchProbabilityInfoWrapperPass::releaseMemory() { BPI.releaseMemory(); }
-
void BranchProbabilityInfoWrapperPass::print(raw_ostream &OS,
const Module *) const {
BPI.print(OS);
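
The new allocEdges/getEdges pair replaces the old per-(block, successor-index) DenseMap with one flat probability vector plus a per-block-number offset table. A self-contained toy model of that encoding, using doubles instead of BranchProbability (member names mirror the patch; everything else is illustrative):

  #include <cassert>
  #include <vector>

  struct FlatEdgeProbs {
    std::vector<double> Probs;        // probabilities for all blocks, concatenated
    std::vector<unsigned> EdgeStarts; // per block number: offset into Probs + 1, 0 = none

    void alloc(unsigned BlockNo, unsigned NumSuccs, double Init) {
      if (EdgeStarts.size() <= BlockNo)
        EdgeStarts.resize(BlockNo + 1, 0);
      EdgeStarts[BlockNo] = Probs.size() + 1; // 0 is reserved for "no data"
      Probs.insert(Probs.end(), NumSuccs, Init);
    }

    const double *get(unsigned BlockNo) const {
      if (BlockNo >= EdgeStarts.size() || EdgeStarts[BlockNo] == 0)
        return nullptr;
      return &Probs[EdgeStarts[BlockNo] - 1];
    }
  };

  int main() {
    FlatEdgeProbs P;
    P.alloc(/*BlockNo=*/3, /*NumSuccs=*/2, 0.5);
    assert(P.get(3) && P.get(3)[1] == 0.5);
    assert(!P.get(0)); // no probabilities recorded for block 0
  }

The real implementation additionally ties the table to the function's block-number epoch, which is why calculate() now records F.getBlockNumberEpoch() and clears Probs/EdgeStarts up front.
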
diff --git a/llvm/lib/Analysis/CFG.cpp b/llvm/lib/Analysis/CFG.cpp
index 1676d6b6f592e..da74691a6e6a5 100644
--- a/llvm/lib/Analysis/CFG.cpp
+++ b/llvm/lib/Analysis/CFG.cpp
@@ -35,40 +35,49 @@ static cl::opt<unsigned> DefaultMaxBBsToExplore(
void llvm::FindFunctionBackedges(const Function &F,
SmallVectorImpl<std::pair<const BasicBlock*,const BasicBlock*> > &Result) {
const BasicBlock *BB = &F.getEntryBlock();
- if (succ_empty(BB))
- return;
- SmallPtrSet<const BasicBlock*, 8> Visited;
- SmallVector<std::pair<const BasicBlock *, const_succ_iterator>, 8> VisitStack;
- SmallPtrSet<const BasicBlock*, 8> InStack;
+ // In the DFS traversal, we maintain three states: unvisited, visited in the
+ // past, and visited and currently in the DFS stack. If we have an edge to a
+ // block in the stack, we have found a backedge.
+ enum VisitState : uint8_t { Unvisited = 0, Visited = 1, InStack = 2 };
+ SmallVector<VisitState> BlockState(F.getMaxBlockNumber(), Unvisited);
+ struct StackEntry {
+ const BasicBlock *BB;
+ const_succ_iterator SuccIt;
+ const_succ_iterator SuccEnd;
+
+ StackEntry(const BasicBlock *BB)
+ : BB(BB), SuccIt(nullptr), SuccEnd(nullptr) {
+ auto Succs = successors(BB);
+ SuccIt = Succs.begin();
+ SuccEnd = Succs.end();
+ }
+ };
+ SmallVector<StackEntry, 8> VisitStack;
- Visited.insert(BB);
- VisitStack.push_back(std::make_pair(BB, succ_begin(BB)));
- InStack.insert(BB);
+ BlockState[BB->getNumber()] = InStack;
+ VisitStack.emplace_back(BB);
do {
- std::pair<const BasicBlock *, const_succ_iterator> &Top = VisitStack.back();
- const BasicBlock *ParentBB = Top.first;
- const_succ_iterator &I = Top.second;
-
+ StackEntry &Top = VisitStack.back();
bool FoundNew = false;
- while (I != succ_end(ParentBB)) {
- BB = *I++;
- if (Visited.insert(BB).second) {
+ while (Top.SuccIt != Top.SuccEnd) {
+ BB = *Top.SuccIt++;
+ if (BlockState[BB->getNumber()] == Unvisited) {
+ // Unvisited successor => go down one level.
+ BlockState[BB->getNumber()] = InStack;
+ VisitStack.emplace_back(BB);
FoundNew = true;
break;
}
- // Successor is in VisitStack, it's a back edge.
- if (InStack.count(BB))
- Result.push_back(std::make_pair(ParentBB, BB));
+ // Successor in VisitStack => backedge.
+ if (BlockState[BB->getNumber()] == InStack)
+ Result.emplace_back(Top.BB, BB);
}
- if (FoundNew) {
- // Go down one level if there is a unvisited successor.
- InStack.insert(BB);
- VisitStack.push_back(std::make_pair(BB, succ_begin(BB)));
- } else {
- // Go up one level.
- InStack.erase(VisitStack.pop_back_val().first);
+ // Go up one level.
+ if (!FoundNew) {
+ BlockState[Top.BB->getNumber()] = Visited;
+ VisitStack.pop_back();
}
} while (!VisitStack.empty());
}
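
The rewritten FindFunctionBackedges swaps the Visited/InStack pointer sets for a single vector indexed by block number with three states. A standalone sketch of the same DFS on a plain adjacency list, only to make the state machine explicit (names chosen here, not from LLVM):

  #include <cstdint>
  #include <utility>
  #include <vector>

  enum VisitState : uint8_t { Unvisited, Visited, InStack };

  void findBackedgesSketch(const std::vector<std::vector<int>> &Succs, int Entry,
                           std::vector<std::pair<int, int>> &Result) {
    std::vector<VisitState> State(Succs.size(), Unvisited);
    std::vector<std::pair<int, size_t>> Stack; // node and next successor index
    State[Entry] = InStack;
    Stack.push_back({Entry, 0});
    while (!Stack.empty()) {
      auto &[Node, NextSucc] = Stack.back();
      if (NextSucc == Succs[Node].size()) {
        State[Node] = Visited; // all successors handled, leave the stack
        Stack.pop_back();
        continue;
      }
      int To = Succs[Node][NextSucc++];
      if (State[To] == Unvisited) {
        State[To] = InStack;   // go down one level
        Stack.push_back({To, 0});
      } else if (State[To] == InStack) {
        Result.push_back({Node, To}); // edge into the DFS stack => backedge
      }
    }
  }
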
diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp
index f01a6f80161a5..a427a261d82d3 100644
--- a/llvm/lib/Analysis/DXILResource.cpp
+++ b/llvm/lib/Analysis/DXILResource.cpp
@@ -692,7 +692,7 @@ MDTuple *ResourceInfo::getAsMetadata(Module &M,
MDVals.push_back(MDString::get(Ctx, Name));
MDVals.push_back(getIntMD(Binding.Space));
MDVals.push_back(getIntMD(Binding.LowerBound));
- MDVals.push_back(getIntMD(Binding.Size));
+ MDVals.push_back(getIntMD(Binding.Size == 0 ? ~0u : Binding.Size));
if (RTI.isCBuffer()) {
MDVals.push_back(getIntMD(RTI.getCBufferSize(DL)));
@@ -1071,14 +1071,12 @@ void DXILResourceBindingInfo::populate(Module &M, DXILResourceTypeMap &DRTM) {
cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
Value *Name = CI->getArgOperand(4);
- // UINT32_MAX (~0U) size means unbounded resource array;
+ // 0 size means unbounded resource array;
// upper bound register overflow should be detected in Sema
- assert((Size == UINT32_MAX ||
- (uint64_t)LowerBound + (uint64_t)Size - 1ULL <=
- (uint64_t)UINT32_MAX) &&
+ assert((Size == 0 || (uint64_t)LowerBound + (uint64_t)Size - 1ULL <=
+ (uint64_t)UINT32_MAX) &&
"upper bound register overflow");
- uint32_t UpperBound =
- Size == UINT32_MAX ? UINT32_MAX : LowerBound + Size - 1;
+ uint32_t UpperBound = Size == 0 ? UINT32_MAX : LowerBound + Size - 1;
Builder.trackBinding(RTI.getResourceClass(), Space, LowerBound,
UpperBound, Name);
}
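
The new encoding fits in a couple of lines; a small sketch of the convention
as it reads from the diff (the helper names here are made up for
illustration):

#include <cassert>
#include <cstdint>
#include <limits>

// Size == 0 encodes an unbounded resource array; otherwise the binding covers
// registers [LowerBound, LowerBound + Size - 1].
uint32_t upperBound(uint32_t LowerBound, uint32_t Size) {
  assert((Size == 0 || (uint64_t)LowerBound + (uint64_t)Size - 1 <=
                           (uint64_t)std::numeric_limits<uint32_t>::max()) &&
         "upper bound register overflow");
  return Size == 0 ? std::numeric_limits<uint32_t>::max()
                   : LowerBound + Size - 1;
}

// The metadata form still spells "unbounded" as ~0u, matching the
// getAsMetadata change above.
uint32_t sizeForMetadata(uint32_t Size) { return Size == 0 ? ~0u : Size; }
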
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index c7aefb2035434..13f560f2958bc 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -1629,14 +1629,16 @@ inferDomainOfAffine(OverflowSafeSignedAPInt A, OverflowSafeSignedAPInt B,
// This is a modified version of the original Banerjee algorithm. The original
// only tested whether Dst depends on Src. This algorithm extends that and
// returns all the dependencies that exist between Dst and Src.
-bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
- const SCEV *SrcConst, const SCEV *DstConst,
- const Loop *CurSrcLoop,
- const Loop *CurDstLoop, unsigned Level,
+bool DependenceInfo::exactSIVtest(const SCEVAddRecExpr *Src,
+ const SCEVAddRecExpr *Dst, unsigned Level,
FullDependence &Result) const {
if (!isDependenceTestEnabled(DependenceTestType::ExactSIV))
return false;
+ const SCEV *SrcCoeff = Src->getStepRecurrence(*SE);
+ const SCEV *SrcConst = Src->getStart();
+ const SCEV *DstCoeff = Dst->getStepRecurrence(*SE);
+ const SCEV *DstConst = Dst->getStart();
LLVM_DEBUG(dbgs() << "\tExact SIV test\n");
LLVM_DEBUG(dbgs() << "\t SrcCoeff = " << *SrcCoeff << " = AM\n");
LLVM_DEBUG(dbgs() << "\t DstCoeff = " << *DstCoeff << " = BM\n");
@@ -1674,7 +1676,7 @@ bool DependenceInfo::exactSIVtest(const SCEV *SrcCoeff, const SCEV *DstCoeff,
std::optional<APInt> UM;
// UM is perhaps unavailable, let's check
if (const SCEVConstant *CUB =
- collectConstantUpperBound(CurSrcLoop, Delta->getType())) {
+ collectConstantUpperBound(Src->getLoop(), Delta->getType())) {
UM = CUB->getAPInt();
LLVM_DEBUG(dbgs() << "\t UM = " << *UM << "\n");
}
@@ -2298,8 +2300,7 @@ bool DependenceInfo::testSIV(const SCEV *Src, const SCEV *Dst, unsigned &Level,
disproven = weakCrossingSIVtest(SrcCoeff, SrcConst, DstConst, CurSrcLoop,
CurDstLoop, Level, Result);
else
- disproven = exactSIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst,
- CurSrcLoop, CurDstLoop, Level, Result);
+ disproven = exactSIVtest(SrcAddRec, DstAddRec, Level, Result);
return disproven || gcdMIVtest(Src, Dst, Result) ||
symbolicRDIVtest(SrcCoeff, DstCoeff, SrcConst, DstConst, CurSrcLoop,
CurDstLoop);
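
Independent of the refactoring, the exact SIV test bottoms out in a linear
Diophantine solvability check; a textbook-style sketch of that core condition
(not the patch's code, which additionally intersects the solutions with the
loop bounds):

#include <cstdint>
#include <numeric>

// A[a*i + c1] versus A[b*j + c2]: a dependence needs integers i, j with
// a*i - b*j = c2 - c1, which is solvable over the integers iff gcd(a, b)
// divides (c2 - c1).
bool maybeDependent(int64_t a, int64_t c1, int64_t b, int64_t c2) {
  int64_t g = std::gcd(a, b); // gcd of the absolute values
  if (g == 0)                 // both coefficients zero: only the constants
    return c1 == c2;          // matter
  return (c2 - c1) % g == 0;
}

For example a = 2, c1 = 0 against b = 4, c2 = 1 gives gcd 2, which does not
divide 1, so no dependence is possible.
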
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index c387404bdaee7..da23af9c456ad 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -6040,11 +6040,6 @@ static bool BrPHIToSelect(DominatorTree &DT, CondBrInst *BI, PHINode *Merge,
BasicBlockEdge LeftEdge(BI->getParent(), BI->getSuccessor(0));
BasicBlockEdge RightEdge(BI->getParent(), BI->getSuccessor(1));
- if (!LeftEdge.isSingleEdge())
- return false;
-
- assert(RightEdge.isSingleEdge() && "Follows from LeftEdge.isSingleEdge()");
-
Use &LeftUse = Merge->getOperandUse(0);
Use &RightUse = Merge->getOperandUse(1);
@@ -11878,27 +11873,20 @@ bool ScalarEvolution::isLoopBackedgeGuardedByCond(const Loop *L,
if (!PBB)
continue;
- CondBrInst *ContinuePredicate = dyn_cast<CondBrInst>(PBB->getTerminator());
- if (!ContinuePredicate)
+ CondBrInst *ContBr = dyn_cast<CondBrInst>(PBB->getTerminator());
+ if (!ContBr || ContBr->getSuccessor(0) == ContBr->getSuccessor(1))
continue;
- Value *Condition = ContinuePredicate->getCondition();
-
// If we have an edge `E` within the loop body that dominates the only
// latch, the condition guarding `E` also guards the backedge. This
// reasoning works only for loops with a single latch.
-
- BasicBlockEdge DominatingEdge(PBB, BB);
- if (DominatingEdge.isSingleEdge()) {
- // We're constructively (and conservatively) enumerating edges within the
- // loop body that dominate the latch. The dominator tree better agree
- // with us on this:
- assert(DT.dominates(DominatingEdge, Latch) && "should be!");
-
- if (isImpliedCond(Pred, LHS, RHS, Condition,
- BB != ContinuePredicate->getSuccessor(0)))
- return true;
- }
+ // We're constructively (and conservatively) enumerating edges within the
+ // loop body that dominate the latch. The dominator tree better agree
+ // with us on this:
+ assert(DT.dominates(BasicBlockEdge(PBB, BB), Latch) && "should be!");
+ if (isImpliedCond(Pred, LHS, RHS, ContBr->getCondition(),
+ BB != ContBr->getSuccessor(0)))
+ return true;
}
return false;
@@ -12475,9 +12463,10 @@ bool ScalarEvolution::isImpliedCondOperandsViaAddRecStart(
// Make sure AR varies in the context block.
if (auto *AR = dyn_cast<SCEVAddRecExpr>(FoundLHS)) {
const Loop *L = AR->getLoop();
+ const auto *Latch = L->getLoopLatch();
// Make sure that context belongs to the loop and executes on 1st iteration
// (if it ever executes at all).
- if (!L->contains(ContextBB) || !DT.dominates(ContextBB, L->getLoopLatch()))
+ if (!L->contains(ContextBB) || !Latch || !DT.dominates(ContextBB, Latch))
return false;
if (!isAvailableAtLoopEntry(FoundRHS, AR->getLoop()))
return false;
@@ -12486,9 +12475,10 @@ bool ScalarEvolution::isImpliedCondOperandsViaAddRecStart(
if (auto *AR = dyn_cast<SCEVAddRecExpr>(FoundRHS)) {
const Loop *L = AR->getLoop();
+ const auto *Latch = L->getLoopLatch();
// Make sure that context belongs to the loop and executes on 1st iteration
// (if it ever executes at all).
- if (!L->contains(ContextBB) || !DT.dominates(ContextBB, L->getLoopLatch()))
+ if (!L->contains(ContextBB) || !Latch || !DT.dominates(ContextBB, Latch))
return false;
if (!isAvailableAtLoopEntry(FoundLHS, AR->getLoop()))
return false;
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index d71fccf8e3223..48d9eb00277b5 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -3005,7 +3005,7 @@ static bool isKnownNonNullFromDominatingCondition(const Value *V,
BasicBlock *NonNullSuccessor =
BI->getSuccessor(NonNullIfTrue ? 0 : 1);
BasicBlockEdge Edge(BI->getParent(), NonNullSuccessor);
- if (Edge.isSingleEdge() && DT->dominates(Edge, CtxI->getParent()))
+ if (DT->dominates(Edge, CtxI->getParent()))
return true;
} else if (NonNullIfTrue && isGuard(Curr) &&
DT->dominates(cast<Instruction>(Curr), CtxI)) {
@@ -5656,6 +5656,9 @@ void computeKnownFPClass(const Value *V, const APInt &DemandedElts,
case Instruction::FRem: {
const bool WantNan = (InterestedClasses & fcNan) != fcNone;
+ if (Op->getOpcode() == Instruction::FRem)
+ Known.knownNot(fcInf);
+
if (Op->getOperand(0) == Op->getOperand(1) &&
isGuaranteedNotToBeUndef(Op->getOperand(0), Q.AC, Q.CxtI, Q.DT)) {
if (Op->getOpcode() == Instruction::FDiv) {
@@ -7529,8 +7532,6 @@ bool llvm::isOverflowIntrinsicNoWrap(const WithOverflowInst *WO,
auto AllUsesGuardedByBranch = [&](const CondBrInst *BI) {
BasicBlockEdge NoWrapEdge(BI->getParent(), BI->getSuccessor(1));
- if (!NoWrapEdge.isSingleEdge())
- return false;
// Check if all users of the add are provably no-wrap.
for (const auto *Result : Results) {
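
A quick standalone check of the fact that the new Known.knownNot(fcInf) for
frem encodes, namely that fmod never yields an infinity whatever the inputs
(illustrative, not part of the patch):

#include <cassert>
#include <cmath>
#include <limits>

int main() {
  // fmod(x, y) is NaN for x = +/-inf or y = 0, and otherwise its magnitude is
  // bounded by |y| (and by |x|), so an infinite result is impossible.
  const double Inf = std::numeric_limits<double>::infinity();
  const double Vals[] = {0.0, 1.5, -3.0, 1e308, -1e308, Inf, -Inf};
  for (double X : Vals)
    for (double Y : Vals)
      assert(!std::isinf(std::fmod(X, Y)));
  return 0;
}
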
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index b3c7bf94fa69c..bfc5f64a78838 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -487,20 +487,26 @@ lltok::Kind LLLexer::LexHash() {
return lltok::hash;
}
-/// Lex a label, integer type, keyword, or hexadecimal integer constant.
+/// Lex a label, integer or byte type, keyword, or hexadecimal integer
+/// constant.
/// Label [-a-zA-Z$._0-9]+:
+/// ByteType b[0-9]+
/// IntegerType i[0-9]+
/// Keyword sdiv, float, ...
/// HexIntConstant [us]0x[0-9A-Fa-f]+
lltok::Kind LLLexer::LexIdentifier() {
const char *StartChar = CurPtr;
- const char *IntEnd = CurPtr[-1] == 'i' ? nullptr : StartChar;
+ const char IntOrByteIdentifier = CurPtr[-1];
+ const char *IntOrByteEnd =
+ (IntOrByteIdentifier == 'i' || IntOrByteIdentifier == 'b') ? nullptr
+ : StartChar;
const char *KeywordEnd = nullptr;
for (; isLabelChar(*CurPtr); ++CurPtr) {
- // If we decide this is an integer, remember the end of the sequence.
- if (!IntEnd && !isdigit(static_cast<unsigned char>(*CurPtr)))
- IntEnd = CurPtr;
+ // If we decide this is a byte or an integer, remember the end of the
+ // sequence.
+ if (!IntOrByteEnd && !isdigit(static_cast<unsigned char>(*CurPtr)))
+ IntOrByteEnd = CurPtr;
if (!KeywordEnd && !isalnum(static_cast<unsigned char>(*CurPtr)) &&
*CurPtr != '_')
KeywordEnd = CurPtr;
@@ -513,18 +519,23 @@ lltok::Kind LLLexer::LexIdentifier() {
return lltok::LabelStr;
}
- // Otherwise, this wasn't a label. If this was valid as an integer type,
- // return it.
- if (!IntEnd) IntEnd = CurPtr;
- if (IntEnd != StartChar) {
- CurPtr = IntEnd;
+ // Otherwise, this wasn't a label. If this was valid as a byte or an integer
+ // type, return it.
+ if (!IntOrByteEnd)
+ IntOrByteEnd = CurPtr;
+ if (IntOrByteEnd != StartChar) {
+ CurPtr = IntOrByteEnd;
uint64_t NumBits = atoull(StartChar, CurPtr);
if (NumBits < IntegerType::MIN_INT_BITS ||
NumBits > IntegerType::MAX_INT_BITS) {
- LexError("bitwidth for integer type out of range");
+ LexError("bitwidth for integer or byte type out of range");
return lltok::Error;
}
- TyVal = IntegerType::get(Context, NumBits);
+ if (IntOrByteIdentifier == 'i')
+ TyVal = IntegerType::get(Context, NumBits);
+ else
+ TyVal = ByteType::get(Context, NumBits);
+
return lltok::Type;
}
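
Outside the lexer, the recognition rule is simple to state: a leading 'i' or
'b' followed only by digits is a sized type, anything else falls through to
keyword lookup (the lexer above additionally range-checks the width). A small
freestanding sketch of that rule; the helper is hypothetical, and the byte
type itself is the extension these patches introduce:

#include <cctype>
#include <cstdint>
#include <string>

enum class Tok { IntType, ByteType, Keyword };

// Classify a bare token (the trailing-':' label case is ignored here).
Tok classify(const std::string &S, uint64_t &Bits) {
  if (S.size() > 1 && (S[0] == 'i' || S[0] == 'b')) {
    bool AllDigits = true;
    for (size_t I = 1; I < S.size() && AllDigits; ++I)
      AllDigits = std::isdigit(static_cast<unsigned char>(S[I])) != 0;
    if (AllDigits) {
      Bits = std::stoull(S.substr(1));
      return S[0] == 'i' ? Tok::IntType : Tok::ByteType;
    }
  }
  Bits = 0;
  return Tok::Keyword;
}

classify("b8", Bits) yields Tok::ByteType with Bits == 8, while
classify("br", Bits) stays a keyword.
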
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index b7832cafc27c8..09b893b33786d 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -4189,12 +4189,13 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) {
if (Elts.empty())
return error(ID.Loc, "constant vector must not be empty");
- if (!Elts[0]->getType()->isIntegerTy() &&
+ if (!Elts[0]->getType()->isIntegerTy() && !Elts[0]->getType()->isByteTy() &&
!Elts[0]->getType()->isFloatingPointTy() &&
!Elts[0]->getType()->isPointerTy())
return error(
FirstEltLoc,
- "vector elements must have integer, pointer or floating point type");
+ "vector elements must have integer, byte, pointer or floating point "
+ "type");
// Verify that all the vector elements have the same type.
for (unsigned i = 1, e = Elts.size(); i != e; ++i)
@@ -4241,15 +4242,16 @@ bool LLParser::parseValID(ValID &ID, PerFunctionState *PFS, Type *ExpectedTy) {
ID.Kind = ValID::t_Constant;
return false;
}
- case lltok::kw_c: // c "foo"
+ case lltok::kw_c: { // c "foo"
Lex.Lex();
- ID.ConstantVal = ConstantDataArray::getString(Context, Lex.getStrVal(),
- false);
+ ArrayType *ATy = cast<ArrayType>(ExpectedTy);
+ ID.ConstantVal = ConstantDataArray::getString(
+ Context, Lex.getStrVal(), false, ATy->getElementType()->isByteTy());
if (parseToken(lltok::StringConstant, "expected string"))
return true;
ID.Kind = ValID::t_Constant;
return false;
-
+ }
case lltok::kw_asm: {
// ValID ::= 'asm' SideEffect? AlignStack? IntelDialect? STRINGCONSTANT ','
// STRINGCONSTANT
@@ -6704,10 +6706,11 @@ bool LLParser::convertValIDToValue(Type *Ty, ValID &ID, Value *&V,
V = NoCFIValue::get(cast<GlobalValue>(V));
return V == nullptr;
case ValID::t_APSInt:
- if (!Ty->isIntegerTy())
- return error(ID.Loc, "integer constant must have integer type");
+ if (!Ty->isIntegerTy() && !Ty->isByteTy())
+ return error(ID.Loc, "integer/byte constant must have integer/byte type");
ID.APSIntVal = ID.APSIntVal.extOrTrunc(Ty->getPrimitiveSizeInBits());
- V = ConstantInt::get(Context, ID.APSIntVal);
+ Ty->isIntegerTy() ? V = ConstantInt::get(Context, ID.APSIntVal)
+ : V = ConstantByte::get(Context, ID.APSIntVal);
return false;
case ValID::t_APFloat:
if (!Ty->isFloatingPointTy() ||
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 5dbab67900dc0..d6d28ca7b5ae7 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -2649,6 +2649,17 @@ Error BitcodeReader::parseTypeTableBody() {
case bitc::TYPE_CODE_TOKEN: // TOKEN
ResultTy = Type::getTokenTy(Context);
break;
+ case bitc::TYPE_CODE_BYTE: { // BYTE: [width]
+ if (Record.empty())
+ return error("Invalid record");
+
+ uint64_t NumBits = Record[0];
+ if (NumBits < ByteType::MIN_BYTE_BITS ||
+ NumBits > ByteType::MAX_BYTE_BITS)
+ return error("Bitwidth for byte type out of range");
+ ResultTy = ByteType::get(Context, NumBits);
+ break;
+ }
case bitc::TYPE_CODE_INTEGER: { // INTEGER: [width]
if (Record.empty())
return error("Invalid integer record");
@@ -3347,6 +3358,20 @@ Error BitcodeReader::parseConstants() {
V = ConstantInt::get(CurTy, VInt);
break;
}
+ case bitc::CST_CODE_BYTE: // BYTE: [byteval]
+ if (!CurTy->isByteOrByteVectorTy() || Record.empty())
+ return error("Invalid byte const record");
+ V = ConstantByte::get(CurTy, decodeSignRotatedValue(Record[0]));
+ break;
+ case bitc::CST_CODE_WIDE_BYTE: { // WIDE_BYTE: [n x byteval]
+ if (!CurTy->isByteOrByteVectorTy() || Record.empty())
+ return error("Invalid wide byte const record");
+
+ auto *ScalarTy = cast<ByteType>(CurTy->getScalarType());
+ APInt VByte = readWideAPInt(Record, ScalarTy->getBitWidth());
+ V = ConstantByte::get(CurTy, VByte);
+ break;
+ }
case bitc::CST_CODE_FLOAT: { // FLOAT: [fpval]
if (Record.empty())
return error("Invalid float const record");
@@ -3409,8 +3434,9 @@ Error BitcodeReader::parseConstants() {
return error("Invalid string record");
SmallString<16> Elts(Record.begin(), Record.end());
- V = ConstantDataArray::getString(Context, Elts,
- BitCode == bitc::CST_CODE_CSTRING);
+ V = ConstantDataArray::getString(
+ Context, Elts, BitCode == bitc::CST_CODE_CSTRING,
+ cast<ArrayType>(CurTy)->getElementType()->isByteTy());
break;
}
case bitc::CST_CODE_DATA: {// DATA: [n x value]
@@ -3446,6 +3472,30 @@ Error BitcodeReader::parseConstants() {
V = ConstantDataVector::get(Context, Elts);
else
V = ConstantDataArray::get(Context, Elts);
+ } else if (EltTy->isByteTy(8)) {
+ SmallVector<uint8_t, 16> Elts(Record.begin(), Record.end());
+ if (isa<VectorType>(CurTy))
+ V = ConstantDataVector::getByte(EltTy, Elts);
+ else
+ V = ConstantDataArray::getByte(EltTy, Elts);
+ } else if (EltTy->isByteTy(16)) {
+ SmallVector<uint16_t, 16> Elts(Record.begin(), Record.end());
+ if (isa<VectorType>(CurTy))
+ V = ConstantDataVector::getByte(EltTy, Elts);
+ else
+ V = ConstantDataArray::getByte(EltTy, Elts);
+ } else if (EltTy->isByteTy(32)) {
+ SmallVector<uint32_t, 16> Elts(Record.begin(), Record.end());
+ if (isa<VectorType>(CurTy))
+ V = ConstantDataVector::getByte(EltTy, Elts);
+ else
+ V = ConstantDataArray::getByte(EltTy, Elts);
+ } else if (EltTy->isByteTy(64)) {
+ SmallVector<uint64_t, 16> Elts(Record.begin(), Record.end());
+ if (isa<VectorType>(CurTy))
+ V = ConstantDataVector::getByte(EltTy, Elts);
+ else
+ V = ConstantDataArray::getByte(EltTy, Elts);
} else if (EltTy->isHalfTy()) {
SmallVector<uint16_t, 16> Elts(Record.begin(), Record.end());
if (isa<VectorType>(CurTy))
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 568874b145c8d..ce7449354ba8e 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -136,6 +136,7 @@ enum {
// CONSTANTS_BLOCK abbrev id's.
CONSTANTS_SETTYPE_ABBREV = bitc::FIRST_APPLICATION_ABBREV,
CONSTANTS_INTEGER_ABBREV,
+ CONSTANTS_BYTE_ABBREV,
CONSTANTS_CE_CAST_Abbrev,
CONSTANTS_NULL_Abbrev,
@@ -1173,6 +1174,11 @@ void ModuleBitcodeWriter::writeTypeTable() {
break;
case Type::X86_AMXTyID: Code = bitc::TYPE_CODE_X86_AMX; break;
case Type::TokenTyID: Code = bitc::TYPE_CODE_TOKEN; break;
+ case Type::ByteTyID:
+ // BYTE: [width]
+ Code = bitc::TYPE_CODE_BYTE;
+ TypeVals.push_back(T->getByteBitWidth());
+ break;
case Type::IntegerTyID:
// INTEGER: [width]
Code = bitc::TYPE_CODE_INTEGER;
@@ -2871,6 +2877,16 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
emitWideAPInt(Record, IV->getValue());
Code = bitc::CST_CODE_WIDE_INTEGER;
}
+ } else if (const ConstantByte *BV = dyn_cast<ConstantByte>(C)) {
+ if (BV->getBitWidth() <= 64) {
+ uint64_t V = BV->getSExtValue();
+ emitSignedInt64(Record, V);
+ Code = bitc::CST_CODE_BYTE;
+ AbbrevToUse = CONSTANTS_BYTE_ABBREV;
+ } else { // Wide bytes, > 64 bits in size.
+ emitWideAPInt(Record, BV->getValue());
+ Code = bitc::CST_CODE_WIDE_BYTE;
+ }
} else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
Code = bitc::CST_CODE_FLOAT;
Type *Ty = CFP->getType()->getScalarType();
@@ -2920,10 +2936,10 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
else if (isCStr7)
AbbrevToUse = CString7Abbrev;
} else if (const ConstantDataSequential *CDS =
- dyn_cast<ConstantDataSequential>(C)) {
+ dyn_cast<ConstantDataSequential>(C)) {
Code = bitc::CST_CODE_DATA;
Type *EltTy = CDS->getElementType();
- if (isa<IntegerType>(EltTy)) {
+ if (isa<IntegerType>(EltTy) || isa<ByteType>(EltTy)) {
for (uint64_t i = 0, e = CDS->getNumElements(); i != e; ++i)
Record.push_back(CDS->getElementAsInteger(i));
} else {
@@ -3968,6 +3984,15 @@ void ModuleBitcodeWriter::writeBlockInfo() {
llvm_unreachable("Unexpected abbrev ordering!");
}
+ { // BYTE abbrev for CONSTANTS_BLOCK.
+ auto Abbv = std::make_shared<BitCodeAbbrev>();
+ Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_BYTE));
+ Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 8));
+ if (Stream.EmitBlockInfoAbbrev(bitc::CONSTANTS_BLOCK_ID, Abbv) !=
+ CONSTANTS_BYTE_ABBREV)
+ llvm_unreachable("Unexpected abbrev ordering!");
+ }
+
{ // CE_CAST abbrev for CONSTANTS_BLOCK.
auto Abbv = std::make_shared<BitCodeAbbrev>();
Abbv->Add(BitCodeAbbrevOp(bitc::CST_CODE_CE_CAST));
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 40a80576ba86b..d3a9df4fd9817 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -3802,6 +3802,9 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV,
if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV))
return MCConstantExpr::create(CI->getZExtValue(), Ctx);
+ if (const ConstantByte *CB = dyn_cast<ConstantByte>(CV))
+ return MCConstantExpr::create(CB->getZExtValue(), Ctx);
+
if (const ConstantPtrAuth *CPA = dyn_cast<ConstantPtrAuth>(CV))
return lowerConstantPtrAuth(*CPA);
@@ -4057,7 +4060,8 @@ static void emitGlobalConstantDataSequential(
// Otherwise, emit the values in successive locations.
uint64_t ElementByteSize = CDS->getElementByteSize();
- if (isa<IntegerType>(CDS->getElementType())) {
+ if (isa<IntegerType>(CDS->getElementType()) ||
+ isa<ByteType>(CDS->getElementType())) {
for (uint64_t I = 0, E = CDS->getNumElements(); I != E; ++I) {
emitGlobalAliasInline(AP, ElementByteSize * I, AliasList);
if (AP.isVerbose())
@@ -4221,13 +4225,15 @@ static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
emitGlobalConstantFP(CFP->getValueAPF(), CFP->getType(), AP);
}
-static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
+static void emitGlobalConstantLargeAPInt(const APInt &Val,
+ uint64_t TypeStoreSize,
+ AsmPrinter &AP) {
const DataLayout &DL = AP.getDataLayout();
- unsigned BitWidth = CI->getBitWidth();
+ unsigned BitWidth = Val.getBitWidth();
// Copy the value as we may massage the layout for constants whose bit width
// is not a multiple of 64-bits.
- APInt Realigned(CI->getValue());
+ APInt Realigned(Val);
uint64_t ExtraBits = 0;
unsigned ExtraBitsSize = BitWidth & 63;
@@ -4249,36 +4255,47 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
// ExtraBits 0 1 (BitWidth / 64) - 1
// chu[nk1 chu][nk2 chu] ... [nkN-1 chunkN]
ExtraBitsSize = alignTo(ExtraBitsSize, 8);
- ExtraBits = Realigned.getRawData()[0] &
- (((uint64_t)-1) >> (64 - ExtraBitsSize));
+ ExtraBits =
+ Realigned.getRawData()[0] & (((uint64_t)-1) >> (64 - ExtraBitsSize));
if (BitWidth >= 64)
Realigned.lshrInPlace(ExtraBitsSize);
} else
ExtraBits = Realigned.getRawData()[BitWidth / 64];
}
- // We don't expect assemblers to support integer data directives
+ // We don't expect assemblers to support data directives
// for more than 64 bits, so we emit the data in at most 64-bit
// quantities at a time.
const uint64_t *RawData = Realigned.getRawData();
for (unsigned i = 0, e = BitWidth / 64; i != e; ++i) {
- uint64_t Val = DL.isBigEndian() ? RawData[e - i - 1] : RawData[i];
- AP.OutStreamer->emitIntValue(Val, 8);
+ uint64_t ChunkVal = DL.isBigEndian() ? RawData[e - i - 1] : RawData[i];
+ AP.OutStreamer->emitIntValue(ChunkVal, 8);
}
if (ExtraBitsSize) {
// Emit the extra bits after the 64-bits chunks.
// Emit a directive that fills the expected size.
- uint64_t Size = AP.getDataLayout().getTypeStoreSize(CI->getType());
- Size -= (BitWidth / 64) * 8;
+ uint64_t Size = TypeStoreSize - (BitWidth / 64) * 8;
assert(Size && Size * 8 >= ExtraBitsSize &&
- (ExtraBits & (((uint64_t)-1) >> (64 - ExtraBitsSize)))
- == ExtraBits && "Directive too small for extra bits.");
+ (ExtraBits & (((uint64_t)-1) >> (64 - ExtraBitsSize))) ==
+ ExtraBits &&
+ "Directive too small for extra bits.");
AP.OutStreamer->emitIntValue(ExtraBits, Size);
}
}
+static void emitGlobalConstantLargeByte(const ConstantByte *CB,
+ AsmPrinter &AP) {
+ emitGlobalConstantLargeAPInt(
+ CB->getValue(), AP.getDataLayout().getTypeStoreSize(CB->getType()), AP);
+}
+
+static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
+ emitGlobalConstantLargeAPInt(
+ CI->getValue(), AP.getDataLayout().getTypeStoreSize(CI->getType()), AP);
+}
+
/// Transform a not absolute MCExpr containing a reference to a GOT
/// equivalent global, by a target specific GOT pc relative access to the
/// final symbol.
@@ -4424,6 +4441,27 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV,
return;
}
+ if (const ConstantByte *CB = dyn_cast<ConstantByte>(CV)) {
+ if (isa<VectorType>(CV->getType()))
+ return emitGlobalConstantVector(DL, CV, AP, AliasList);
+
+ const uint64_t StoreSize = DL.getTypeStoreSize(CV->getType());
+ if (StoreSize <= 8) {
+ if (AP.isVerbose())
+ AP.OutStreamer->getCommentOS()
+ << format("0x%" PRIx64 "\n", CB->getZExtValue());
+ AP.OutStreamer->emitIntValue(CB->getZExtValue(), StoreSize);
+ } else {
+ emitGlobalConstantLargeByte(CB, AP);
+ }
+
+ // Emit tail padding if needed
+ if (Size != StoreSize)
+ AP.OutStreamer->emitZeros(Size - StoreSize);
+
+ return;
+ }
+
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
if (isa<VectorType>(CV->getType()))
return emitGlobalConstantVector(DL, CV, AP, AliasList);
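
The extracted helper keeps the original chunking behaviour; here is a
little-endian-only sketch of what it emits, as a list of (value, byte size)
directives (illustrative; it assumes the words are given low-to-high and that
Words has at least ceil(BitWidth/64) entries):

#include <cstdint>
#include <utility>
#include <vector>

// A BitWidth-bit value is emitted as BitWidth/64 full 8-byte directives
// followed by one directive for the leftover bits, sized so the directives
// add up to the type's store size.  (Big-endian targets reverse the chunk
// order and realign the words; see the code above.)
std::vector<std::pair<uint64_t, unsigned>> // (value, size in bytes)
chunkWideValue(const std::vector<uint64_t> &Words, unsigned BitWidth,
               unsigned StoreSize) {
  std::vector<std::pair<uint64_t, unsigned>> Directives;
  unsigned FullChunks = BitWidth / 64;
  for (unsigned I = 0; I != FullChunks; ++I)
    Directives.push_back({Words[I], 8u});
  if (unsigned ExtraBits = BitWidth % 64) {
    uint64_t Extra = Words[FullChunks] & (~0ull >> (64 - ExtraBits));
    Directives.push_back({Extra, StoreSize - FullChunks * 8});
  }
  return Directives;
}

A 96-bit constant with a 12-byte store size becomes one 8-byte directive for
the low word and one 4-byte directive for the top 32 bits.
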
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 8eafb97215883..a325c31faf417 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -160,7 +160,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeTargetPassConfigPass(Registry);
initializeTwoAddressInstructionLegacyPassPass(Registry);
initializeTypePromotionLegacyPass(Registry);
- initializeUnpackMachineBundlesPass(Registry);
+ initializeUnpackMachineBundlesLegacyPass(Registry);
initializeUnreachableBlockElimLegacyPassPass(Registry);
initializeUnreachableMachineBlockElimLegacyPass(Registry);
initializeVirtRegMapWrapperLegacyPass(Registry);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 18dda915409f2..1d6f0f2a908e0 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -313,8 +313,8 @@ class CodeGenPrepare {
const BasicBlockSectionsProfileReader *BBSectionsProfileReader = nullptr;
const TargetLibraryInfo *TLInfo = nullptr;
LoopInfo *LI = nullptr;
- std::unique_ptr<BlockFrequencyInfo> BFI;
- std::unique_ptr<BranchProbabilityInfo> BPI;
+ BlockFrequencyInfo *BFI;
+ BranchProbabilityInfo *BPI;
ProfileSummaryInfo *PSI = nullptr;
/// As we scan instructions optimizing them, this is the next instruction
@@ -384,8 +384,6 @@ class CodeGenPrepare {
InsertedInsts.clear();
PromotedInsts.clear();
FreshBBs.clear();
- BPI.reset();
- BFI.reset();
}
bool run(Function &F, FunctionAnalysisManager &AM);
@@ -498,6 +496,8 @@ class CodeGenPrepareLegacyPass : public FunctionPass {
AU.addRequired<TargetPassConfig>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
AU.addUsedIfAvailable<BasicBlockSectionsProfileReaderWrapperPass>();
}
};
@@ -518,8 +518,8 @@ bool CodeGenPrepareLegacyPass::runOnFunction(Function &F) {
CGP.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
CGP.TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
CGP.LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- CGP.BPI.reset(new BranchProbabilityInfo(F, *CGP.LI));
- CGP.BFI.reset(new BlockFrequencyInfo(F, *CGP.BPI, *CGP.LI));
+ CGP.BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ CGP.BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
CGP.PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
auto BBSPRWP =
getAnalysisIfAvailable<BasicBlockSectionsProfileReaderWrapperPass>();
@@ -565,8 +565,8 @@ bool CodeGenPrepare::run(Function &F, FunctionAnalysisManager &AM) {
TLInfo = &AM.getResult<TargetLibraryAnalysis>(F);
TTI = &AM.getResult<TargetIRAnalysis>(F);
LI = &AM.getResult<LoopAnalysis>(F);
- BPI.reset(new BranchProbabilityInfo(F, *LI));
- BFI.reset(new BlockFrequencyInfo(F, *BPI, *LI));
+ BPI = &AM.getResult<BranchProbabilityAnalysis>(F);
+ BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
BBSectionsProfileReader =
@@ -611,7 +611,7 @@ bool CodeGenPrepare::_run(Function &F) {
// bypassSlowDivision may create new BBs, but we don't want to reapply the
// optimization to those blocks.
BasicBlock *Next = BB->getNextNode();
- if (!llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
+ if (!llvm::shouldOptimizeForSize(BB, PSI, BFI))
EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
BB = Next;
}
@@ -2694,7 +2694,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
// ensure that we can fold all uses of a potential addressing computation
// into their uses. TODO: generalize this to work over profiling data
if (CI->hasFnAttr(Attribute::Cold) &&
- !llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
+ !llvm::shouldOptimizeForSize(BB, PSI, BFI))
for (auto &Arg : CI->args()) {
if (!Arg->getType()->isPointerTy())
continue;
@@ -5893,7 +5893,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
V, AccessTy, AddrSpace, MemoryInst, AddrModeInsts, *TLI, *LI, getDTFn,
*TRI, InsertedInsts, PromotedInsts, TPT, LargeOffsetGEP, OptSize, PSI,
- BFI.get());
+ BFI);
GetElementPtrInst *GEP = LargeOffsetGEP.first;
if (GEP && !NewGEPBases.count(GEP)) {
@@ -7734,7 +7734,7 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
if (TLI->isSelectSupported(SelectKind) &&
(!isFormingBranchFromSelectProfitable(TTI, TLI, SI) ||
- llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get())))
+ llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI)))
return false;
// The DominatorTree needs to be rebuilt by any consumers after this
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp
index ff06864d3f6a9..a702d2aa77c74 100644
--- a/llvm/lib/CodeGen/CommandFlags.cpp
+++ b/llvm/lib/CodeGen/CommandFlags.cpp
@@ -636,13 +636,15 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) {
}
std::string codegen::getCPUStr() {
+ std::string MCPU = getMCPU();
+
// If user asked for the 'native' CPU, autodetect here. If autodetection fails,
// this will set the CPU to an empty string which tells the target to
// pick a basic default.
- if (getMCPU() == "native")
+ if (MCPU == "native")
return std::string(sys::getHostCPUName());
- return getMCPU();
+ return MCPU;
}
std::string codegen::getFeaturesStr() {
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index c62cd50f4fd98..fdd8f490eb52f 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -461,7 +461,7 @@ class MIParser {
std::optional<unsigned> &TiedDefIdx,
bool IsDef = false);
bool parseImmediateOperand(MachineOperand &Dest);
- bool parseInlineAsmOperand(MachineOperand &Dest);
+ bool parseSymbolicInlineAsmOperand(unsigned OpIdx, MachineOperand &Dest);
bool parseIRConstant(StringRef::iterator Loc, StringRef StringValue,
const Constant *&C);
bool parseIRConstant(StringRef::iterator Loc, const Constant *&C);
@@ -1903,9 +1903,42 @@ bool MIParser::parseImmediateOperand(MachineOperand &Dest) {
return false;
}
-bool MIParser::parseInlineAsmOperand(MachineOperand &Dest) {
+bool MIParser::parseSymbolicInlineAsmOperand(unsigned OpIdx,
+ MachineOperand &Dest) {
+ assert(OpIdx >= InlineAsm::MIOp_ExtraInfo);
+ assert(Token.is(MIToken::Identifier) &&
+ "expected symbolic inline asm operand");
+
+ // Parse ExtraInfo flags.
+ if (OpIdx == InlineAsm::MIOp_ExtraInfo) {
+ unsigned ExtraInfo = 0;
+ for (;;) {
+ if (Token.isNot(MIToken::Identifier))
+ break;
+
+ StringRef FlagName = Token.stringValue();
+ unsigned Flag = StringSwitch<unsigned>(FlagName)
+ .Case("sideeffect", InlineAsm::Extra_HasSideEffects)
+ .Case("mayload", InlineAsm::Extra_MayLoad)
+ .Case("maystore", InlineAsm::Extra_MayStore)
+ .Case("isconvergent", InlineAsm::Extra_IsConvergent)
+ .Case("alignstack", InlineAsm::Extra_IsAlignStack)
+ .Case("unwind", InlineAsm::Extra_MayUnwind)
+ .Case("attdialect", 0)
+ .Case("inteldialect", InlineAsm::Extra_AsmDialect)
+ .Default(~0u);
+ if (Flag == ~0u)
+ return error("unknown inline asm extra info flag '" + FlagName + "'");
+
+ ExtraInfo |= Flag;
+ lex();
+ }
+
+ Dest = MachineOperand::CreateImm(ExtraInfo);
+ return false;
+ }
+
// Parse symbolic form: kind[:constraint].
- assert(Token.is(MIToken::Identifier) && "expected inline asm operand kind");
StringRef KindStr = Token.stringValue();
constexpr auto InvalidKind = static_cast<InlineAsm::Kind>(0);
InlineAsm::Kind K =
@@ -1917,7 +1950,8 @@ bool MIParser::parseInlineAsmOperand(MachineOperand &Dest) {
.Case("imm", InlineAsm::Kind::Imm)
.Case("mem", InlineAsm::Kind::Mem)
.Default(InvalidKind);
- assert(K != InvalidKind && "unknown inline asm operand kind");
+ if (K == InvalidKind)
+ return error("unknown inline asm operand kind '" + KindStr + "'");
lex();
@@ -3172,14 +3206,12 @@ bool MIParser::parseMachineOperand(const unsigned OpCode, const unsigned OpIdx,
case MIToken::Error:
return true;
case MIToken::Identifier: {
+ bool IsInlineAsm = OpCode == TargetOpcode::INLINEASM ||
+ OpCode == TargetOpcode::INLINEASM_BR;
+ if (IsInlineAsm)
+ return parseSymbolicInlineAsmOperand(OpIdx, Dest);
+
StringRef Id = Token.stringValue();
- bool IsInlineAsmOperand = (OpCode == TargetOpcode::INLINEASM ||
- OpCode == TargetOpcode::INLINEASM_BR) &&
- OpIdx >= InlineAsm::MIOp_FirstOperand;
- if (IsInlineAsmOperand &&
- (Id == "regdef" || Id == "reguse" || Id == "regdef-ec" ||
- Id == "clobber" || Id == "imm" || Id == "mem"))
- return parseInlineAsmOperand(Dest);
if (const auto *RegMask = PFS.Target.getRegMask(Id)) {
Dest = MachineOperand::CreateRegMask(RegMask);
lex();
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index 9de35f3309a8a..250b40863c9f3 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -975,6 +975,12 @@ static void printMIOperand(raw_ostream &OS, MFPrintState &State,
break;
}
if (PrintSymbolicInlineAsmOps && MI.isInlineAsm()) {
+ if (OpIdx == InlineAsm::MIOp_ExtraInfo) {
+ unsigned ExtraInfo = Op.getImm();
+ interleave(InlineAsm::getExtraInfoNames(ExtraInfo), OS, " ");
+ break;
+ }
+
int FlagIdx = MI.findInlineAsmFlagIdx(OpIdx);
if (FlagIdx >= 0 && (unsigned)FlagIdx == OpIdx) {
InlineAsm::Flag F(Op.getImm());
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index b36d950438fa5..1042856454adf 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -927,7 +927,7 @@ bool MachineInstr::isStackAligningInlineAsm() const {
InlineAsm::AsmDialect MachineInstr::getInlineAsmDialect() const {
assert(isInlineAsm() && "getInlineAsmDialect() only works for inline asms!");
unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
- return InlineAsm::AsmDialect((ExtraInfo & InlineAsm::Extra_AsmDialect) != 0);
+ return InlineAsm::getDialect(ExtraInfo);
}
int MachineInstr::findInlineAsmFlagIdx(unsigned OpIdx,
diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp
index c9f01e30483bf..663b755193695 100644
--- a/llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -10,40 +10,24 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysisManager.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include <utility>
using namespace llvm;
-namespace {
- class UnpackMachineBundles : public MachineFunctionPass {
- public:
- static char ID; // Pass identification
- UnpackMachineBundles(
- std::function<bool(const MachineFunction &)> Ftor = nullptr)
- : MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) {}
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- private:
- std::function<bool(const MachineFunction &)> PredicateFtor;
- };
-} // end anonymous namespace
-
-char UnpackMachineBundles::ID = 0;
-char &llvm::UnpackMachineBundlesID = UnpackMachineBundles::ID;
-INITIALIZE_PASS(UnpackMachineBundles, "unpack-mi-bundles",
- "Unpack machine instruction bundles", false, false)
-
-bool UnpackMachineBundles::runOnMachineFunction(MachineFunction &MF) {
- if (PredicateFtor && !PredicateFtor(MF))
+static bool unpackBundles(MachineFunction &MF,
+ std::function<bool(const MachineFunction &)> Ftor) {
+ if (Ftor && !Ftor(MF))
return false;
bool Changed = false;
@@ -75,10 +59,42 @@ bool UnpackMachineBundles::runOnMachineFunction(MachineFunction &MF) {
return Changed;
}
-FunctionPass *
-llvm::createUnpackMachineBundles(
+namespace {
+
+class UnpackMachineBundlesLegacy : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification
+ UnpackMachineBundlesLegacy(
+ std::function<bool(const MachineFunction &)> Ftor = nullptr)
+ : MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ std::function<bool(const MachineFunction &)> PredicateFtor;
+};
+} // end anonymous namespace
+
+PreservedAnalyses
+UnpackMachineBundlesPass::run(MachineFunction &MF,
+ MachineFunctionAnalysisManager &MFAM) {
+ if (unpackBundles(MF, PredicateFtor))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+char UnpackMachineBundlesLegacy::ID = 0;
+char &llvm::UnpackMachineBundlesID = UnpackMachineBundlesLegacy::ID;
+INITIALIZE_PASS(UnpackMachineBundlesLegacy, "unpack-mi-bundles",
+ "Unpack machine instruction bundles", false, false)
+
+bool UnpackMachineBundlesLegacy::runOnMachineFunction(MachineFunction &MF) {
+ return unpackBundles(MF, PredicateFtor);
+}
+
+FunctionPass *llvm::createUnpackMachineBundlesLegacy(
std::function<bool(const MachineFunction &)> Ftor) {
- return new UnpackMachineBundles(std::move(Ftor));
+ return new UnpackMachineBundlesLegacy(std::move(Ftor));
}
/// Return the first DebugLoc that has line number information, given a
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 781001d050f3d..0f4503ae27998 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5478,16 +5478,6 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
AddToWorklist(Add.getNode());
return DAG.getNode(ISD::AND, DL, VT, N0, Add);
}
- // fold (urem x, (lshr pow2, y)) -> (and x, (add (lshr pow2, y), -1))
- // TODO: We should sink the following into isKnownToBePowerOfTwo
- // using a OrZero parameter analogous to our handling in ValueTracking.
- if (N1.getOpcode() == ISD::SRL &&
- DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) {
- SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
- SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
- AddToWorklist(Add.getNode());
- return DAG.getNode(ISD::AND, DL, VT, N0, Add);
- }
}
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 08606c99097ae..5e54343f7f146 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1267,6 +1267,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
Action = TLI.getOperationAction(
Node->getOpcode(), Node->getOperand(1).getValueType());
break;
+ case ISD::CTTZ_ELTS:
+ case ISD::CTTZ_ELTS_ZERO_POISON:
case ISD::VP_CTTZ_ELTS:
case ISD::VP_CTTZ_ELTS_ZERO_UNDEF:
Action = TLI.getOperationAction(Node->getOpcode(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index a7d4b5508000a..4a27f804d6720 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -78,6 +78,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::VP_CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::CTTZ: Res = PromoteIntRes_CTTZ(N); break;
+ case ISD::CTTZ_ELTS_ZERO_POISON:
+ case ISD::CTTZ_ELTS:
case ISD::VP_CTTZ_ELTS_ZERO_UNDEF:
case ISD::VP_CTTZ_ELTS:
Res = PromoteIntRes_VP_CttzElements(N);
@@ -3240,6 +3242,11 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::READ_REGISTER:
ExpandIntRes_READ_REGISTER(N, Lo, Hi);
break;
+
+ case ISD::CTTZ_ELTS:
+ case ISD::CTTZ_ELTS_ZERO_POISON:
+ ExpandIntRes_CTTZ_ELTS(N, Lo, Hi);
+ break;
}
// If Lo/Hi is null, the sub-method took care of registering results etc.
@@ -5559,6 +5566,20 @@ void DAGTypeLegalizer::ExpandIntRes_READ_REGISTER(SDNode *N, SDValue &Lo,
Hi = DAG.getPOISON(HiVT);
}
+void DAGTypeLegalizer::ExpandIntRes_CTTZ_ELTS(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ // Assume that the maximum number of vector elements fits in getVectorIdxTy
+ // and expand to that.
+ EVT VT = N->getSimpleValueType(0);
+ EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ assert(IdxVT.bitsLT(VT) &&
+ "VectorIdxTy should be smaller than type to be expanded?");
+
+ SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N), IdxVT, N->getOperand(0));
+ Res = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Res);
+ SplitInteger(Res, Lo, Hi);
+}
+
//===----------------------------------------------------------------------===//
// Integer Operand Expansion
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index a8ffb66a9d911..14f361f8bcaed 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -502,6 +502,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void ExpandIntRes_VSCALE (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_READ_REGISTER(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_CTTZ_ELTS(SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandShiftByConstant(SDNode *N, const APInt &Amt,
SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 1c5b2d00fe83c..8e06325c3a8d5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -4743,12 +4743,12 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val,
case ISD::SRL: {
// A logical right-shift of a constant sign-bit will have exactly
// one bit set.
- auto *C = isConstOrConstSplat(Val.getOperand(0));
+ auto *C = isConstOrConstSplat(Val.getOperand(0), DemandedElts);
if (C && C->getAPIntValue().isSignMask())
return true;
- return isKnownToBeAPowerOfTwo(Val.getOperand(0), /*OrZero=*/false,
- Depth + 1) &&
- isKnownNeverZero(Val, Depth);
+ return (OrZero || isKnownNeverZero(Val, DemandedElts, Depth)) &&
+ isKnownToBeAPowerOfTwo(Val.getOperand(0), DemandedElts, OrZero,
+ Depth + 1);
}
case ISD::ROTL:
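
Two small facts make the rewritten SRL case (and the DAGCombiner fold deleted
further up) sound, and both are easy to check exhaustively for small widths
(standalone check, not part of the patch):

#include <cassert>
#include <cstdint>

// (v & (v - 1)) == 0 holds exactly for zero and the powers of two.
bool isPow2OrZero(uint64_t V) { return (V & (V - 1)) == 0; }

int main() {
  // Shifting a power of two right keeps it a power of two or turns it into
  // zero, which is why the SRL case only needs the OrZero / never-zero split.
  for (unsigned Bit = 0; Bit < 64; ++Bit)
    for (unsigned Sh = 0; Sh < 64; ++Sh)
      assert(isPow2OrZero((uint64_t(1) << Bit) >> Sh));

  // And for a non-zero power of two d, x urem d == x & (d - 1), the identity
  // the (now generalized) urem-by-shifted-power-of-two fold relies on.
  for (uint64_t X = 0; X < 1024; ++X)
    for (unsigned Bit = 0; Bit < 10; ++Bit) {
      uint64_t D = uint64_t(1) << Bit;
      assert(X % D == (X & (D - 1)));
    }
  return 0;
}
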
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 0a1fb2aed6ca3..eb55a68eaba84 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1865,6 +1865,9 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
return DAG.getConstant(*CI, DL, VT);
}
+ if (const ConstantByte *CB = dyn_cast<ConstantByte>(C))
+ return DAG.getConstant(CB->getValue(), getCurSDLoc(), VT);
+
if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
return DAG.getGlobalAddress(GV, getCurSDLoc(), VT);
@@ -8318,9 +8321,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
auto DL = getCurSDLoc();
SDValue Op = getValue(I.getOperand(0));
EVT OpVT = Op.getValueType();
+ EVT RetTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ bool ZeroIsPoison =
+ !cast<ConstantSDNode>(getValue(I.getOperand(1)))->isZero();
if (!TLI.shouldExpandCttzElements(OpVT)) {
- visitTargetIntrinsic(I, Intrinsic);
+ SDValue Ret = DAG.getNode(ZeroIsPoison ? ISD::CTTZ_ELTS_ZERO_POISON
+ : ISD::CTTZ_ELTS,
+ sdl, RetTy, Op);
+ setValue(&I, Ret);
return;
}
@@ -8334,8 +8343,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
// If the zero-is-poison flag is set, we can assume the upper limit
// of the result is VF-1.
- bool ZeroIsPoison =
- !cast<ConstantSDNode>(getValue(I.getOperand(1)))->isZero();
ConstantRange VScaleRange(1, true); // Dummy value.
if (isa<ScalableVectorType>(I.getOperand(0)->getType()))
VScaleRange = getVScaleRange(I.getCaller(), 64);
@@ -8359,7 +8366,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
SDValue Max = DAG.getNode(ISD::VECREDUCE_UMAX, DL, NewEltTy, And);
SDValue Sub = DAG.getNode(ISD::SUB, DL, NewEltTy, VL, Max);
- EVT RetTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
SDValue Ret = DAG.getZExtOrTrunc(Sub, DL, RetTy);
setValue(&I, Ret);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 571830cc57b52..7161dd299f830 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -591,6 +591,11 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::EXPERIMENTAL_VECTOR_HISTOGRAM:
return "histogram";
+ case ISD::CTTZ_ELTS:
+ return "cttz_elts";
+ case ISD::CTTZ_ELTS_ZERO_POISON:
+ return "cttz_elts_zero_poison";
+
case ISD::VECTOR_FIND_LAST_ACTIVE:
return "find_last_active";
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 5748ef89aef4e..aa76fae674774 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8201,13 +8201,17 @@ bool TargetLowering::expandDIVREMByConstant(SDNode *N,
DAG.getConstant(Mask, dl, HiLoVT));
}
- LL = DAG.getNode(
- ISD::OR, dl, HiLoVT,
- DAG.getNode(ISD::SRL, dl, HiLoVT, LL,
- DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)),
- DAG.getNode(ISD::SHL, dl, HiLoVT, LH,
- DAG.getShiftAmountConstant(HBitWidth - TrailingZeros,
- HiLoVT, dl)));
+ if (isOperationLegal(ISD::FSHR, HiLoVT))
+ LL = DAG.getNode(ISD::FSHR, dl, HiLoVT, LH, LL,
+ DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
+ else
+ LL = DAG.getNode(
+ ISD::OR, dl, HiLoVT,
+ DAG.getNode(ISD::SRL, dl, HiLoVT, LL,
+ DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl)),
+ DAG.getNode(ISD::SHL, dl, HiLoVT, LH,
+ DAG.getShiftAmountConstant(HBitWidth - TrailingZeros,
+ HiLoVT, dl)));
LH = DAG.getNode(ISD::SRL, dl, HiLoVT, LH,
DAG.getShiftAmountConstant(TrailingZeros, HiLoVT, dl));
}
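
The FSHR node computes exactly what the OR(SRL, SHL) pair it replaces
computes, whenever the shift amount is strictly between 0 and the element
width; a standalone check of the identity (illustrative only):

#include <cassert>
#include <cstdint>

// Reference funnel-shift-right: shift the 64-bit concatenation Hi:Lo right by
// S (with 0 < S < 32 here) and keep the low 32 bits.
uint32_t fshr32(uint32_t Hi, uint32_t Lo, unsigned S) {
  return (uint32_t)((((uint64_t)Hi << 32) | Lo) >> S);
}

int main() {
  uint32_t Hi = 0x12345678, Lo = 0x9abcdef0;
  for (unsigned S = 1; S < 32; ++S)
    assert(fshr32(Hi, Lo, S) == ((Lo >> S) | (Hi << (32 - S))));
  return 0;
}
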
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 9d1732ea5aacc..b6d5a4c22e133 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1231,6 +1231,10 @@ void TargetLoweringBase::initActions() {
// Only some target support this vector operation. Most need to expand it.
setOperationAction(ISD::VECTOR_COMPRESS, VT, Expand);
+ // cttz.elts defaults to expand.
+ setOperationAction({ISD::CTTZ_ELTS, ISD::CTTZ_ELTS_ZERO_POISON}, VT,
+ Expand);
+
// VP operations default to expand.
#define BEGIN_REGISTER_VP_SDNODE(SDOPC, ...) \
setOperationAction(ISD::SDOPC, VT, Expand);
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index 0743c92c5b95b..e74068e22f4cd 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -254,6 +254,8 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){
llvm_unreachable("Unknown type!");
case Type::VoidTyID:
return MVT::isVoid;
+ case Type::ByteTyID:
+ return getIntegerVT(cast<ByteType>(Ty)->getBitWidth());
case Type::IntegerTyID:
return getIntegerVT(cast<IntegerType>(Ty)->getBitWidth());
case Type::HalfTyID: return MVT(MVT::f16);
@@ -304,6 +306,8 @@ EVT EVT::getEVT(Type *Ty, bool HandleUnknown){
return MVT::getVT(Ty, HandleUnknown);
case Type::TokenTyID:
return MVT::Untyped;
+ case Type::ByteTyID:
+ return getIntegerVT(Ty->getContext(), cast<ByteType>(Ty)->getBitWidth());
case Type::IntegerTyID:
return getIntegerVT(Ty->getContext(), cast<IntegerType>(Ty)->getBitWidth());
case Type::FixedVectorTyID:
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index 38e7ea3203afd..b5637f81e58db 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -348,7 +348,7 @@ unsigned DWARFVerifier::verifyDebugInfoCallSite(const DWARFDie &Die) {
return 0;
DWARFDie Curr = Die.getParent();
- for (; Curr.isValid() && !Curr.isSubprogramDIE(); Curr = Die.getParent()) {
+ for (; Curr.isValid() && !Curr.isSubprogramDIE(); Curr = Curr.getParent()) {
if (Curr.getTag() == DW_TAG_inlined_subroutine) {
ErrorCategory.Report(
"Call site nested entry within inlined subroutine", [&]() {
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 4cbf0f9572be5..8148e113195cc 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2431,11 +2431,18 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
return Builder.saveIP();
}
+llvm::StructType *OpenMPIRBuilder::getKmpTaskAffinityInfoTy() {
+ llvm::Type *IntPtrTy = llvm::Type::getIntNTy(
+ M.getContext(), M.getDataLayout().getPointerSizeInBits());
+ return llvm::StructType::get(IntPtrTy, IntPtrTy,
+ llvm::Type::getInt32Ty(M.getContext()));
+}
+
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
const LocationDescription &Loc, InsertPointTy AllocaIP,
BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
- SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
- Value *Priority) {
+ SmallVector<DependData> Dependencies, AffinityData Affinities,
+ bool Mergeable, Value *EventHandle, Value *Priority) {
if (!updateToLocation(Loc))
return InsertPointTy();
@@ -2481,8 +2488,8 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
- Mergeable, Priority, EventHandle, TaskAllocaBB,
- ToBeDeleted](Function &OutlinedFn) mutable {
+ Affinities, Mergeable, Priority, EventHandle,
+ TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
// Replace the Stale CI by appropriate RTL function call.
assert(OutlinedFn.hasOneUse() &&
"there must be a single user for the outlined function");
@@ -2555,6 +2562,14 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
/*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
/*task_func=*/&OutlinedFn});
+ if (Affinities.Count && Affinities.Info) {
+ Function *RegAffFn = getOrCreateRuntimeFunctionPtr(
+ OMPRTL___kmpc_omp_reg_task_with_affinity);
+
+ createRuntimeFunctionCall(RegAffFn, {Ident, ThreadID, TaskData,
+ Affinities.Count, Affinities.Info});
+ }
+
// Emit detach clause initialization.
// evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
// task_descriptor);
@@ -5585,11 +5600,19 @@ OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
Constant *One = ConstantInt::get(InternalIVTy, 1);
Function *F = CLI->getFunction();
+ // Blocks must have terminators.
+ // FIXME: Don't run analyses on incomplete/invalid IR.
+ SmallVector<Instruction *> UIs;
+ for (BasicBlock &BB : *F)
+ if (!BB.getTerminator())
+ UIs.push_back(new UnreachableInst(F->getContext(), &BB));
FunctionAnalysisManager FAM;
FAM.registerPass([]() { return DominatorTreeAnalysis(); });
FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
LoopAnalysis LIA;
LoopInfo &&LI = LIA.run(*F, FAM);
+ for (Instruction *I : UIs)
+ I->eraseFromParent();
Loop *L = LI.getLoopFor(CLI->getHeader());
SmallVector<Metadata *> LoopMDList;
if (ChunkSize || DistScheduleChunkSize)
@@ -6874,6 +6897,13 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
Function *F = CanonicalLoop->getFunction();
+ // Blocks must have terminators.
+ // FIXME: Don't run analyses on incomplete/invalid IR.
+ SmallVector<Instruction *> UIs;
+ for (BasicBlock &BB : *F)
+ if (!BB.getTerminator())
+ UIs.push_back(new UnreachableInst(F->getContext(), &BB));
+
// TODO: We should not rely on pass manager. Currently we use pass manager
// only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
// object. We should have a method which returns all blocks between
@@ -6886,6 +6916,9 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
LoopAnalysis LIA;
LoopInfo &&LI = LIA.run(*F, FAM);
+ for (Instruction *I : UIs)
+ I->eraseFromParent();
+
Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
if (AlignedVars.size()) {
InsertPointTy IP = Builder.saveIP();
@@ -7003,6 +7036,13 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
+ // Blocks must have terminators.
+ // FIXME: Don't run analyses on incomplete/invalid IR.
+ SmallVector<Instruction *> UIs;
+ for (BasicBlock &BB : *F)
+ if (!BB.getTerminator())
+ UIs.push_back(new UnreachableInst(F->getContext(), &BB));
+
FunctionAnalysisManager FAM;
FAM.registerPass([]() { return TargetLibraryAnalysis(); });
FAM.registerPass([]() { return AssumptionAnalysis(); });
@@ -7027,6 +7067,9 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
AssumptionCache &&AC = ACT.run(*F, FAM);
OptimizationRemarkEmitter ORE{F};
+ for (Instruction *I : UIs)
+ I->eraseFromParent();
+
Loop *L = LI.getLoopFor(CLI->getHeader());
assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
@@ -11545,6 +11588,279 @@ void OpenMPIRBuilder::loadOffloadInfoMetadata(vfs::FileSystem &VFS,
loadOffloadInfoMetadata(*M.get());
}
+OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createIteratorLoop(
+ LocationDescription Loc, llvm::Value *TripCount, IteratorBodyGenTy BodyGen,
+ llvm::StringRef Name) {
+ Builder.restoreIP(Loc.IP);
+
+ BasicBlock *CurBB = Builder.GetInsertBlock();
+ assert(CurBB &&
+ "expected a valid insertion block for creating an iterator loop");
+ Function *F = CurBB->getParent();
+
+ InsertPointTy SplitIP = Builder.saveIP();
+ if (SplitIP.getPoint() == CurBB->end())
+ if (Instruction *Terminator = CurBB->getTerminator())
+ SplitIP = InsertPointTy(CurBB, Terminator->getIterator());
+
+ BasicBlock *ContBB =
+ splitBB(SplitIP, /*CreateBranch=*/false,
+ Builder.getCurrentDebugLocation(), "omp.it.cont");
+
+ CanonicalLoopInfo *CLI =
+ createLoopSkeleton(Builder.getCurrentDebugLocation(), TripCount, F,
+ /*PreInsertBefore=*/ContBB,
+ /*PostInsertBefore=*/ContBB, Name);
+
+ // Enter loop from original block.
+ redirectTo(CurBB, CLI->getPreheader(), Builder.getCurrentDebugLocation());
+
+ // Remove the unconditional branch inserted by createLoopSkeleton in the body
+ if (Instruction *T = CLI->getBody()->getTerminator())
+ T->eraseFromParent();
+
+ InsertPointTy BodyIP = CLI->getBodyIP();
+ if (llvm::Error Err = BodyGen(BodyIP, CLI->getIndVar()))
+ return Err;
+
+ // Body must either fall through to the latch or branch directly to it.
+ if (Instruction *BodyTerminator = CLI->getBody()->getTerminator()) {
+ auto *BodyBr = dyn_cast<BranchInst>(BodyTerminator);
+ if (!BodyBr || !BodyBr->isUnconditional() ||
+ BodyBr->getSuccessor(0) != CLI->getLatch()) {
+ return make_error<StringError>(
+ "iterator bodygen must terminate the canonical body with an "
+ "unconditional branch to the loop latch",
+ inconvertibleErrorCode());
+ }
+ } else {
+ // Ensure we end the loop body by jumping to the latch.
+ Builder.SetInsertPoint(CLI->getBody());
+ Builder.CreateBr(CLI->getLatch());
+ }
+
+ // Link After -> ContBB
+ Builder.SetInsertPoint(CLI->getAfter(), CLI->getAfter()->begin());
+ if (!CLI->getAfter()->getTerminator())
+ Builder.CreateBr(ContBB);
+
+ return InsertPointTy{ContBB, ContBB->begin()};
+}
+
+/// Mangle the parameter part of the vector function name according to
+/// their OpenMP classification. The mangling function is defined in
+/// section 4.5 of the AAVFABI(2021Q1).
+static std::string mangleVectorParameters(
+ ArrayRef<llvm::OpenMPIRBuilder::DeclareSimdAttrTy> ParamAttrs) {
+ SmallString<256> Buffer;
+ llvm::raw_svector_ostream Out(Buffer);
+ for (const auto &ParamAttr : ParamAttrs) {
+ switch (ParamAttr.Kind) {
+ case llvm::OpenMPIRBuilder::DeclareSimdKindTy::Linear:
+ Out << 'l';
+ break;
+ case llvm::OpenMPIRBuilder::DeclareSimdKindTy::LinearRef:
+ Out << 'R';
+ break;
+ case llvm::OpenMPIRBuilder::DeclareSimdKindTy::LinearUVal:
+ Out << 'U';
+ break;
+ case llvm::OpenMPIRBuilder::DeclareSimdKindTy::LinearVal:
+ Out << 'L';
+ break;
+ case llvm::OpenMPIRBuilder::DeclareSimdKindTy::Uniform:
+ Out << 'u';
+ break;
+ case llvm::OpenMPIRBuilder::DeclareSimdKindTy::Vector:
+ Out << 'v';
+ break;
+ }
+ if (ParamAttr.HasVarStride)
+ Out << "s" << ParamAttr.StrideOrArg;
+ else if (ParamAttr.Kind ==
+ llvm::OpenMPIRBuilder::DeclareSimdKindTy::Linear ||
+ ParamAttr.Kind ==
+ llvm::OpenMPIRBuilder::DeclareSimdKindTy::LinearRef ||
+ ParamAttr.Kind ==
+ llvm::OpenMPIRBuilder::DeclareSimdKindTy::LinearUVal ||
+ ParamAttr.Kind ==
+ llvm::OpenMPIRBuilder::DeclareSimdKindTy::LinearVal) {
+ // Don't print the step value if it is not present or if it is
+ // equal to 1.
+ if (ParamAttr.StrideOrArg < 0)
+ Out << 'n' << -ParamAttr.StrideOrArg;
+ else if (ParamAttr.StrideOrArg != 1)
+ Out << ParamAttr.StrideOrArg;
+ }
+
+ if (!!ParamAttr.Alignment)
+ Out << 'a' << ParamAttr.Alignment;
+ }
+
+ return std::string(Out.str());
+}
+
+void OpenMPIRBuilder::emitX86DeclareSimdFunction(
+ llvm::Function *Fn, unsigned NumElts, const llvm::APSInt &VLENVal,
+ llvm::ArrayRef<DeclareSimdAttrTy> ParamAttrs, DeclareSimdBranch Branch) {
+ struct ISADataTy {
+ char ISA;
+ unsigned VecRegSize;
+ };
+ ISADataTy ISAData[] = {
+ {'b', 128}, // SSE
+ {'c', 256}, // AVX
+ {'d', 256}, // AVX2
+ {'e', 512}, // AVX512
+ };
+ llvm::SmallVector<char, 2> Masked;
+ switch (Branch) {
+ case DeclareSimdBranch::Undefined:
+ Masked.push_back('N');
+ Masked.push_back('M');
+ break;
+ case DeclareSimdBranch::Notinbranch:
+ Masked.push_back('N');
+ break;
+ case DeclareSimdBranch::Inbranch:
+ Masked.push_back('M');
+ break;
+ }
+ for (char Mask : Masked) {
+ for (const ISADataTy &Data : ISAData) {
+ llvm::SmallString<256> Buffer;
+ llvm::raw_svector_ostream Out(Buffer);
+ Out << "_ZGV" << Data.ISA << Mask;
+ if (!VLENVal) {
+ assert(NumElts && "Non-zero simdlen/cdtsize expected");
+ Out << llvm::APSInt::getUnsigned(Data.VecRegSize / NumElts);
+ } else {
+ Out << VLENVal;
+ }
+ Out << mangleVectorParameters(ParamAttrs);
+ Out << '_' << Fn->getName();
+ Fn->addFnAttr(Out.str());
+ }
+ }
+}
+
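For a concrete picture of what gets attached, consider a hypothetical function foo with a single vector parameter, an explicit simdlen of 4, and notinbranch (so only the unmasked 'N' variants are emitted); the strings below are illustrative, derived from the loop above rather than taken from a test:

// emitX86DeclareSimdFunction(foo, /*NumElts=*/0, /*VLENVal=*/4,
//                            {Vector}, DeclareSimdBranch::Notinbranch)
// attaches one function attribute per ISA entry:
//   _ZGVbN4v_foo    // 'b': SSE,    128-bit registers
//   _ZGVcN4v_foo    // 'c': AVX,    256-bit registers
//   _ZGVdN4v_foo    // 'd': AVX2,   256-bit registers
//   _ZGVeN4v_foo    // 'e': AVX512, 512-bit registers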
+// Function used to add the attribute. The parameter `VLEN` is templated to
+// allow the use of `x` when targeting scalable functions for SVE.
+template <typename T>
+static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix,
+ char ISA, StringRef ParSeq,
+ StringRef MangledName, bool OutputBecomesInput,
+ llvm::Function *Fn) {
+ SmallString<256> Buffer;
+ llvm::raw_svector_ostream Out(Buffer);
+ Out << Prefix << ISA << LMask << VLEN;
+ if (OutputBecomesInput)
+ Out << 'v';
+ Out << ParSeq << '_' << MangledName;
+ Fn->addFnAttr(Out.str());
+}
+
+// Helper function to generate the Advanced SIMD names depending on the value
+// of the NDS when simdlen is not present.
+static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask,
+ StringRef Prefix, char ISA,
+ StringRef ParSeq, StringRef MangledName,
+ bool OutputBecomesInput,
+ llvm::Function *Fn) {
+ switch (NDS) {
+ case 8:
+ addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
+ OutputBecomesInput, Fn);
+ addAArch64VectorName(16, Mask, Prefix, ISA, ParSeq, MangledName,
+ OutputBecomesInput, Fn);
+ break;
+ case 16:
+ addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
+ OutputBecomesInput, Fn);
+ addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
+ OutputBecomesInput, Fn);
+ break;
+ case 32:
+ addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
+ OutputBecomesInput, Fn);
+ addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
+ OutputBecomesInput, Fn);
+ break;
+ case 64:
+ case 128:
+ addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
+ OutputBecomesInput, Fn);
+ break;
+ default:
+ llvm_unreachable("Scalar type is too wide.");
+ }
+}
+
+/// Emit vector function attributes for AArch64, as defined in the AAVFABI.
+void OpenMPIRBuilder::emitAArch64DeclareSimdFunction(
+ llvm::Function *Fn, unsigned UserVLEN,
+ llvm::ArrayRef<DeclareSimdAttrTy> ParamAttrs, DeclareSimdBranch Branch,
+ char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput) {
+ assert((ISA == 'n' || ISA == 's') && "Expected ISA either 's' or 'n'.");
+
+ // Sort out parameter sequence.
+ const std::string ParSeq = mangleVectorParameters(ParamAttrs);
+ StringRef Prefix = "_ZGV";
+ StringRef MangledName = Fn->getName();
+
+ // Generate simdlen from user input (if any).
+ if (UserVLEN) {
+ if (ISA == 's') {
+ // SVE generates only a masked function.
+ addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
+ OutputBecomesInput, Fn);
+ return;
+ }
+
+ switch (Branch) {
+ case DeclareSimdBranch::Undefined:
+ addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
+ OutputBecomesInput, Fn);
+ addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
+ OutputBecomesInput, Fn);
+ break;
+ case DeclareSimdBranch::Inbranch:
+ addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
+ OutputBecomesInput, Fn);
+ break;
+ case DeclareSimdBranch::Notinbranch:
+ addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
+ OutputBecomesInput, Fn);
+ break;
+ }
+ return;
+ }
+
+ if (ISA == 's') {
+ // SVE, section 3.4.1, item 1.
+ addAArch64VectorName("x", "M", Prefix, ISA, ParSeq, MangledName,
+ OutputBecomesInput, Fn);
+ return;
+ }
+
+ switch (Branch) {
+ case DeclareSimdBranch::Undefined:
+ addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
+ MangledName, OutputBecomesInput, Fn);
+ addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
+ MangledName, OutputBecomesInput, Fn);
+ break;
+ case DeclareSimdBranch::Inbranch:
+ addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
+ MangledName, OutputBecomesInput, Fn);
+ break;
+ case DeclareSimdBranch::Notinbranch:
+ addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
+ MangledName, OutputBecomesInput, Fn);
+ break;
+ }
+}
+
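Without a user simdlen, the Advanced SIMD path fans out over the NDS-derived vector lengths, while SVE always emits a single masked, scalable variant. An illustrative expectation for a hypothetical function foo with one vector parameter and NarrowestDataSize == 32:

// ISA 'n' (Advanced SIMD), Notinbranch, NDS == 32:
//   _ZGVnN2v_foo    // 2 x 32-bit lanes (64-bit grouping)
//   _ZGVnN4v_foo    // 4 x 32-bit lanes (128-bit grouping)
// ISA 's' (SVE), no simdlen, any branch setting:
//   _ZGVsMxv_foo    // masked, vector-length-agnostic ('x')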
//===----------------------------------------------------------------------===//
// OffloadEntriesInfoManager
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/FuzzMutate/RandomIRBuilder.cpp b/llvm/lib/FuzzMutate/RandomIRBuilder.cpp
index 4e4732ae08c1b..b5946b98bb12b 100644
--- a/llvm/lib/FuzzMutate/RandomIRBuilder.cpp
+++ b/llvm/lib/FuzzMutate/RandomIRBuilder.cpp
@@ -21,11 +21,23 @@
using namespace llvm;
using namespace fuzzerop;
+static DominatorTree getDomTree(Function &F) {
+ // Dominator tree construction requires that all blocks have terminators.
+ SmallVector<Instruction *> AddedInsts;
+ for (BasicBlock &BB : F)
+ if (!BB.getTerminator())
+ AddedInsts.push_back(new UnreachableInst(F.getContext(), &BB));
+ DominatorTree DT(F);
+ for (Instruction *I : AddedInsts)
+ I->eraseFromParent();
+ return DT;
+}
+
/// Return a vector of Blocks that dominates this block, excluding current
/// block.
static std::vector<BasicBlock *> getDominators(BasicBlock *BB) {
std::vector<BasicBlock *> ret;
- DominatorTree DT(*BB->getParent());
+ DominatorTree DT = getDomTree(*BB->getParent());
DomTreeNode *Node = DT.getNode(BB);
// It's possible that an orphan block is not in the dom tree. In that case we
// just return nothing.
@@ -43,7 +55,7 @@ static std::vector<BasicBlock *> getDominators(BasicBlock *BB) {
/// Return a vector of Blocks that is dominated by this block, excluding current
/// block
static std::vector<BasicBlock *> getDominatees(BasicBlock *BB) {
- DominatorTree DT(*BB->getParent());
+ DominatorTree DT = getDomTree(*BB->getParent());
std::vector<BasicBlock *> ret;
DomTreeNode *Parent = DT.getNode(BB);
// It's possible that an orphan block is not in the dom tree. In that case we
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 2147488d381f4..7bff1e307237c 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -669,6 +669,9 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) {
return;
case Type::X86_AMXTyID: OS << "x86_amx"; return;
case Type::TokenTyID: OS << "token"; return;
+ case Type::ByteTyID:
+ OS << 'b' << Ty->getByteBitWidth();
+ return;
case Type::IntegerTyID:
OS << 'i' << cast<IntegerType>(Ty)->getBitWidth();
return;
@@ -1648,6 +1651,23 @@ static void writeConstantInternal(raw_ostream &Out, const Constant *CV,
return;
}
+ if (const auto *CB = dyn_cast<ConstantByte>(CV)) {
+ Type *Ty = CB->getType();
+
+ if (Ty->isVectorTy()) {
+ Out << "splat (";
+ WriterCtx.TypePrinter->print(Ty->getScalarType(), Out);
+ Out << " ";
+ }
+
+ Out << CB->getValue();
+
+ if (Ty->isVectorTy())
+ Out << ")";
+
+ return;
+ }
+
if (const auto *CFP = dyn_cast<ConstantFP>(CV)) {
Type *Ty = CFP->getType();
@@ -1773,7 +1793,8 @@ static void writeConstantInternal(raw_ostream &Out, const Constant *CV,
// TODO: Remove this block when the UseConstant{Int,FP}ForFixedLengthSplat
// options are removed.
if (auto *SplatVal = CV->getSplatValue()) {
- if (isa<ConstantInt>(SplatVal) || isa<ConstantFP>(SplatVal)) {
+ if (isa<ConstantInt>(SplatVal) || isa<ConstantFP>(SplatVal) ||
+ isa<ConstantByte>(SplatVal)) {
Out << "splat (";
writeAsOperandInternal(Out, SplatVal, WriterCtx, /*PrintType=*/true);
Out << ')';
@@ -1820,7 +1841,8 @@ static void writeConstantInternal(raw_ostream &Out, const Constant *CV,
// options are removed.
if (CE->getOpcode() == Instruction::ShuffleVector) {
if (auto *SplatVal = CE->getSplatValue()) {
- if (isa<ConstantInt>(SplatVal) || isa<ConstantFP>(SplatVal)) {
+ if (isa<ConstantInt>(SplatVal) || isa<ConstantFP>(SplatVal) ||
+ isa<ConstantByte>(SplatVal)) {
Out << "splat (";
writeAsOperandInternal(Out, SplatVal, WriterCtx, /*PrintType=*/true);
Out << ')';
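A quick sketch of how these printer changes are expected to render the new constants; it leans on the ByteType/ConstantByte APIs added elsewhere in this patch, and the printed strings are expectations rather than captured output:

LLVMContext Ctx;
Constant *Scalar = ConstantByte::get(Ctx, APInt(8, 65));
Scalar->print(outs());   // expected: b8 65
Constant *Splat =
    ConstantByte::get(Ctx, ElementCount::getFixed(4), APInt(8, 1));
Splat->print(outs());    // expected: <4 x b8> splat (b8 1)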
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index 696bc6fffc035..4a073db5a589e 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -2474,6 +2474,10 @@ AttributeMask AttributeFuncs::typeIncompatible(Type *Ty, AttributeSet AS,
// Attributes that only apply to integers.
if (ASK & ASK_SAFE_TO_DROP)
Incompatible.addAttribute(Attribute::AllocAlign);
+ }
+
+ if (!Ty->isIntegerTy() && !Ty->isByteTy()) {
+ // Attributes that only apply to integers and bytes.
if (ASK & ASK_UNSAFE_TO_DROP)
Incompatible.addAttribute(Attribute::SExt).addAttribute(Attribute::ZExt);
}
diff --git a/llvm/lib/IR/CMakeLists.txt b/llvm/lib/IR/CMakeLists.txt
index 08bac979d4c90..ff362356fe65a 100644
--- a/llvm/lib/IR/CMakeLists.txt
+++ b/llvm/lib/IR/CMakeLists.txt
@@ -55,6 +55,7 @@ add_llvm_component_library(LLVMCore
ModuleSummaryIndex.cpp
Operator.cpp
OptBisect.cpp
+ PatternMatch.cpp
Pass.cpp
PassInstrumentation.cpp
PassManager.cpp
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index 78ac276f4f3da..85b1121dd751e 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -46,7 +46,7 @@ static cl::opt<bool> UseConstantIntForScalableSplat(
"use-constant-int-for-scalable-splat", cl::init(false), cl::Hidden,
cl::desc("Use ConstantInt's native scalable vector splat support."));
static cl::opt<bool> UseConstantFPForScalableSplat(
- "use-constant-fp-for-scalable-splat", cl::init(false), cl::Hidden,
+ "use-constant-fp-for-scalable-splat", cl::init(true), cl::Hidden,
cl::desc("Use ConstantFP's native scalable vector splat support."));
//===----------------------------------------------------------------------===//
@@ -76,6 +76,10 @@ bool Constant::isNullValue() const {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
return CI->isZero();
+ // 0 is null.
+ if (const ConstantByte *CB = dyn_cast<ConstantByte>(this))
+ return CB->isZero();
+
// +0.0 is null.
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
// ppc_fp128 determine isZero using high order double only
@@ -93,6 +97,10 @@ bool Constant::isAllOnesValue() const {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
return CI->isMinusOne();
+  // Check for all-ones bytes.
+ if (const ConstantByte *CB = dyn_cast<ConstantByte>(this))
+ return CB->isMinusOne();
+
// Check for FP which are bitcasted from -1 integers
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
return CFP->getValueAPF().bitcastToAPInt().isAllOnes();
@@ -110,6 +118,10 @@ bool Constant::isOneValue() const {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
return CI->isOne();
+  // Check for bytes equal to 1.
+ if (const ConstantByte *CB = dyn_cast<ConstantByte>(this))
+ return CB->isOne();
+
// Check for FP which are bitcasted from 1 integers
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
return CFP->getValueAPF().bitcastToAPInt().isOne();
@@ -127,6 +139,10 @@ bool Constant::isNotOneValue() const {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
return !CI->isOneValue();
+  // Check for bytes equal to 1.
+ if (const ConstantByte *CB = dyn_cast<ConstantByte>(this))
+ return !CB->isOneValue();
+
// Check for FP which are bitcasted from 1 integers
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(this))
return !CFP->getValueAPF().bitcastToAPInt().isOne();
@@ -373,6 +389,8 @@ bool Constant::containsConstantExpression() const {
/// Constructor to create a '0' constant of arbitrary type.
Constant *Constant::getNullValue(Type *Ty) {
switch (Ty->getTypeID()) {
+ case Type::ByteTyID:
+ return ConstantByte::get(Ty, 0);
case Type::IntegerTyID:
return ConstantInt::get(Ty, 0);
case Type::HalfTyID:
@@ -411,6 +429,10 @@ Constant *Constant::getIntegerValue(Type *Ty, const APInt &V) {
if (PointerType *PTy = dyn_cast<PointerType>(ScalarTy))
C = ConstantExpr::getIntToPtr(C, PTy);
+ // Convert an integer to a byte, if necessary.
+ if (ByteType *BTy = dyn_cast<ByteType>(ScalarTy))
+ C = ConstantExpr::getBitCast(C, BTy);
+
// Broadcast a scalar to a vector, if necessary.
if (VectorType *VTy = dyn_cast<VectorType>(Ty))
C = ConstantVector::getSplat(VTy->getElementCount(), C);
@@ -428,6 +450,10 @@ Constant *Constant::getAllOnesValue(Type *Ty) {
return ConstantFP::get(Ty->getContext(), FL);
}
+ if (ByteType *BTy = dyn_cast<ByteType>(Ty))
+ return ConstantByte::get(Ty->getContext(),
+ APInt::getAllOnes(BTy->getBitWidth()));
+
VectorType *VTy = cast<VectorType>(Ty);
return ConstantVector::getSplat(VTy->getElementCount(),
getAllOnesValue(VTy->getElementType()));
@@ -452,6 +478,13 @@ Constant *Constant::getAggregateElement(unsigned Elt) const {
? ConstantInt::get(getContext(), CI->getValue())
: nullptr;
+ if (const auto *CB = dyn_cast<ConstantByte>(this))
+ return Elt < cast<VectorType>(getType())
+ ->getElementCount()
+ .getKnownMinValue()
+ ? ConstantByte::get(getContext(), CB->getValue())
+ : nullptr;
+
if (const auto *CFP = dyn_cast<ConstantFP>(this))
return Elt < cast<VectorType>(getType())
->getElementCount()
@@ -532,6 +565,9 @@ void llvm::deleteConstant(Constant *C) {
case Constant::ConstantIntVal:
delete static_cast<ConstantInt *>(C);
break;
+ case Constant::ConstantByteVal:
+ delete static_cast<ConstantByte *>(C);
+ break;
case Constant::ConstantFPVal:
delete static_cast<ConstantFP *>(C);
break;
@@ -983,6 +1019,93 @@ void ConstantInt::destroyConstantImpl() {
llvm_unreachable("You can't ConstantInt->destroyConstantImpl()!");
}
+//===----------------------------------------------------------------------===//
+// ConstantByte
+//===----------------------------------------------------------------------===//
+
+ConstantByte::ConstantByte(Type *Ty, const APInt &V)
+ : ConstantData(Ty, ConstantByteVal), Val(V) {
+ assert(V.getBitWidth() ==
+ cast<ByteType>(Ty->getScalarType())->getBitWidth() &&
+ "Invalid constant for type");
+}
+
+// Get a ConstantByte from an APInt.
+ConstantByte *ConstantByte::get(LLVMContext &Context, const APInt &V) {
+ // get an existing value or the insertion position
+ LLVMContextImpl *pImpl = Context.pImpl;
+ std::unique_ptr<ConstantByte> &Slot =
+ V.isZero() ? pImpl->ByteZeroConstants[V.getBitWidth()]
+ : V.isOne() ? pImpl->ByteOneConstants[V.getBitWidth()]
+ : pImpl->ByteConstants[V];
+ if (!Slot) {
+ // Get the corresponding byte type for the bit width of the value.
+ ByteType *BTy = ByteType::get(Context, V.getBitWidth());
+ Slot.reset(new ConstantByte(BTy, V));
+ }
+ assert(Slot->getType() == ByteType::get(Context, V.getBitWidth()));
+ return Slot.get();
+}
+
+// Get a ConstantByte vector with each lane set to the same APInt.
+ConstantByte *ConstantByte::get(LLVMContext &Context, ElementCount EC,
+ const APInt &V) {
+ // Get an existing value or the insertion position.
+ std::unique_ptr<ConstantByte> &Slot =
+ Context.pImpl->ByteSplatConstants[std::make_pair(EC, V)];
+ if (!Slot) {
+ ByteType *BTy = ByteType::get(Context, V.getBitWidth());
+ VectorType *VTy = VectorType::get(BTy, EC);
+ Slot.reset(new ConstantByte(VTy, V));
+ }
+
+#ifndef NDEBUG
+ ByteType *BTy = ByteType::get(Context, V.getBitWidth());
+ VectorType *VTy = VectorType::get(BTy, EC);
+ assert(Slot->getType() == VTy);
+#endif
+ return Slot.get();
+}
+
+Constant *ConstantByte::get(Type *Ty, uint64_t V, bool isSigned,
+ bool ImplicitTrunc) {
+ Constant *C =
+ get(cast<ByteType>(Ty->getScalarType()), V, isSigned, ImplicitTrunc);
+
+ // For vectors, broadcast the value.
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty))
+ return ConstantVector::getSplat(VTy->getElementCount(), C);
+
+ return C;
+}
+
+ConstantByte *ConstantByte::get(ByteType *Ty, uint64_t V, bool isSigned,
+ bool ImplicitTrunc) {
+ return get(Ty->getContext(),
+ APInt(Ty->getBitWidth(), V, isSigned, ImplicitTrunc));
+}
+
+Constant *ConstantByte::get(Type *Ty, const APInt &V) {
+ ConstantByte *C = get(Ty->getContext(), V);
+ assert(C->getType() == Ty->getScalarType() &&
+ "ConstantByte type doesn't match the type implied by its value!");
+
+ // For vectors, broadcast the value.
+ if (VectorType *VTy = dyn_cast<VectorType>(Ty))
+ return ConstantVector::getSplat(VTy->getElementCount(), C);
+
+ return C;
+}
+
+ConstantByte *ConstantByte::get(ByteType *Ty, StringRef Str, uint8_t radix) {
+ return get(Ty->getContext(), APInt(Ty->getBitWidth(), Str, radix));
+}
+
+/// Remove the constant from the constant table.
+void ConstantByte::destroyConstantImpl() {
+ llvm_unreachable("You can't ConstantByte->destroyConstantImpl()!");
+}
+
//===----------------------------------------------------------------------===//
// ConstantFP
//===----------------------------------------------------------------------===//
@@ -1253,6 +1376,19 @@ static Constant *getIntSequenceIfElementsMatch(ArrayRef<Constant *> V) {
return SequentialTy::get(V[0]->getContext(), Elts);
}
+template <typename SequentialTy, typename ElementTy>
+static Constant *getByteSequenceIfElementsMatch(ArrayRef<Constant *> V) {
+ assert(!V.empty() && "Cannot get empty byte sequence.");
+
+ SmallVector<ElementTy, 16> Elts;
+ for (Constant *C : V)
+    if (auto *CB = dyn_cast<ConstantByte>(C))
+      Elts.push_back(CB->getZExtValue());
+ else
+ return nullptr;
+ return SequentialTy::getByte(V[0]->getType(), Elts);
+}
+
template <typename SequentialTy, typename ElementTy>
static Constant *getFPSequenceIfElementsMatch(ArrayRef<Constant *> V) {
assert(!V.empty() && "Cannot get empty FP sequence.");
@@ -1281,6 +1417,15 @@ static Constant *getSequenceIfElementsMatch(Constant *C,
return getIntSequenceIfElementsMatch<SequenceTy, uint32_t>(V);
else if (CI->getType()->isIntegerTy(64))
return getIntSequenceIfElementsMatch<SequenceTy, uint64_t>(V);
+ } else if (ConstantByte *CB = dyn_cast<ConstantByte>(C)) {
+ if (CB->getType()->isByteTy(8))
+ return getByteSequenceIfElementsMatch<SequenceTy, uint8_t>(V);
+ else if (CB->getType()->isByteTy(16))
+ return getByteSequenceIfElementsMatch<SequenceTy, uint16_t>(V);
+ else if (CB->getType()->isByteTy(32))
+ return getByteSequenceIfElementsMatch<SequenceTy, uint32_t>(V);
+ else if (CB->getType()->isByteTy(64))
+ return getByteSequenceIfElementsMatch<SequenceTy, uint64_t>(V);
} else if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
if (CFP->getType()->isHalfTy() || CFP->getType()->isBFloatTy())
return getFPSequenceIfElementsMatch<SequenceTy, uint16_t>(V);
@@ -1346,8 +1491,9 @@ Constant *ConstantArray::getImpl(ArrayType *Ty, ArrayRef<Constant*> V) {
if (C->isNullValue() && rangeOnlyContains(V.begin(), V.end(), C))
return ConstantAggregateZero::get(Ty);
- // Check to see if all of the elements are ConstantFP or ConstantInt and if
- // the element type is compatible with ConstantDataVector. If so, use it.
+  // Check to see if all of the elements are ConstantFP, ConstantInt, or
+  // ConstantByte and if the element type is compatible with
+  // ConstantDataVector. If so, use it.
if (ConstantDataSequential::isElementTypeCompatible(C->getType()))
return getSequenceIfElementsMatch<ConstantDataArray>(C, V);
@@ -1444,11 +1590,13 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
bool isPoison = isa<PoisonValue>(C);
bool isSplatFP = UseConstantFPForFixedLengthSplat && isa<ConstantFP>(C);
bool isSplatInt = UseConstantIntForFixedLengthSplat && isa<ConstantInt>(C);
+ bool isSplatByte = isa<ConstantByte>(C);
- if (isZero || isUndef || isSplatFP || isSplatInt) {
+ if (isZero || isUndef || isSplatFP || isSplatInt || isSplatByte) {
for (unsigned i = 1, e = V.size(); i != e; ++i)
if (V[i] != C) {
- isZero = isUndef = isPoison = isSplatFP = isSplatInt = false;
+ isZero = isUndef = isPoison = isSplatFP = isSplatInt = isSplatByte =
+ false;
break;
}
}
@@ -1465,6 +1613,9 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
if (isSplatInt)
return ConstantInt::get(C->getContext(), T->getElementCount(),
cast<ConstantInt>(C)->getValue());
+ if (isSplatByte)
+ return ConstantByte::get(C->getContext(), T->getElementCount(),
+ cast<ConstantByte>(C)->getValue());
// Check to see if all of the elements are ConstantFP or ConstantInt and if
// the element type is compatible with ConstantDataVector. If so, use it.
@@ -1483,6 +1634,9 @@ Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) {
if (UseConstantIntForFixedLengthSplat && isa<ConstantInt>(V))
return ConstantInt::get(V->getContext(), EC,
cast<ConstantInt>(V)->getValue());
+ if (isa<ConstantByte>(V))
+ return ConstantByte::get(V->getContext(), EC,
+ cast<ConstantByte>(V)->getValue());
if (UseConstantFPForFixedLengthSplat && isa<ConstantFP>(V))
return ConstantFP::get(V->getContext(), EC,
cast<ConstantFP>(V)->getValue());
@@ -1490,7 +1644,7 @@ Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) {
// If this splat is compatible with ConstantDataVector, use it instead of
// ConstantVector.
- if ((isa<ConstantFP>(V) || isa<ConstantInt>(V)) &&
+ if ((isa<ConstantFP>(V) || isa<ConstantInt>(V) || isa<ConstantByte>(V)) &&
ConstantDataSequential::isElementTypeCompatible(V->getType()))
return ConstantDataVector::getSplat(EC.getKnownMinValue(), V);
@@ -1503,6 +1657,9 @@ Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) {
if (UseConstantIntForScalableSplat && isa<ConstantInt>(V))
return ConstantInt::get(V->getContext(), EC,
cast<ConstantInt>(V)->getValue());
+ if (isa<ConstantByte>(V))
+ return ConstantByte::get(V->getContext(), EC,
+ cast<ConstantByte>(V)->getValue());
if (UseConstantFPForScalableSplat && isa<ConstantFP>(V))
return ConstantFP::get(V->getContext(), EC,
cast<ConstantFP>(V)->getValue());
@@ -1723,6 +1880,8 @@ Constant *Constant::getSplatValue(bool AllowPoison) const {
return getNullValue(cast<VectorType>(getType())->getElementType());
if (auto *CI = dyn_cast<ConstantInt>(this))
return ConstantInt::get(getContext(), CI->getValue());
+ if (auto *CB = dyn_cast<ConstantByte>(this))
+ return ConstantByte::get(getContext(), CB->getValue());
if (auto *CFP = dyn_cast<ConstantFP>(this))
return ConstantFP::get(getContext(), CFP->getValue());
if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this))
@@ -1782,6 +1941,8 @@ Constant *ConstantVector::getSplatValue(bool AllowPoison) const {
const APInt &Constant::getUniqueInteger() const {
if (const ConstantInt *CI = dyn_cast<ConstantInt>(this))
return CI->getValue();
+ if (const ConstantByte *CB = dyn_cast<ConstantByte>(this))
+ return CB->getValue();
// Scalable vectors can use a ConstantExpr to build a splat.
if (isa<ConstantExpr>(this))
return cast<ConstantInt>(this->getSplatValue())->getValue();
@@ -1805,6 +1966,10 @@ ConstantRange Constant::toConstantRange() const {
getSplatValue(/*AllowPoison=*/true)))
return ConstantRange(CI->getValue());
+ if (auto *CB =
+ dyn_cast_or_null<ConstantByte>(getSplatValue(/*AllowPoison=*/true)))
+ return ConstantRange(CB->getValue());
+
if (auto *CDV = dyn_cast<ConstantDataVector>(this)) {
ConstantRange CR = ConstantRange::getEmpty(BitWidth);
for (unsigned I = 0, E = CDV->getNumElements(); I < E; ++I)
@@ -1821,7 +1986,8 @@ ConstantRange Constant::toConstantRange() const {
if (isa<PoisonValue>(Elem))
continue;
auto *CI = dyn_cast<ConstantInt>(Elem);
- if (!CI)
+ auto *CB = dyn_cast<ConstantByte>(Elem);
+ if (!CI && !CB)
return ConstantRange::getFull(BitWidth);
-      CR = CR.unionWith(CI->getValue());
+      CR = CR.unionWith(CI ? CI->getValue() : CB->getValue());
}
@@ -2868,6 +3034,17 @@ bool ConstantDataSequential::isElementTypeCompatible(Type *Ty) {
default: break;
}
}
+  if (auto *BT = dyn_cast<ByteType>(Ty)) {
+    switch (BT->getBitWidth()) {
+ case 8:
+ case 16:
+ case 32:
+ case 64:
+ return true;
+ default:
+ break;
+ }
+ }
return false;
}
@@ -3000,17 +3177,54 @@ Constant *ConstantDataArray::getFP(Type *ElementType, ArrayRef<uint64_t> Elts) {
return getImpl(StringRef(Data, Elts.size() * 8), Ty);
}
-Constant *ConstantDataArray::getString(LLVMContext &Context,
- StringRef Str, bool AddNull) {
+/// getByte() constructors - Return a constant of array type with a byte
+/// element type taken from argument `ElementType', and count taken from
+/// argument `Elts'. The amount of bits of the contained type must match the
+/// number of bits of the type contained in the passed in ArrayRef.
+/// Note that this can return a ConstantAggregateZero object.
+Constant *ConstantDataArray::getByte(Type *ElementType,
+ ArrayRef<uint8_t> Elts) {
+  assert(ElementType->isByteTy(8) && "Element type is not an 8-bit byte type");
+ Type *Ty = ArrayType::get(ElementType, Elts.size());
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(Data, Elts.size() * 1), Ty);
+}
+Constant *ConstantDataArray::getByte(Type *ElementType,
+ ArrayRef<uint16_t> Elts) {
+ assert(ElementType->isByteTy(16) && "Element type is not a 16-bit byte type");
+ Type *Ty = ArrayType::get(ElementType, Elts.size());
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(Data, Elts.size() * 2), Ty);
+}
+Constant *ConstantDataArray::getByte(Type *ElementType,
+ ArrayRef<uint32_t> Elts) {
+ assert(ElementType->isByteTy(32) && "Element type is not a 32-bit byte type");
+ Type *Ty = ArrayType::get(ElementType, Elts.size());
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(Data, Elts.size() * 4), Ty);
+}
+Constant *ConstantDataArray::getByte(Type *ElementType,
+ ArrayRef<uint64_t> Elts) {
+ assert(ElementType->isByteTy(64) && "Element type is not a 64-bit byte type");
+ Type *Ty = ArrayType::get(ElementType, Elts.size());
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(Data, Elts.size() * 8), Ty);
+}
+
+Constant *ConstantDataArray::getString(LLVMContext &Context, StringRef Str,
+ bool AddNull, bool ByteString) {
if (!AddNull) {
const uint8_t *Data = Str.bytes_begin();
- return get(Context, ArrayRef(Data, Str.size()));
+ return ByteString
+ ? getByte(Type::getByte8Ty(Context), ArrayRef(Data, Str.size()))
+ : get(Context, ArrayRef(Data, Str.size()));
}
SmallVector<uint8_t, 64> ElementVals;
ElementVals.append(Str.begin(), Str.end());
ElementVals.push_back(0);
- return get(Context, ElementVals);
+ return ByteString ? getByte(Type::getByte8Ty(Context), ElementVals)
+ : get(Context, ElementVals);
}
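A small usage sketch for the new ByteString flag (Ctx is an assumed LLVMContext; the commented types follow from the getByte() overloads above):

// Request a byte-typed string constant instead of the usual i8 array.
Constant *Str = ConstantDataArray::getString(Ctx, "hi", /*AddNull=*/true,
                                             /*ByteString=*/true);
// Str->getType() is [3 x b8]; with ByteString == false it would be the
// familiar [3 x i8] c"hi\00".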
/// get() constructors - Return a constant with vector type with an element
@@ -3047,6 +3261,40 @@ Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<double> Elts) {
return getImpl(StringRef(Data, Elts.size() * 8), Ty);
}
+/// getByte() constructors - Return a constant of vector type with a byte
+/// element type taken from argument `ElementType', and count taken from
+/// argument `Elts'. The amount of bits of the contained type must match the
+/// number of bits of the type contained in the passed in ArrayRef.
+/// Note that this can return a ConstantAggregateZero object.
+Constant *ConstantDataVector::getByte(Type *ElementType,
+ ArrayRef<uint8_t> Elts) {
+  assert(ElementType->isByteTy(8) && "Element type is not an 8-bit byte");
+ auto *Ty = FixedVectorType::get(ElementType, Elts.size());
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(Data, Elts.size() * 1), Ty);
+}
+Constant *ConstantDataVector::getByte(Type *ElementType,
+ ArrayRef<uint16_t> Elts) {
+ assert(ElementType->isByteTy(16) && "Element type is not a 16-bit byte");
+ auto *Ty = FixedVectorType::get(ElementType, Elts.size());
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(Data, Elts.size() * 2), Ty);
+}
+Constant *ConstantDataVector::getByte(Type *ElementType,
+ ArrayRef<uint32_t> Elts) {
+ assert(ElementType->isByteTy(32) && "Element type is not a 32-bit byte");
+ auto *Ty = FixedVectorType::get(ElementType, Elts.size());
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(Data, Elts.size() * 4), Ty);
+}
+Constant *ConstantDataVector::getByte(Type *ElementType,
+ ArrayRef<uint64_t> Elts) {
+ assert(ElementType->isByteTy(64) && "Element type is not a 64-bit byte");
+ auto *Ty = FixedVectorType::get(ElementType, Elts.size());
+ const char *Data = reinterpret_cast<const char *>(Elts.data());
+ return getImpl(StringRef(Data, Elts.size() * 8), Ty);
+}
+
/// getFP() constructors - Return a constant of vector type with a float
/// element type taken from argument `ElementType', and count taken from
/// argument `Elts'. The amount of bits of the contained type must match the
@@ -3098,6 +3346,24 @@ Constant *ConstantDataVector::getSplat(unsigned NumElts, Constant *V) {
return get(V->getContext(), Elts);
}
+ if (ConstantByte *CB = dyn_cast<ConstantByte>(V)) {
+ if (CB->getType()->isByteTy(8)) {
+ SmallVector<uint8_t, 16> Elts(NumElts, CB->getZExtValue());
+ return getByte(V->getType(), Elts);
+ }
+ if (CB->getType()->isByteTy(16)) {
+ SmallVector<uint16_t, 16> Elts(NumElts, CB->getZExtValue());
+ return getByte(V->getType(), Elts);
+ }
+ if (CB->getType()->isByteTy(32)) {
+ SmallVector<uint32_t, 16> Elts(NumElts, CB->getZExtValue());
+ return getByte(V->getType(), Elts);
+ }
+ assert(CB->getType()->isByteTy(64) && "Unsupported ConstantData type");
+ SmallVector<uint64_t, 16> Elts(NumElts, CB->getZExtValue());
+ return getByte(V->getType(), Elts);
+ }
+
if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
if (CFP->getType()->isHalfTy()) {
SmallVector<uint16_t, 16> Elts(
@@ -3124,13 +3390,14 @@ Constant *ConstantDataVector::getSplat(unsigned NumElts, Constant *V) {
}
uint64_t ConstantDataSequential::getElementAsInteger(uint64_t Elt) const {
- assert(isa<IntegerType>(getElementType()) &&
- "Accessor can only be used when element is an integer");
+ assert(
+ (isa<IntegerType>(getElementType()) || isa<ByteType>(getElementType())) &&
+ "Accessor can only be used when element is an integer or byte");
const char *EltPtr = getElementPointer(Elt);
// The data is stored in host byte order, make sure to cast back to the right
// type to load with the right endianness.
- switch (getElementType()->getIntegerBitWidth()) {
+ switch (getElementType()->getScalarSizeInBits()) {
default: llvm_unreachable("Invalid bitwidth for CDS");
case 8:
return *reinterpret_cast<const uint8_t *>(EltPtr);
@@ -3144,13 +3411,14 @@ uint64_t ConstantDataSequential::getElementAsInteger(uint64_t Elt) const {
}
APInt ConstantDataSequential::getElementAsAPInt(uint64_t Elt) const {
- assert(isa<IntegerType>(getElementType()) &&
- "Accessor can only be used when element is an integer");
+ assert(
+ (isa<IntegerType>(getElementType()) || isa<ByteType>(getElementType())) &&
+ "Accessor can only be used when element is an integer or byte");
const char *EltPtr = getElementPointer(Elt);
// The data is stored in host byte order, make sure to cast back to the right
// type to load with the right endianness.
- switch (getElementType()->getIntegerBitWidth()) {
+ switch (getElementType()->getScalarSizeInBits()) {
default: llvm_unreachable("Invalid bitwidth for CDS");
case 8: {
auto EltVal = *reinterpret_cast<const uint8_t *>(EltPtr);
@@ -3213,11 +3481,16 @@ Constant *ConstantDataSequential::getElementAsConstant(uint64_t Elt) const {
getElementType()->isFloatTy() || getElementType()->isDoubleTy())
return ConstantFP::get(getContext(), getElementAsAPFloat(Elt));
+ if (getElementType()->isByteTy())
+ return ConstantByte::get(getElementType(), getElementAsInteger(Elt));
+
return ConstantInt::get(getElementType(), getElementAsInteger(Elt));
}
bool ConstantDataSequential::isString(unsigned CharSize) const {
- return isa<ArrayType>(getType()) && getElementType()->isIntegerTy(CharSize);
+ return isa<ArrayType>(getType()) &&
+ (getElementType()->isIntegerTy(CharSize) ||
+ getElementType()->isByteTy(CharSize));
}
bool ConstantDataSequential::isCString() const {
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 65add3415b3bf..f91439f410eaa 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -621,6 +621,8 @@ LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) {
return LLVMLabelTypeKind;
case Type::MetadataTyID:
return LLVMMetadataTypeKind;
+ case Type::ByteTyID:
+ return LLVMByteTypeKind;
case Type::IntegerTyID:
return LLVMIntegerTypeKind;
case Type::FunctionTyID:
@@ -672,6 +674,16 @@ char *LLVMPrintTypeToString(LLVMTypeRef Ty) {
return strdup(buf.c_str());
}
+/*--.. Operations on byte types ............................................--*/
+
+LLVMTypeRef LLVMByteTypeInContext(LLVMContextRef C, unsigned NumBits) {
+ return wrap(ByteType::get(*unwrap(C), NumBits));
+}
+
+unsigned LLVMGetByteTypeWidth(LLVMTypeRef ByteTy) {
+ return unwrap<ByteType>(ByteTy)->getBitWidth();
+}
+
/*--.. Operations on integer types .........................................--*/
LLVMTypeRef LLVMInt1TypeInContext(LLVMContextRef C) {
@@ -1572,6 +1584,30 @@ LLVMValueRef LLVMConstIntOfStringAndSize(LLVMTypeRef IntTy, const char Str[],
Radix));
}
+LLVMValueRef LLVMConstByte(LLVMTypeRef ByteTy, unsigned long long N) {
+ return wrap(ConstantByte::get(unwrap<ByteType>(ByteTy), N));
+}
+
+LLVMValueRef LLVMConstByteOfArbitraryPrecision(LLVMTypeRef ByteTy,
+ unsigned NumWords,
+ const uint64_t Words[]) {
+ ByteType *Ty = unwrap<ByteType>(ByteTy);
+ return wrap(ConstantByte::get(
+ Ty->getContext(), APInt(Ty->getBitWidth(), ArrayRef(Words, NumWords))));
+}
+
+LLVMValueRef LLVMConstByteOfString(LLVMTypeRef ByteTy, const char Str[],
+ uint8_t Radix) {
+ return wrap(
+ ConstantByte::get(unwrap<ByteType>(ByteTy), StringRef(Str), Radix));
+}
+
+LLVMValueRef LLVMConstByteOfStringAndSize(LLVMTypeRef ByteTy, const char Str[],
+ size_t SLen, uint8_t Radix) {
+ return wrap(
+ ConstantByte::get(unwrap<ByteType>(ByteTy), StringRef(Str, SLen), Radix));
+}
+
LLVMValueRef LLVMConstReal(LLVMTypeRef RealTy, double N) {
return wrap(ConstantFP::get(unwrap(RealTy), N));
}
@@ -1601,6 +1637,14 @@ long long LLVMConstIntGetSExtValue(LLVMValueRef ConstantVal) {
return unwrap<ConstantInt>(ConstantVal)->getSExtValue();
}
+unsigned long long LLVMConstByteGetZExtValue(LLVMValueRef ConstantVal) {
+ return unwrap<ConstantByte>(ConstantVal)->getZExtValue();
+}
+
+long long LLVMConstByteGetSExtValue(LLVMValueRef ConstantVal) {
+ return unwrap<ConstantByte>(ConstantVal)->getSExtValue();
+}
+
double LLVMConstRealGetDouble(LLVMValueRef ConstantVal, LLVMBool *LosesInfo) {
ConstantFP *cFP = unwrap<ConstantFP>(ConstantVal) ;
Type *Ty = cFP->getType();
@@ -3486,7 +3530,7 @@ LLVMValueRef LLVMBuildRet(LLVMBuilderRef B, LLVMValueRef V) {
LLVMValueRef LLVMBuildAggregateRet(LLVMBuilderRef B, LLVMValueRef *RetVals,
unsigned N) {
- return wrap(unwrap(B)->CreateAggregateRet(unwrap(RetVals), N));
+ return wrap(unwrap(B)->CreateAggregateRet({unwrap(RetVals), N}));
}
LLVMValueRef LLVMBuildBr(LLVMBuilderRef B, LLVMBasicBlockRef Dest) {
diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp
index 22b489110d064..758f49f60c3f1 100644
--- a/llvm/lib/IR/DataLayout.cpp
+++ b/llvm/lib/IR/DataLayout.cpp
@@ -874,6 +874,9 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const {
const Align Align = abi_or_pref ? StructABIAlignment : StructPrefAlignment;
return std::max(Align, Layout->getAlignment());
}
+ case Type::ByteTyID:
+ // The byte type has the same alignment as the equally sized integer type.
+ return getIntegerAlignment(Ty->getByteBitWidth(), abi_or_pref);
case Type::IntegerTyID:
return getIntegerAlignment(Ty->getIntegerBitWidth(), abi_or_pref);
case Type::HalfTyID:
@@ -988,6 +991,21 @@ Type *DataLayout::getIntPtrType(Type *Ty) const {
return IntTy;
}
+ByteType *DataLayout::getBytePtrType(LLVMContext &C,
+ unsigned AddressSpace) const {
+ return ByteType::get(C, getPointerSizeInBits(AddressSpace));
+}
+
+Type *DataLayout::getBytePtrType(Type *Ty) const {
+ assert(Ty->isPtrOrPtrVectorTy() &&
+ "Expected a pointer or pointer vector type.");
+ unsigned NumBits = getPointerTypeSizeInBits(Ty);
+ ByteType *ByteTy = ByteType::get(Ty->getContext(), NumBits);
+ if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
+ return VectorType::get(ByteTy, VecTy);
+ return ByteTy;
+}
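A usage sketch for the new helpers; the data layout string and variable names are assumptions, and the results follow from the code above:

DataLayout DL("e-m:e-p:64:64");
LLVMContext Ctx;
ByteType *B64 = DL.getBytePtrType(Ctx, /*AddressSpace=*/0);     // b64
Type *PtrVec = FixedVectorType::get(PointerType::get(Ctx, 0), 4);
Type *ByteVec = DL.getBytePtrType(PtrVec);                      // <4 x b64>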
+
Type *DataLayout::getSmallestLegalIntType(LLVMContext &C, unsigned Width) const {
for (unsigned LegalIntWidth : LegalIntWidths)
if (Width <= LegalIntWidth)
diff --git a/llvm/lib/IR/Dominators.cpp b/llvm/lib/IR/Dominators.cpp
index 7bd1cef88dbaf..e11b80920e7ff 100644
--- a/llvm/lib/IR/Dominators.cpp
+++ b/llvm/lib/IR/Dominators.cpp
@@ -49,18 +49,6 @@ static constexpr bool ExpensiveChecksEnabled = true;
static constexpr bool ExpensiveChecksEnabled = false;
#endif
-bool BasicBlockEdge::isSingleEdge() const {
- unsigned NumEdgesToEnd = 0;
- for (const BasicBlock *Succ : successors(Start)) {
- if (Succ == End)
- ++NumEdgesToEnd;
- if (NumEdgesToEnd >= 2)
- return false;
- }
- assert(NumEdgesToEnd == 1);
- return true;
-}
-
//===----------------------------------------------------------------------===//
// DominatorTree Implementation
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index e324528298532..5205d36a228c1 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -1352,11 +1352,24 @@ void Instruction::setSuccessor(unsigned idx, BasicBlock *B) {
llvm_unreachable("not a terminator");
}
+iterator_range<Instruction::const_succ_iterator>
+Instruction::successors() const {
+ switch (getOpcode()) {
+#define HANDLE_TERM_INST(N, OPC, CLASS) \
+ case Instruction::OPC: \
+ return static_cast<const CLASS *>(this)->successors();
+#include "llvm/IR/Instruction.def"
+ default:
+ break;
+ }
+ llvm_unreachable("not a terminator");
+}
+
void Instruction::replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB) {
- for (unsigned Idx = 0, NumSuccessors = Instruction::getNumSuccessors();
- Idx != NumSuccessors; ++Idx)
- if (getSuccessor(Idx) == OldBB)
- setSuccessor(Idx, NewBB);
+ auto Succs = successors();
+ for (auto I = Succs.begin(), E = Succs.end(); I != E; ++I)
+ if (*I == OldBB)
+ I.getUse()->set(NewBB);
}
Instruction *Instruction::cloneImpl() const {
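The new const overload makes read-only CFG walks over a terminator's successors straightforward; a minimal sketch (countEdgesTo is a hypothetical helper, not part of the patch):

// Count how many edges of a terminator lead to a given block.
static unsigned countEdgesTo(const Instruction *Term,
                             const BasicBlock *Target) {
  unsigned N = 0;
  for (const BasicBlock *Succ : Term->successors())
    if (Succ == Target)
      ++N;
  return N;
}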
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index 4c00ee29ac782..0485af358e2c4 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -1218,8 +1218,8 @@ CondBrInst::CondBrInst(Value *Cond, BasicBlock *IfTrue, BasicBlock *IfFalse,
AllocMarker, InsertBefore) {
// Assign in order of operand index to make use-list order predictable.
Op<-3>() = Cond;
- Op<-2>() = IfFalse;
- Op<-1>() = IfTrue;
+ Op<-2>() = IfTrue;
+ Op<-1>() = IfFalse;
#ifndef NDEBUG
AssertOK();
#endif
@@ -3242,7 +3242,16 @@ CastInst::getCastOpcode(
DestTy->getPrimitiveSizeInBits().getFixedValue(); // 0 for ptr
// Run through the possibilities ...
- if (DestTy->isIntegerTy()) { // Casting to integral
+ if (DestTy->isByteTy()) { // Casting to byte
+ if (SrcTy->isIntegerTy()) { // Casting from integral
+ assert(DestBits == SrcBits && "Illegal cast from integer to byte type");
+ return BitCast;
+ } else if (SrcTy->isPointerTy()) { // Casting from pointer
+ assert(DestBits == SrcBits && "Illegal cast from pointer to byte type");
+ return BitCast;
+ }
+ llvm_unreachable("Illegal cast to byte type");
+ } else if (DestTy->isIntegerTy()) { // Casting to integral
if (SrcTy->isIntegerTy()) { // Casting from integral
if (DestBits < SrcBits)
return Trunc; // int -> smaller int
@@ -3374,7 +3383,10 @@ CastInst::castIsValid(Instruction::CastOps op, Type *SrcTy, Type *DstTy) {
PointerType *DstPtrTy = dyn_cast<PointerType>(DstTy->getScalarType());
// BitCast implies a no-op cast of type only. No bits change.
- // However, you can't cast pointers to anything but pointers.
+ // However, you can't cast pointers to anything but pointers/bytes.
+ if ((SrcPtrTy && DstTy->isByteOrByteVectorTy()) ||
+ (SrcTy->isByteOrByteVectorTy() && DstPtrTy))
+ return true;
if (!SrcPtrTy != !DstPtrTy)
return false;
@@ -3809,22 +3821,6 @@ CmpInst::Predicate CmpInst::getFlippedStrictnessPredicate(Predicate pred) {
llvm_unreachable("Unknown predicate!");
}
-bool CmpInst::isUnsigned(Predicate predicate) {
- switch (predicate) {
- default: return false;
- case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: case ICmpInst::ICMP_UGT:
- case ICmpInst::ICMP_UGE: return true;
- }
-}
-
-bool CmpInst::isSigned(Predicate predicate) {
- switch (predicate) {
- default: return false;
- case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_SLE: case ICmpInst::ICMP_SGT:
- case ICmpInst::ICMP_SGE: return true;
- }
-}
-
bool ICmpInst::compare(const APInt &LHS, const APInt &RHS,
ICmpInst::Predicate Pred) {
assert(ICmpInst::isIntPredicate(Pred) && "Only for integer predicates!");
diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp
index 186bd1edb8c52..f2c6921bbb7e0 100644
--- a/llvm/lib/IR/Intrinsics.cpp
+++ b/llvm/lib/IR/Intrinsics.cpp
@@ -146,6 +146,9 @@ static std::string getMangledTypeStr(Type *Ty, bool &HasUnnamedType) {
case Type::IntegerTyID:
Result += "i" + utostr(cast<IntegerType>(Ty)->getBitWidth());
break;
+ case Type::ByteTyID:
+ Result += "b" + utostr(cast<ByteType>(Ty)->getBitWidth());
+ break;
}
}
return Result;
diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp
index 8f79398b086eb..9bb14178234db 100644
--- a/llvm/lib/IR/LLVMContextImpl.cpp
+++ b/llvm/lib/IR/LLVMContextImpl.cpp
@@ -37,7 +37,8 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C)
X86_FP80Ty(C, Type::X86_FP80TyID), FP128Ty(C, Type::FP128TyID),
PPC_FP128Ty(C, Type::PPC_FP128TyID), X86_AMXTy(C, Type::X86_AMXTyID),
Int1Ty(C, 1), Int8Ty(C, 8), Int16Ty(C, 16), Int32Ty(C, 32),
- Int64Ty(C, 64), Int128Ty(C, 128) {}
+ Int64Ty(C, 64), Int128Ty(C, 128), Byte1Ty(C, 1), Byte8Ty(C, 8),
+ Byte16Ty(C, 16), Byte32Ty(C, 32), Byte64Ty(C, 64), Byte128Ty(C, 128) {}
LLVMContextImpl::~LLVMContextImpl() {
#ifndef NDEBUG
@@ -119,6 +120,10 @@ LLVMContextImpl::~LLVMContextImpl() {
IntOneConstants.clear();
IntConstants.clear();
IntSplatConstants.clear();
+ ByteZeroConstants.clear();
+ ByteOneConstants.clear();
+ ByteConstants.clear();
+ ByteSplatConstants.clear();
FPConstants.clear();
FPSplatConstants.clear();
CDSConstants.clear();
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 2c9921df0422e..85aaecede4795 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -1674,6 +1674,12 @@ class LLVMContextImpl {
DenseMap<std::pair<ElementCount, APInt>, std::unique_ptr<ConstantInt>>
IntSplatConstants;
+ DenseMap<unsigned, std::unique_ptr<ConstantByte>> ByteZeroConstants;
+ DenseMap<unsigned, std::unique_ptr<ConstantByte>> ByteOneConstants;
+ DenseMap<APInt, std::unique_ptr<ConstantByte>> ByteConstants;
+ DenseMap<std::pair<ElementCount, APInt>, std::unique_ptr<ConstantByte>>
+ ByteSplatConstants;
+
DenseMap<APFloat, std::unique_ptr<ConstantFP>> FPConstants;
DenseMap<std::pair<ElementCount, APFloat>, std::unique_ptr<ConstantFP>>
FPSplatConstants;
@@ -1743,11 +1749,15 @@ class LLVMContextImpl {
ConstantInt *TheTrueVal = nullptr;
ConstantInt *TheFalseVal = nullptr;
+ ConstantByte *TheTrueByteVal = nullptr;
+ ConstantByte *TheFalseByteVal = nullptr;
+
// Basic type instances.
Type VoidTy, LabelTy, HalfTy, BFloatTy, FloatTy, DoubleTy, MetadataTy,
TokenTy;
Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_AMXTy;
IntegerType Int1Ty, Int8Ty, Int16Ty, Int32Ty, Int64Ty, Int128Ty;
+ ByteType Byte1Ty, Byte8Ty, Byte16Ty, Byte32Ty, Byte64Ty, Byte128Ty;
std::unique_ptr<ConstantTokenNone> TheNoneToken;
@@ -1756,6 +1766,7 @@ class LLVMContextImpl {
SpecificBumpPtrAllocator<ConstantRangeAttributeImpl>
ConstantRangeAttributeAlloc;
+ DenseMap<unsigned, ByteType *> ByteTypes;
DenseMap<unsigned, IntegerType *> IntegerTypes;
using FunctionTypeSet = DenseSet<FunctionType *, FunctionTypeKeyInfo>;
diff --git a/llvm/lib/IR/PatternMatch.cpp b/llvm/lib/IR/PatternMatch.cpp
new file mode 100644
index 0000000000000..334aedeb48e2f
--- /dev/null
+++ b/llvm/lib/IR/PatternMatch.cpp
@@ -0,0 +1,53 @@
+//===- PatternMatch.cpp - Out-of-line PatternMatch definitions -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Out-of-line implementations for PatternMatch.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Value.h"
+
+using namespace llvm;
+
+bool llvm::PatternMatch::undef_match::checkAggregate(
+ const ConstantAggregate *CA) {
+ SmallPtrSet<const ConstantAggregate *, 8> Seen;
+ SmallVector<const ConstantAggregate *, 8> Worklist;
+
+ // Either UndefValue, PoisonValue, or an aggregate that only contains
+ // these is accepted by matcher.
+ // CheckValue returns false if CA cannot satisfy this constraint.
+ auto CheckValue = [&](const ConstantAggregate *CA) {
+ for (const Value *Op : CA->operand_values()) {
+ if (isa<UndefValue>(Op))
+ continue;
+
+ const auto *CA = dyn_cast<ConstantAggregate>(Op);
+ if (!CA)
+ return false;
+ if (Seen.insert(CA).second)
+ Worklist.emplace_back(CA);
+ }
+
+ return true;
+ };
+
+ if (!CheckValue(CA))
+ return false;
+
+ while (!Worklist.empty()) {
+ if (!CheckValue(Worklist.pop_back_val()))
+ return false;
+ }
+ return true;
+}
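In PatternMatch terms, the helper backs m_Undef(): a value matches if it is undef, poison, or a constant aggregate whose leaves are all undef or poison. A minimal sketch (SomeValue is an assumed llvm::Value*):

using namespace llvm::PatternMatch;
// True for undef/poison scalars and for aggregates built only from them,
// e.g. { i32 undef, [2 x i32] [i32 poison, i32 undef] }.
bool IsUndefLike = match(SomeValue, m_Undef());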
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index 299d07b81837b..498e78d4b0cc8 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -54,6 +54,10 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) {
}
}
+bool Type::isByteTy(unsigned BitWidth) const {
+ return isByteTy() && cast<ByteType>(this)->getBitWidth() == BitWidth;
+}
+
bool Type::isIntegerTy(unsigned Bitwidth) const {
return isIntegerTy() && cast<IntegerType>(this)->getBitWidth() == Bitwidth;
}
@@ -212,6 +216,8 @@ TypeSize Type::getPrimitiveSizeInBits() const {
return TypeSize::getFixed(128);
case Type::X86_AMXTyID:
return TypeSize::getFixed(8192);
+ case Type::ByteTyID:
+ return TypeSize::getFixed(cast<ByteType>(this)->getBitWidth());
case Type::IntegerTyID:
return TypeSize::getFixed(cast<IntegerType>(this)->getBitWidth());
case Type::FixedVectorTyID:
@@ -290,6 +296,17 @@ Type *Type::getFP128Ty(LLVMContext &C) { return &C.pImpl->FP128Ty; }
Type *Type::getPPC_FP128Ty(LLVMContext &C) { return &C.pImpl->PPC_FP128Ty; }
Type *Type::getX86_AMXTy(LLVMContext &C) { return &C.pImpl->X86_AMXTy; }
+ByteType *Type::getByte1Ty(LLVMContext &C) { return &C.pImpl->Byte1Ty; }
+ByteType *Type::getByte8Ty(LLVMContext &C) { return &C.pImpl->Byte8Ty; }
+ByteType *Type::getByte16Ty(LLVMContext &C) { return &C.pImpl->Byte16Ty; }
+ByteType *Type::getByte32Ty(LLVMContext &C) { return &C.pImpl->Byte32Ty; }
+ByteType *Type::getByte64Ty(LLVMContext &C) { return &C.pImpl->Byte64Ty; }
+ByteType *Type::getByte128Ty(LLVMContext &C) { return &C.pImpl->Byte128Ty; }
+
+ByteType *Type::getByteNTy(LLVMContext &C, unsigned N) {
+ return ByteType::get(C, N);
+}
+
IntegerType *Type::getInt1Ty(LLVMContext &C) { return &C.pImpl->Int1Ty; }
IntegerType *Type::getInt8Ty(LLVMContext &C) { return &C.pImpl->Int8Ty; }
IntegerType *Type::getInt16Ty(LLVMContext &C) { return &C.pImpl->Int16Ty; }
@@ -301,6 +318,25 @@ IntegerType *Type::getIntNTy(LLVMContext &C, unsigned N) {
return IntegerType::get(C, N);
}
+Type *Type::getIntFromByteType(Type *Ty) {
+ assert(Ty->isByteOrByteVectorTy() && "Expected a byte or byte vector type.");
+ unsigned NumBits = Ty->getScalarSizeInBits();
+ IntegerType *IntTy = IntegerType::get(Ty->getContext(), NumBits);
+ if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
+ return VectorType::get(IntTy, VecTy);
+ return IntTy;
+}
+
+Type *Type::getByteFromIntType(Type *Ty) {
+ assert(!Ty->isPtrOrPtrVectorTy() &&
+         "Expected a type that is not a pointer or pointer vector.");
+ unsigned NumBits = Ty->getScalarSizeInBits();
+ ByteType *ByteTy = ByteType::get(Ty->getContext(), NumBits);
+ if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
+ return VectorType::get(ByteTy, VecTy);
+ return ByteTy;
+}
+
Type *Type::getWasm_ExternrefTy(LLVMContext &C) {
// opaque pointer in addrspace(10)
return PointerType::get(C, 10);
@@ -341,6 +377,40 @@ IntegerType *IntegerType::get(LLVMContext &C, unsigned NumBits) {
APInt IntegerType::getMask() const { return APInt::getAllOnes(getBitWidth()); }
+//===----------------------------------------------------------------------===//
+// ByteType Implementation
+//===----------------------------------------------------------------------===//
+
+ByteType *ByteType::get(LLVMContext &C, unsigned NumBits) {
+ assert(NumBits >= MIN_BYTE_BITS && "bitwidth too small");
+ assert(NumBits <= MAX_BYTE_BITS && "bitwidth too large");
+
+ // Check for the built-in byte types
+ switch (NumBits) {
+ case 8:
+ return Type::getByte8Ty(C);
+ case 16:
+ return Type::getByte16Ty(C);
+ case 32:
+ return Type::getByte32Ty(C);
+ case 64:
+ return Type::getByte64Ty(C);
+ case 128:
+ return Type::getByte128Ty(C);
+ default:
+ break;
+ }
+
+ ByteType *&Entry = C.pImpl->ByteTypes[NumBits];
+
+ if (!Entry)
+ Entry = new (C.pImpl->Alloc) ByteType(C, NumBits);
+
+ return Entry;
+}
+
+APInt ByteType::getMask() const { return APInt::getAllOnes(getBitWidth()); }
+
//===----------------------------------------------------------------------===//
// FunctionType Implementation
//===----------------------------------------------------------------------===//
@@ -788,7 +858,8 @@ VectorType *VectorType::get(Type *ElementType, ElementCount EC) {
bool VectorType::isValidElementType(Type *ElemTy) {
if (ElemTy->isIntegerTy() || ElemTy->isFloatingPointTy() ||
- ElemTy->isPointerTy() || ElemTy->getTypeID() == TypedPointerTyID)
+ ElemTy->isPointerTy() || ElemTy->getTypeID() == TypedPointerTyID ||
+ ElemTy->isByteTy())
return true;
if (auto *TTy = dyn_cast<TargetExtType>(ElemTy))
return TTy->hasProperty(TargetExtType::CanBeVectorElement);
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 3cdc75ca9869e..b64fe75e1e2d6 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -410,10 +410,9 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
// pass manager to provide this as it isolates us from a potentially
// out-of-date dominator tree and makes it significantly more complex to run
// this code outside of a pass manager.
- // FIXME: It's really gross that we have to cast away constness here.
- if (!F.empty())
- DT.recalculate(const_cast<Function &>(F));
+ // First check that every basic block has a terminator, otherwise we can't
+ // even inspect the CFG.
for (const BasicBlock &BB : F) {
if (!BB.empty() && BB.back().isTerminator())
continue;
@@ -427,6 +426,10 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
return false;
}
+ // FIXME: It's really gross that we have to cast away constness here.
+ if (!F.empty())
+ DT.recalculate(const_cast<Function &>(F));
+
auto FailureCB = [this](const Twine &Message) {
this->CheckFailed(Message);
};
@@ -4637,9 +4640,10 @@ void Verifier::visitLoadInst(LoadInst &LI) {
LI.getOrdering() != AtomicOrdering::AcquireRelease,
"Load cannot have Release ordering", &LI);
Check(ElTy->getScalarType()->isIntOrPtrTy() ||
+ ElTy->getScalarType()->isByteTy() ||
ElTy->getScalarType()->isFloatingPointTy(),
- "atomic load operand must have integer, pointer, floating point, "
- "or vector type!",
+ "atomic load operand must have integer, byte, pointer, floating "
+ "point, or vector type!",
ElTy, &LI);
checkAtomicMemAccessSize(ElTy, &LI);
@@ -4665,9 +4669,10 @@ void Verifier::visitStoreInst(StoreInst &SI) {
SI.getOrdering() != AtomicOrdering::AcquireRelease,
"Store cannot have Acquire ordering", &SI);
Check(ElTy->getScalarType()->isIntOrPtrTy() ||
+ ElTy->getScalarType()->isByteTy() ||
ElTy->getScalarType()->isFloatingPointTy(),
- "atomic store operand must have integer, pointer, floating point, "
- "or vector type!",
+ "atomic store operand must have integer, byte, pointer, floating "
+ "point, or vector type!",
ElTy, &SI);
checkAtomicMemAccessSize(ElTy, &SI);
} else {
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 78e4c73ef0d2b..ea2380448c06f 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -127,6 +127,7 @@
#include "llvm/CodeGen/MachineDominanceFrontier.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineLICM.h"
#include "llvm/CodeGen/MachineLateInstrsCleanup.h"
#include "llvm/CodeGen/MachinePassManager.h"
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 380be042df69d..a2c68233fd253 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -42,6 +42,9 @@ if( WIN32 )
# advapi32 required for CryptAcquireContextW in lib/Support/Windows/Path.inc.
# ntdll required for RtlGetLastNtStatus in lib/Support/ErrorHandling.cpp.
set(system_libs ${system_libs} psapi shell32 ole32 uuid advapi32 ws2_32 ntdll)
+ if( HAVE_WINDOWS_ICU )
+ list(APPEND system_libs icu)
+ endif()
elseif( CMAKE_HOST_UNIX )
if( HAVE_LIBRT )
set(system_libs ${system_libs} rt)
@@ -333,7 +336,7 @@ add_llvm_component_library(LLVMSupport
)
# Link ICU library if it is an external library.
-if(ICU_FOUND)
+if(ICU_FOUND AND NOT HAVE_WINDOWS_ICU)
target_link_libraries(LLVMSupport
PRIVATE
${ICU_LIBRARIES}
diff --git a/llvm/lib/Support/KnownFPClass.cpp b/llvm/lib/Support/KnownFPClass.cpp
index d758e748149f6..1a25e3866c4fd 100644
--- a/llvm/lib/Support/KnownFPClass.cpp
+++ b/llvm/lib/Support/KnownFPClass.cpp
@@ -252,7 +252,7 @@ static KnownFPClass fadd_impl(const KnownFPClass &KnownLHS,
// This can't underflow if one of the operands is known normal.
if (KnownLHS.isKnownNever(fcZero | fcPosSubnormal) ||
KnownRHS.isKnownNever(fcZero | fcPosSubnormal))
- Known.knownNot(fcZero);
+ Known.knownNot(fcZero | fcPosSubnormal);
}
if (KnownLHS.cannotBeOrderedGreaterThanZero() &&
@@ -262,7 +262,7 @@ static KnownFPClass fadd_impl(const KnownFPClass &KnownLHS,
// This can't underflow if one of the operands is known normal.
if (KnownLHS.isKnownNever(fcZero | fcNegSubnormal) ||
KnownRHS.isKnownNever(fcZero | fcNegSubnormal))
- Known.knownNot(fcZero);
+ Known.knownNot(fcZero | fcNegSubnormal);
}
return Known;
diff --git a/llvm/lib/Support/TextEncoding.cpp b/llvm/lib/Support/TextEncoding.cpp
index 453af6f7287bc..d36f02c1300b9 100644
--- a/llvm/lib/Support/TextEncoding.cpp
+++ b/llvm/lib/Support/TextEncoding.cpp
@@ -20,7 +20,11 @@
#include <system_error>
#if HAVE_ICU
+#if HAVE_WINDOWS_ICU
+#include <icu.h>
+#else
#include <unicode/ucnv.h>
+#endif
#elif HAVE_ICONV
#include <iconv.h>
#endif
diff --git a/llvm/lib/Support/YAMLParser.cpp b/llvm/lib/Support/YAMLParser.cpp
index 6734877802caf..2ee104aa2be60 100644
--- a/llvm/lib/Support/YAMLParser.cpp
+++ b/llvm/lib/Support/YAMLParser.cpp
@@ -1840,14 +1840,14 @@ bool Scanner::fetchMoreTokens() {
if (Column == 0 && *Current == '%')
return scanDirective();
- if (Column == 0 && Current + 4 <= End
+ if (Column == 0 && Current + 3 <= End
&& *Current == '-'
&& *(Current + 1) == '-'
&& *(Current + 2) == '-'
&& (Current + 3 == End || isBlankOrBreak(Current + 3)))
return scanDocumentIndicator(true);
- if (Column == 0 && Current + 4 <= End
+ if (Column == 0 && Current + 3 <= End
&& *Current == '.'
&& *(Current + 1) == '.'
&& *(Current + 2) == '.'
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index 6aa602cb2f6f3..54c7b6c7ac7e8 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -76,7 +76,6 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -123,7 +122,10 @@ class AArch64ConditionOptimizer : public MachineFunctionPass {
MachineInstr *getBccTerminator(MachineBasicBlock *MBB);
MachineInstr *findAdjustableCmp(MachineInstr *CondMI);
CmpInfo getAdjustedCmpInfo(MachineInstr *CmpMI, AArch64CC::CondCode Cmp);
- void modifyCmp(MachineInstr *CmpMI, const CmpInfo &Info);
+ void updateCmpInstr(MachineInstr *CmpMI, int NewImm, unsigned NewOpc);
+ void updateCondInstr(MachineInstr *CondMI, AArch64CC::CondCode NewCC);
+ void applyCmpAdjustment(MachineInstr *CmpMI, MachineInstr *CondMI,
+ const CmpInfo &Info);
bool adjustTo(MachineInstr *CmpMI, AArch64CC::CondCode Cmp, MachineInstr *To,
int ToImm);
bool optimizeIntraBlock(MachineBasicBlock &MBB);
@@ -354,54 +356,63 @@ CmpInfo AArch64ConditionOptimizer::getAdjustedCmpInfo(MachineInstr *CmpMI,
return {NewImm, Opc, getAdjustedCmp(Cmp)};
}
-// Applies changes to comparison instruction suggested by getAdjustedCmpInfo().
-void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
- const CmpInfo &Info) {
- MachineBasicBlock *const MBB = CmpMI->getParent();
-
- // Change immediate in comparison instruction (ADDS or SUBS).
- BuildMI(*MBB, CmpMI, CmpMI->getDebugLoc(), TII->get(Info.Opc))
- .add(CmpMI->getOperand(0))
- .add(CmpMI->getOperand(1))
- .addImm(Info.Imm)
- .add(CmpMI->getOperand(3));
- CmpMI->eraseFromParent();
-
- // The fact that this comparison was picked ensures that it's related to the
- // first terminator instruction.
- MachineInstr &BrMI = *MBB->getFirstTerminator();
-
- // Change condition in branch instruction.
- BuildMI(*MBB, BrMI, BrMI.getDebugLoc(), TII->get(AArch64::Bcc))
- .addImm(Info.CC)
- .add(BrMI.getOperand(1));
- BrMI.eraseFromParent();
+// Modifies a comparison instruction's immediate and opcode.
+void AArch64ConditionOptimizer::updateCmpInstr(MachineInstr *CmpMI, int NewImm,
+ unsigned NewOpc) {
+ CmpMI->getOperand(2).setImm(NewImm);
+ CmpMI->setDesc(TII->get(NewOpc));
+}
+// Modifies the condition code of a conditional instruction.
+void AArch64ConditionOptimizer::updateCondInstr(MachineInstr *CondMI,
+ AArch64CC::CondCode NewCC) {
+ // Get the correct operand index for the conditional instruction
+ unsigned CondOpIdx;
+ switch (CondMI->getOpcode()) {
+ case AArch64::Bcc:
+ CondOpIdx = 0;
+ break;
+ case AArch64::CSINCWr:
+ case AArch64::CSINCXr:
+ CondOpIdx = 3;
+ break;
+ default:
+ llvm_unreachable("Unsupported conditional instruction");
+ }
+ CondMI->getOperand(CondOpIdx).setImm(NewCC);
++NumConditionsAdjusted;
}
-// Parse a condition code returned by analyzeBranch, and compute the CondCode
-// corresponding to TBB.
-// Returns true if parsing was successful, otherwise false is returned.
-static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) {
+// Applies a comparison adjustment to a cmp/cond instruction pair.
+void AArch64ConditionOptimizer::applyCmpAdjustment(MachineInstr *CmpMI,
+ MachineInstr *CondMI,
+ const CmpInfo &Info) {
+ updateCmpInstr(CmpMI, Info.Imm, Info.Opc);
+ updateCondInstr(CondMI, Info.CC);
+}
+
+// Extracts the condition code from the result of analyzeBranch.
+// Returns the CondCode or Invalid if the format is not a simple br.cond.
+static AArch64CC::CondCode parseCondCode(ArrayRef<MachineOperand> Cond) {
+ assert(!Cond.empty() && "Expected non-empty condition from analyzeBranch");
// A normal br.cond simply has the condition code.
if (Cond[0].getImm() != -1) {
assert(Cond.size() == 1 && "Unknown Cond array format");
- CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
- return true;
+ return (AArch64CC::CondCode)(int)Cond[0].getImm();
}
- return false;
+ return AArch64CC::CondCode::Invalid;
}
// Adjusts one cmp instruction to another one if result of adjustment will allow
// CSE. Returns true if compare instruction was changed, otherwise false is
// returned.
bool AArch64ConditionOptimizer::adjustTo(MachineInstr *CmpMI,
- AArch64CC::CondCode Cmp, MachineInstr *To, int ToImm)
-{
+ AArch64CC::CondCode Cmp,
+ MachineInstr *To, int ToImm) {
CmpInfo Info = getAdjustedCmpInfo(CmpMI, Cmp);
if (Info.Imm == ToImm && Info.Opc == To->getOpcode()) {
- modifyCmp(CmpMI, Info);
+ MachineInstr &BrMI = *CmpMI->getParent()->getFirstTerminator();
+ applyCmpAdjustment(CmpMI, &BrMI, Info);
return true;
}
return false;
@@ -530,9 +541,7 @@ bool AArch64ConditionOptimizer::optimizeIntraBlock(MachineBasicBlock &MBB) {
LLVM_DEBUG(dbgs() << "Successfully optimizing intra-block CSINC pair\n");
// Modify the selected CMP and CSINC
- CmpToAdjust->getOperand(2).setImm(Adj.Imm);
- CmpToAdjust->setDesc(TII->get(Adj.Opc));
- CSINCToAdjust->getOperand(3).setImm(Adj.CC);
+ applyCmpAdjustment(CmpToAdjust, CSINCToAdjust, Adj);
return true;
}
@@ -577,13 +586,10 @@ bool AArch64ConditionOptimizer::optimizeCrossBlock(MachineBasicBlock &HBB) {
if (!registersMatch(HeadCmpMI, TrueCmpMI))
return false;
- AArch64CC::CondCode HeadCmp;
- if (HeadCond.empty() || !parseCond(HeadCond, HeadCmp)) {
- return false;
- }
-
- AArch64CC::CondCode TrueCmp;
- if (TrueCond.empty() || !parseCond(TrueCond, TrueCmp)) {
+ AArch64CC::CondCode HeadCmp = parseCondCode(HeadCond);
+ AArch64CC::CondCode TrueCmp = parseCondCode(TrueCond);
+ if (HeadCmp == AArch64CC::CondCode::Invalid ||
+ TrueCmp == AArch64CC::CondCode::Invalid) {
return false;
}
@@ -628,8 +634,8 @@ bool AArch64ConditionOptimizer::optimizeCrossBlock(MachineBasicBlock &HBB) {
CmpInfo TrueCmpInfo = getAdjustedCmpInfo(TrueCmpMI, TrueCmp);
if (HeadCmpInfo.Imm == TrueCmpInfo.Imm &&
HeadCmpInfo.Opc == TrueCmpInfo.Opc) {
- modifyCmp(HeadCmpMI, HeadCmpInfo);
- modifyCmp(TrueCmpMI, TrueCmpInfo);
+ applyCmpAdjustment(HeadCmpMI, HeadBrMI, HeadCmpInfo);
+ applyCmpAdjustment(TrueCmpMI, TrueBrMI, TrueCmpInfo);
return true;
}
} else if (((isGreaterThan(HeadCmp) && isGreaterThan(TrueCmp)) ||
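
For readers unfamiliar with the pass, the adjustment it performs is the usual off-by-one rewrite of signed integer compares. A simplified sketch (hypothetical names, and ignoring the immediate-range and overflow checks the real pass does):

  enum Cond { GT, GE, LT, LE };

  struct CmpInfoSketch {
    int Imm;
    Cond CC;
  };

  // (x > Imm) == (x >= Imm + 1), (x >= Imm) == (x > Imm - 1),
  // (x < Imm) == (x <= Imm - 1), (x <= Imm) == (x < Imm + 1).
  // Nudging one compare by one like this can make it identical to a
  // neighbouring compare so the two can be CSE'd.
  static CmpInfoSketch adjustByOne(CmpInfoSketch C) {
    switch (C.CC) {
    case GT: return {C.Imm + 1, GE};
    case GE: return {C.Imm - 1, GT};
    case LT: return {C.Imm - 1, LE};
    case LE: return {C.Imm + 1, LT};
    }
    return C;
  }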
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 058cae92de45b..06a88ba3da8dd 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -209,6 +209,17 @@ class SSACCmpConv {
};
} // end anonymous namespace
+static Register lookThroughCopies(Register Reg, MachineRegisterInfo *MRI) {
+ MachineInstr *MI;
+ while ((MI = MRI->getUniqueVRegDef(Reg)) &&
+ MI->getOpcode() == TargetOpcode::COPY) {
+ if (MI->getOperand(1).getReg().isPhysical())
+ break;
+ Reg = MI->getOperand(1).getReg();
+ }
+ return Reg;
+}
+
// Check that all PHIs in Tail are selecting the same value from Head and CmpBB.
// This means that no if-conversion is required when merging CmpBB into Head.
bool SSACCmpConv::trivialTailPHIs() {
@@ -219,7 +230,7 @@ bool SSACCmpConv::trivialTailPHIs() {
// PHI operands come in (VReg, MBB) pairs.
for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) {
MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB();
- Register Reg = I.getOperand(oi).getReg();
+ Register Reg = lookThroughCopies(I.getOperand(oi).getReg(), MRI);
if (MBB == Head) {
assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands");
HeadReg = Reg;
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index aeb88f94b012f..4b3353e54797b 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1757,250 +1757,250 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
transferImpOps(MI, MIB, MIB);
MI.eraseFromParent();
return true;
- }
- case AArch64::IRGstack: {
- MachineFunction &MF = *MBB.getParent();
- const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- const AArch64FrameLowering *TFI =
- MF.getSubtarget<AArch64Subtarget>().getFrameLowering();
-
- // IRG does not allow immediate offset. getTaggedBasePointerOffset should
- // almost always point to SP-after-prologue; if not, emit a longer
- // instruction sequence.
- int BaseOffset = -AFI->getTaggedBasePointerOffset();
- Register FrameReg;
- StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference(
- MF, BaseOffset, false /*isFixed*/, TargetStackID::Default /*StackID*/,
- FrameReg,
- /*PreferFP=*/false,
- /*ForSimm=*/true);
- Register SrcReg = FrameReg;
- if (FrameRegOffset) {
- // Use output register as temporary.
- SrcReg = MI.getOperand(0).getReg();
- emitFrameOffset(MBB, &MI, MI.getDebugLoc(), SrcReg, FrameReg,
- FrameRegOffset, TII);
- }
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::IRG))
- .add(MI.getOperand(0))
- .addUse(SrcReg)
- .add(MI.getOperand(2));
- MI.eraseFromParent();
- return true;
- }
- case AArch64::TAGPstack: {
- int64_t Offset = MI.getOperand(2).getImm();
- BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Offset >= 0 ? AArch64::ADDG : AArch64::SUBG))
- .add(MI.getOperand(0))
- .add(MI.getOperand(1))
- .addImm(std::abs(Offset))
- .add(MI.getOperand(4));
- MI.eraseFromParent();
- return true;
- }
- case AArch64::STGloop_wback:
- case AArch64::STZGloop_wback:
- return expandSetTagLoop(MBB, MBBI, NextMBBI);
- case AArch64::STGloop:
- case AArch64::STZGloop:
- report_fatal_error(
- "Non-writeback variants of STGloop / STZGloop should not "
- "survive past PrologEpilogInserter.");
- case AArch64::STR_ZZZZXI:
- case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
- return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 4);
- case AArch64::STR_ZZZXI:
- return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3);
- case AArch64::STR_ZZXI:
- case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
- return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2);
- case AArch64::STR_PPXI:
- return expandSVESpillFill(MBB, MBBI, AArch64::STR_PXI, 2);
- case AArch64::LDR_ZZZZXI:
- case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
- return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4);
- case AArch64::LDR_ZZZXI:
- return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3);
- case AArch64::LDR_ZZXI:
- case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
- return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2);
- case AArch64::LDR_PPXI:
- return expandSVESpillFill(MBB, MBBI, AArch64::LDR_PXI, 2);
- case AArch64::BLR_RVMARKER:
- case AArch64::BLRA_RVMARKER:
- return expandCALL_RVMARKER(MBB, MBBI);
- case AArch64::BLR_BTI:
- return expandCALL_BTI(MBB, MBBI);
- case AArch64::StoreSwiftAsyncContext:
- return expandStoreSwiftAsyncContext(MBB, MBBI);
- case AArch64::STSHH_ATOMIC_STORE_SZ:
- return expandSTSHHAtomicStore(MBB, MBBI);
- case AArch64::RestoreZAPseudo:
- case AArch64::CommitZASavePseudo:
- case AArch64::MSRpstatePseudo: {
- auto *NewMBB = [&] {
- switch (Opcode) {
- case AArch64::RestoreZAPseudo:
- return expandRestoreZASave(MBB, MBBI);
- case AArch64::CommitZASavePseudo:
- return expandCommitZASave(MBB, MBBI);
- case AArch64::MSRpstatePseudo:
- return expandCondSMToggle(MBB, MBBI);
- default:
- llvm_unreachable("Unexpected conditional pseudo!");
- }
- }();
- if (NewMBB != &MBB)
- NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated.
- return true;
- }
- case AArch64::InOutZAUsePseudo:
- case AArch64::RequiresZASavePseudo:
- case AArch64::RequiresZT0SavePseudo:
- case AArch64::SMEStateAllocPseudo:
- case AArch64::COALESCER_BARRIER_FPR16:
- case AArch64::COALESCER_BARRIER_FPR32:
- case AArch64::COALESCER_BARRIER_FPR64:
- case AArch64::COALESCER_BARRIER_FPR128:
- MI.eraseFromParent();
- return true;
- case AArch64::LD1B_2Z_IMM_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
- AArch64::LD1B_2Z_IMM, AArch64::LD1B_2Z_STRIDED_IMM);
- case AArch64::LD1H_2Z_IMM_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
- AArch64::LD1H_2Z_IMM, AArch64::LD1H_2Z_STRIDED_IMM);
- case AArch64::LD1W_2Z_IMM_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
- AArch64::LD1W_2Z_IMM, AArch64::LD1W_2Z_STRIDED_IMM);
- case AArch64::LD1D_2Z_IMM_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
- AArch64::LD1D_2Z_IMM, AArch64::LD1D_2Z_STRIDED_IMM);
- case AArch64::LDNT1B_2Z_IMM_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
- AArch64::LDNT1B_2Z_IMM, AArch64::LDNT1B_2Z_STRIDED_IMM);
- case AArch64::LDNT1H_2Z_IMM_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
- AArch64::LDNT1H_2Z_IMM, AArch64::LDNT1H_2Z_STRIDED_IMM);
- case AArch64::LDNT1W_2Z_IMM_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
- AArch64::LDNT1W_2Z_IMM, AArch64::LDNT1W_2Z_STRIDED_IMM);
- case AArch64::LDNT1D_2Z_IMM_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
- AArch64::LDNT1D_2Z_IMM, AArch64::LDNT1D_2Z_STRIDED_IMM);
- case AArch64::LD1B_2Z_PSEUDO:
- return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR2RegClass,
- AArch64::ZPR2StridedRegClass, AArch64::LD1B_2Z,
- AArch64::LD1B_2Z_STRIDED);
- case AArch64::LD1H_2Z_PSEUDO:
- return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR2RegClass,
- AArch64::ZPR2StridedRegClass, AArch64::LD1H_2Z,
- AArch64::LD1H_2Z_STRIDED);
- case AArch64::LD1W_2Z_PSEUDO:
- return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR2RegClass,
- AArch64::ZPR2StridedRegClass, AArch64::LD1W_2Z,
- AArch64::LD1W_2Z_STRIDED);
- case AArch64::LD1D_2Z_PSEUDO:
- return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR2RegClass,
- AArch64::ZPR2StridedRegClass, AArch64::LD1D_2Z,
- AArch64::LD1D_2Z_STRIDED);
- case AArch64::LDNT1B_2Z_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
- AArch64::LDNT1B_2Z, AArch64::LDNT1B_2Z_STRIDED);
- case AArch64::LDNT1H_2Z_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
- AArch64::LDNT1H_2Z, AArch64::LDNT1H_2Z_STRIDED);
- case AArch64::LDNT1W_2Z_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
- AArch64::LDNT1W_2Z, AArch64::LDNT1W_2Z_STRIDED);
- case AArch64::LDNT1D_2Z_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
- AArch64::LDNT1D_2Z, AArch64::LDNT1D_2Z_STRIDED);
- case AArch64::LD1B_4Z_IMM_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
- AArch64::LD1B_4Z_IMM, AArch64::LD1B_4Z_STRIDED_IMM);
- case AArch64::LD1H_4Z_IMM_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
- AArch64::LD1H_4Z_IMM, AArch64::LD1H_4Z_STRIDED_IMM);
- case AArch64::LD1W_4Z_IMM_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
- AArch64::LD1W_4Z_IMM, AArch64::LD1W_4Z_STRIDED_IMM);
- case AArch64::LD1D_4Z_IMM_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
- AArch64::LD1D_4Z_IMM, AArch64::LD1D_4Z_STRIDED_IMM);
- case AArch64::LDNT1B_4Z_IMM_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
- AArch64::LDNT1B_4Z_IMM, AArch64::LDNT1B_4Z_STRIDED_IMM);
- case AArch64::LDNT1H_4Z_IMM_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
- AArch64::LDNT1H_4Z_IMM, AArch64::LDNT1H_4Z_STRIDED_IMM);
- case AArch64::LDNT1W_4Z_IMM_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
- AArch64::LDNT1W_4Z_IMM, AArch64::LDNT1W_4Z_STRIDED_IMM);
- case AArch64::LDNT1D_4Z_IMM_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
- AArch64::LDNT1D_4Z_IMM, AArch64::LDNT1D_4Z_STRIDED_IMM);
- case AArch64::LD1B_4Z_PSEUDO:
- return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR4RegClass,
- AArch64::ZPR4StridedRegClass, AArch64::LD1B_4Z,
- AArch64::LD1B_4Z_STRIDED);
- case AArch64::LD1H_4Z_PSEUDO:
- return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR4RegClass,
- AArch64::ZPR4StridedRegClass, AArch64::LD1H_4Z,
- AArch64::LD1H_4Z_STRIDED);
- case AArch64::LD1W_4Z_PSEUDO:
- return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR4RegClass,
- AArch64::ZPR4StridedRegClass, AArch64::LD1W_4Z,
- AArch64::LD1W_4Z_STRIDED);
- case AArch64::LD1D_4Z_PSEUDO:
- return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR4RegClass,
- AArch64::ZPR4StridedRegClass, AArch64::LD1D_4Z,
- AArch64::LD1D_4Z_STRIDED);
- case AArch64::LDNT1B_4Z_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
- AArch64::LDNT1B_4Z, AArch64::LDNT1B_4Z_STRIDED);
- case AArch64::LDNT1H_4Z_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
- AArch64::LDNT1H_4Z, AArch64::LDNT1H_4Z_STRIDED);
- case AArch64::LDNT1W_4Z_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
- AArch64::LDNT1W_4Z, AArch64::LDNT1W_4Z_STRIDED);
- case AArch64::LDNT1D_4Z_PSEUDO:
- return expandMultiVecPseudo(
- MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
- AArch64::LDNT1D_4Z, AArch64::LDNT1D_4Z_STRIDED);
- case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO:
- return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 2);
- case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO:
- return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 4);
- case AArch64::EON_ZZZ:
- case AArch64::NAND_ZZZ:
- case AArch64::NOR_ZZZ:
- return expandSVEBitwisePseudo(MI, MBB, MBBI);
+ }
+ case AArch64::IRGstack: {
+ MachineFunction &MF = *MBB.getParent();
+ const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+ const AArch64FrameLowering *TFI =
+ MF.getSubtarget<AArch64Subtarget>().getFrameLowering();
+
+ // IRG does not allow immediate offset. getTaggedBasePointerOffset should
+ // almost always point to SP-after-prologue; if not, emit a longer
+ // instruction sequence.
+ int BaseOffset = -AFI->getTaggedBasePointerOffset();
+ Register FrameReg;
+ StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference(
+ MF, BaseOffset, false /*isFixed*/, TargetStackID::Default /*StackID*/,
+ FrameReg,
+ /*PreferFP=*/false,
+ /*ForSimm=*/true);
+ Register SrcReg = FrameReg;
+ if (FrameRegOffset) {
+ // Use output register as temporary.
+ SrcReg = MI.getOperand(0).getReg();
+ emitFrameOffset(MBB, &MI, MI.getDebugLoc(), SrcReg, FrameReg,
+ FrameRegOffset, TII);
+ }
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::IRG))
+ .add(MI.getOperand(0))
+ .addUse(SrcReg)
+ .add(MI.getOperand(2));
+ MI.eraseFromParent();
+ return true;
+ }
+ case AArch64::TAGPstack: {
+ int64_t Offset = MI.getOperand(2).getImm();
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Offset >= 0 ? AArch64::ADDG : AArch64::SUBG))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .addImm(std::abs(Offset))
+ .add(MI.getOperand(4));
+ MI.eraseFromParent();
+ return true;
+ }
+ case AArch64::STGloop_wback:
+ case AArch64::STZGloop_wback:
+ return expandSetTagLoop(MBB, MBBI, NextMBBI);
+ case AArch64::STGloop:
+ case AArch64::STZGloop:
+ report_fatal_error(
+ "Non-writeback variants of STGloop / STZGloop should not "
+ "survive past PrologEpilogInserter.");
+ case AArch64::STR_ZZZZXI:
+ case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS:
+ return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 4);
+ case AArch64::STR_ZZZXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3);
+ case AArch64::STR_ZZXI:
+ case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS:
+ return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2);
+ case AArch64::STR_PPXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::STR_PXI, 2);
+ case AArch64::LDR_ZZZZXI:
+ case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS:
+ return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4);
+ case AArch64::LDR_ZZZXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3);
+ case AArch64::LDR_ZZXI:
+ case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS:
+ return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2);
+ case AArch64::LDR_PPXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::LDR_PXI, 2);
+ case AArch64::BLR_RVMARKER:
+ case AArch64::BLRA_RVMARKER:
+ return expandCALL_RVMARKER(MBB, MBBI);
+ case AArch64::BLR_BTI:
+ return expandCALL_BTI(MBB, MBBI);
+ case AArch64::StoreSwiftAsyncContext:
+ return expandStoreSwiftAsyncContext(MBB, MBBI);
+ case AArch64::STSHH_ATOMIC_STORE_SZ:
+ return expandSTSHHAtomicStore(MBB, MBBI);
+ case AArch64::RestoreZAPseudo:
+ case AArch64::CommitZASavePseudo:
+ case AArch64::MSRpstatePseudo: {
+ auto *NewMBB = [&] {
+ switch (Opcode) {
+ case AArch64::RestoreZAPseudo:
+ return expandRestoreZASave(MBB, MBBI);
+ case AArch64::CommitZASavePseudo:
+ return expandCommitZASave(MBB, MBBI);
+ case AArch64::MSRpstatePseudo:
+ return expandCondSMToggle(MBB, MBBI);
+ default:
+ llvm_unreachable("Unexpected conditional pseudo!");
+ }
+ }();
+ if (NewMBB != &MBB)
+ NextMBBI = MBB.end(); // The NextMBBI iterator is invalidated.
+ return true;
+ }
+ case AArch64::InOutZAUsePseudo:
+ case AArch64::RequiresZASavePseudo:
+ case AArch64::RequiresZT0SavePseudo:
+ case AArch64::SMEStateAllocPseudo:
+ case AArch64::COALESCER_BARRIER_FPR16:
+ case AArch64::COALESCER_BARRIER_FPR32:
+ case AArch64::COALESCER_BARRIER_FPR64:
+ case AArch64::COALESCER_BARRIER_FPR128:
+ MI.eraseFromParent();
+ return true;
+ case AArch64::LD1B_2Z_IMM_PSEUDO:
+ return expandMultiVecPseudo(
+ MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
+ AArch64::LD1B_2Z_IMM, AArch64::LD1B_2Z_STRIDED_IMM);
+ case AArch64::LD1H_2Z_IMM_PSEUDO:
+ return expandMultiVecPseudo(
+ MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
+ AArch64::LD1H_2Z_IMM, AArch64::LD1H_2Z_STRIDED_IMM);
+ case AArch64::LD1W_2Z_IMM_PSEUDO:
+ return expandMultiVecPseudo(
+ MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
+ AArch64::LD1W_2Z_IMM, AArch64::LD1W_2Z_STRIDED_IMM);
+ case AArch64::LD1D_2Z_IMM_PSEUDO:
+ return expandMultiVecPseudo(
+ MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
+ AArch64::LD1D_2Z_IMM, AArch64::LD1D_2Z_STRIDED_IMM);
+ case AArch64::LDNT1B_2Z_IMM_PSEUDO:
+ return expandMultiVecPseudo(
+ MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
+ AArch64::LDNT1B_2Z_IMM, AArch64::LDNT1B_2Z_STRIDED_IMM);
+ case AArch64::LDNT1H_2Z_IMM_PSEUDO:
+ return expandMultiVecPseudo(
+ MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
+ AArch64::LDNT1H_2Z_IMM, AArch64::LDNT1H_2Z_STRIDED_IMM);
+ case AArch64::LDNT1W_2Z_IMM_PSEUDO:
+ return expandMultiVecPseudo(
+ MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
+ AArch64::LDNT1W_2Z_IMM, AArch64::LDNT1W_2Z_STRIDED_IMM);
+ case AArch64::LDNT1D_2Z_IMM_PSEUDO:
+ return expandMultiVecPseudo(
+ MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass,
+ AArch64::LDNT1D_2Z_IMM, AArch64::LDNT1D_2Z_STRIDED_IMM);
+ case AArch64::LD1B_2Z_PSEUDO:
+ return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR2RegClass,
+ AArch64::ZPR2StridedRegClass, AArch64::LD1B_2Z,
+ AArch64::LD1B_2Z_STRIDED);
+ case AArch64::LD1H_2Z_PSEUDO:
+ return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR2RegClass,
+ AArch64::ZPR2StridedRegClass, AArch64::LD1H_2Z,
+ AArch64::LD1H_2Z_STRIDED);
+ case AArch64::LD1W_2Z_PSEUDO:
+ return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR2RegClass,
+ AArch64::ZPR2StridedRegClass, AArch64::LD1W_2Z,
+ AArch64::LD1W_2Z_STRIDED);
+ case AArch64::LD1D_2Z_PSEUDO:
+ return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR2RegClass,
+ AArch64::ZPR2StridedRegClass, AArch64::LD1D_2Z,
+ AArch64::LD1D_2Z_STRIDED);
+ case AArch64::LDNT1B_2Z_PSEUDO:
+ return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR2RegClass,
+ AArch64::ZPR2StridedRegClass,
+ AArch64::LDNT1B_2Z, AArch64::LDNT1B_2Z_STRIDED);
+ case AArch64::LDNT1H_2Z_PSEUDO:
+ return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR2RegClass,
+ AArch64::ZPR2StridedRegClass,
+ AArch64::LDNT1H_2Z, AArch64::LDNT1H_2Z_STRIDED);
+ case AArch64::LDNT1W_2Z_PSEUDO:
+ return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR2RegClass,
+ AArch64::ZPR2StridedRegClass,
+ AArch64::LDNT1W_2Z, AArch64::LDNT1W_2Z_STRIDED);
+ case AArch64::LDNT1D_2Z_PSEUDO:
+ return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR2RegClass,
+ AArch64::ZPR2StridedRegClass,
+ AArch64::LDNT1D_2Z, AArch64::LDNT1D_2Z_STRIDED);
+ case AArch64::LD1B_4Z_IMM_PSEUDO:
+ return expandMultiVecPseudo(
+ MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
+ AArch64::LD1B_4Z_IMM, AArch64::LD1B_4Z_STRIDED_IMM);
+ case AArch64::LD1H_4Z_IMM_PSEUDO:
+ return expandMultiVecPseudo(
+ MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
+ AArch64::LD1H_4Z_IMM, AArch64::LD1H_4Z_STRIDED_IMM);
+ case AArch64::LD1W_4Z_IMM_PSEUDO:
+ return expandMultiVecPseudo(
+ MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
+ AArch64::LD1W_4Z_IMM, AArch64::LD1W_4Z_STRIDED_IMM);
+ case AArch64::LD1D_4Z_IMM_PSEUDO:
+ return expandMultiVecPseudo(
+ MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
+ AArch64::LD1D_4Z_IMM, AArch64::LD1D_4Z_STRIDED_IMM);
+ case AArch64::LDNT1B_4Z_IMM_PSEUDO:
+ return expandMultiVecPseudo(
+ MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
+ AArch64::LDNT1B_4Z_IMM, AArch64::LDNT1B_4Z_STRIDED_IMM);
+ case AArch64::LDNT1H_4Z_IMM_PSEUDO:
+ return expandMultiVecPseudo(
+ MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
+ AArch64::LDNT1H_4Z_IMM, AArch64::LDNT1H_4Z_STRIDED_IMM);
+ case AArch64::LDNT1W_4Z_IMM_PSEUDO:
+ return expandMultiVecPseudo(
+ MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
+ AArch64::LDNT1W_4Z_IMM, AArch64::LDNT1W_4Z_STRIDED_IMM);
+ case AArch64::LDNT1D_4Z_IMM_PSEUDO:
+ return expandMultiVecPseudo(
+ MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass,
+ AArch64::LDNT1D_4Z_IMM, AArch64::LDNT1D_4Z_STRIDED_IMM);
+ case AArch64::LD1B_4Z_PSEUDO:
+ return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR4RegClass,
+ AArch64::ZPR4StridedRegClass, AArch64::LD1B_4Z,
+ AArch64::LD1B_4Z_STRIDED);
+ case AArch64::LD1H_4Z_PSEUDO:
+ return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR4RegClass,
+ AArch64::ZPR4StridedRegClass, AArch64::LD1H_4Z,
+ AArch64::LD1H_4Z_STRIDED);
+ case AArch64::LD1W_4Z_PSEUDO:
+ return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR4RegClass,
+ AArch64::ZPR4StridedRegClass, AArch64::LD1W_4Z,
+ AArch64::LD1W_4Z_STRIDED);
+ case AArch64::LD1D_4Z_PSEUDO:
+ return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR4RegClass,
+ AArch64::ZPR4StridedRegClass, AArch64::LD1D_4Z,
+ AArch64::LD1D_4Z_STRIDED);
+ case AArch64::LDNT1B_4Z_PSEUDO:
+ return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR4RegClass,
+ AArch64::ZPR4StridedRegClass,
+ AArch64::LDNT1B_4Z, AArch64::LDNT1B_4Z_STRIDED);
+ case AArch64::LDNT1H_4Z_PSEUDO:
+ return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR4RegClass,
+ AArch64::ZPR4StridedRegClass,
+ AArch64::LDNT1H_4Z, AArch64::LDNT1H_4Z_STRIDED);
+ case AArch64::LDNT1W_4Z_PSEUDO:
+ return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR4RegClass,
+ AArch64::ZPR4StridedRegClass,
+ AArch64::LDNT1W_4Z, AArch64::LDNT1W_4Z_STRIDED);
+ case AArch64::LDNT1D_4Z_PSEUDO:
+ return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR4RegClass,
+ AArch64::ZPR4StridedRegClass,
+ AArch64::LDNT1D_4Z, AArch64::LDNT1D_4Z_STRIDED);
+ case AArch64::FORM_TRANSPOSED_REG_TUPLE_X2_PSEUDO:
+ return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 2);
+ case AArch64::FORM_TRANSPOSED_REG_TUPLE_X4_PSEUDO:
+ return expandFormTuplePseudo(MBB, MBBI, NextMBBI, 4);
+ case AArch64::EON_ZZZ:
+ case AArch64::NAND_ZZZ:
+ case AArch64::NOR_ZZZ:
+ return expandSVEBitwisePseudo(MI, MBB, MBBI);
}
return false;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8deb0a33f27d1..cd5a42da82082 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1568,6 +1568,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
}
for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
+ setOperationAction({ISD::CTTZ_ELTS, ISD::CTTZ_ELTS_ZERO_POISON}, VT,
+ Custom);
setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Legal);
setOperationAction(ISD::GET_ACTIVE_LANE_MASK, VT, Legal);
}
@@ -2004,7 +2006,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VSCALE, MVT::i32, Custom);
for (auto VT : {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1})
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, VT, Custom);
+ setOperationAction(
+ {ISD::INTRINSIC_WO_CHAIN, ISD::CTTZ_ELTS, ISD::CTTZ_ELTS_ZERO_POISON},
+ VT, Custom);
}
// Handle partial reduction operations
@@ -7018,24 +7022,6 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
ADDLV, DAG.getConstant(0, DL, MVT::i64));
return EXTRACT_VEC_ELT;
}
- case Intrinsic::experimental_cttz_elts: {
- SDValue CttzOp = Op.getOperand(1);
- EVT VT = CttzOp.getValueType();
- assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
-
- if (VT.isFixedLengthVector()) {
- // We can use SVE instructions to lower this intrinsic by first creating
- // an SVE predicate register mask from the fixed-width vector.
- VT = getTypeToTransformTo(*DAG.getContext(), VT);
- SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, CttzOp);
- CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
- }
-
- SDValue Pg = getPredicateForVector(DAG, DL, VT);
- SDValue NewCttzElts =
- DAG.getNode(AArch64ISD::CTTZ_ELTS, DL, MVT::i64, Pg, CttzOp);
- return DAG.getZExtOrTrunc(NewCttzElts, DL, Op.getValueType());
- }
case Intrinsic::experimental_vector_match: {
return LowerVectorMatch(Op, DAG);
}
@@ -8498,6 +8484,26 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerCLMUL(Op, DAG);
case ISD::FCANONICALIZE:
return LowerFCANONICALIZE(Op, DAG);
+ case ISD::CTTZ_ELTS:
+ case ISD::CTTZ_ELTS_ZERO_POISON: {
+ SDLoc DL(Op);
+ SDValue CttzOp = Op.getOperand(0);
+ EVT VT = CttzOp.getValueType();
+ assert(VT.getVectorElementType() == MVT::i1 && "Expected MVT::i1");
+
+ if (VT.isFixedLengthVector()) {
+ // We can use SVE instructions to lower this intrinsic by first creating
+ // an SVE predicate register mask from the fixed-width vector.
+ VT = getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Mask = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, CttzOp);
+ CttzOp = convertFixedMaskToScalableVector(Mask, DAG);
+ }
+
+ SDValue Pg = getPredicateForVector(DAG, DL, VT);
+ SDValue NewCttzElts =
+ DAG.getNode(AArch64ISD::CTTZ_ELTS, DL, MVT::i64, Pg, CttzOp);
+ return DAG.getZExtOrTrunc(NewCttzElts, DL, Op.getValueType());
+ }
}
}
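
A scalar reference for what the lowered node computes (a sketch of the semantics, not the ISel code): the number of leading inactive lanes of an i1 mask, i.e. the index of the first active lane.

  static unsigned cttzEltsRef(const bool *Mask, unsigned NumElts) {
    for (unsigned I = 0; I != NumElts; ++I)
      if (Mask[I])
        return I;
    // No lane set: the plain form returns the element count; the ZERO_POISON
    // form is allowed to treat this case as poison.
    return NumElts;
  }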
@@ -21998,11 +22004,16 @@ performInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
EVT VecVT = Vec.getValueType();
EVT SubVT = SubVec.getValueType();
- // Promote fixed length vector zeros.
+ // Promote fixed length vector constants.
if (VecVT.isScalableVector() && SubVT.isFixedLengthVector() &&
- Vec.isUndef() && isZerosVector(SubVec.getNode()))
- return VecVT.isInteger() ? DAG.getConstant(0, DL, VecVT)
- : DAG.getConstantFP(0, DL, VecVT);
+ Vec.isUndef()) {
+ SDValue SplatVal = DAG.getSplatValue(SubVec);
+ if (auto C = dyn_cast_or_null<ConstantSDNode>(SplatVal))
+ return DAG.getConstant(C->getAPIntValue(), DL, VecVT);
+
+ if (auto C = dyn_cast_or_null<ConstantFPSDNode>(SplatVal))
+ return DAG.getConstantFP(C->getValueAPF(), DL, VecVT);
+ }
// Only do this for legal fixed vector types.
if (!VecVT.isFixedLengthVector() ||
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 926593022b537..fe6b5b3d0e51f 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4771,6 +4771,11 @@ let Predicates = [HasSVE2p3_or_SME2p3] in {
def SDOT_ZZZ_BtoH : sve_intx_dot<0b01, 0b00000, 0b0, "sdot", ZPR16, ZPR8>;
def UDOT_ZZZ_BtoH : sve_intx_dot<0b01, 0b00000, 0b1, "udot", ZPR16, ZPR8>;
+ def : Pat<(nxv8i16 (partial_reduce_umla nxv8i16:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)),
+ (UDOT_ZZZ_BtoH $Acc, $MulLHS, $MulRHS)>;
+ def : Pat<(nxv8i16 (partial_reduce_smla nxv8i16:$Acc, nxv16i8:$MulLHS, nxv16i8:$MulRHS)),
+ (SDOT_ZZZ_BtoH $Acc, $MulLHS, $MulRHS)>;
+
// SVE2 integer indexed dot product
def SDOT_ZZZI_BtoH : sve_intx_dot_by_indexed_elem_x<0b0, "sdot">;
def UDOT_ZZZI_BtoH : sve_intx_dot_by_indexed_elem_x<0b1, "udot">;
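
A hedged scalar reference for the BtoH patterns above, assuming each 16-bit accumulator lane sums the two adjacent 8-bit products that feed it (nxv8i16 accumulator, nxv16i8 inputs), with modular 16-bit accumulation as for a multiply-accumulate:

  #include <cstdint>

  // NumLanes is the number of 16-bit lanes; A and B hold 2 * NumLanes bytes.
  static void udotBtoHRef(uint16_t *Acc, const uint8_t *A, const uint8_t *B,
                          unsigned NumLanes) {
    for (unsigned I = 0; I != NumLanes; ++I)
      Acc[I] += static_cast<uint16_t>(A[2 * I] * B[2 * I] +
                                      A[2 * I + 1] * B[2 * I + 1]);
  }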
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 080613e17b90f..50671e822b6b2 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -941,7 +941,7 @@ void AArch64PassConfig::addPostBBSections() {
void AArch64PassConfig::addPreEmitPass2() {
// SVE bundles move prefixes with destructive operations. BLR_RVMARKER pseudo
// instructions are lowered to bundles as well.
- addPass(createUnpackMachineBundles(nullptr));
+ addPass(createUnpackMachineBundlesLegacy(nullptr));
}
bool AArch64PassConfig::addRegAssignAndRewriteOptimized() {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 8cae8a556cabc..107ab9fdf3f9c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4311,8 +4311,8 @@ std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
return std::nullopt;
if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
return std::nullopt;
- if (CanUseSVE && Ty->isScalableTy() && ST->hasSVEB16B16() &&
- ST->isNonStreamingSVEorSME2Available())
+ // If we have +sve-b16b16, the operation can be promoted to SVE.
+ if (CanUseSVE && ST->hasSVEB16B16() && ST->isNonStreamingSVEorSME2Available())
return std::nullopt;
Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 98c0909b44ea1..47d12af3e1bdb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -298,9 +298,8 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
};
} // anonymous namespace
-AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
- : CallLowering(&TLI) {
-}
+AMDGPUCallLowering::AMDGPUCallLowering(const TargetLowering &TLI)
+ : CallLowering(&TLI) {}
// FIXME: Compatibility shim
static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index e0033d59d10bb..239300c1469b9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -18,7 +18,6 @@
namespace llvm {
-class AMDGPUTargetLowering;
class GCNSubtarget;
class MachineInstrBuilder;
class SIMachineFunctionInfo;
@@ -41,7 +40,7 @@ class AMDGPUCallLowering final : public CallLowering {
MachineInstrBuilder &Ret) const;
public:
- AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
+ AMDGPUCallLowering(const TargetLowering &TLI);
bool lowerReturn(MachineIRBuilder &B, const Value *Val,
ArrayRef<Register> VRegs,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
index b72bfaa3e7d10..714958c79f609 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -177,9 +177,9 @@ void DivergenceLoweringHelper::buildMergeLaneMasks(
Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
B.setInsertPt(MBB, I);
- B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg});
- B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy});
- B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
+ B.buildInstr(LMC->AndN2Opc, {PrevMaskedReg}, {PrevRegCopy, LMC->ExecReg});
+ B.buildInstr(LMC->AndOpc, {CurMaskedReg}, {LMC->ExecReg, CurRegCopy});
+ B.buildInstr(LMC->OrOpc, {DstReg}, {PrevMaskedReg, CurMaskedReg});
}
// GlobalISel has to constrain S1 incoming taken as-is with lane mask register
@@ -222,7 +222,7 @@ bool DivergenceLoweringHelper::lowerTemporalDivergence() {
Register VgprReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
B.buildInstr(AMDGPU::COPY, {VgprReg}, {Reg})
- .addUse(ExecReg, RegState::Implicit);
+ .addUse(LMC->ExecReg, RegState::Implicit);
replaceUsesOfRegInInstWith(Reg, UseInst, VgprReg);
TDCache[Reg] = VgprReg;
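
A scalar model of the merge that buildMergeLaneMasks now emits with the LMC opcodes: lanes active in EXEC take the newly computed value, inactive lanes keep the previously merged one.

  #include <cstdint>

  // dst = (prev & ~exec) | (cur & exec)
  static uint64_t mergeLaneMasks(uint64_t Prev, uint64_t Cur, uint64_t Exec) {
    return (Prev & ~Exec) | (Cur & Exec);
  }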
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 492c9873a3692..a5f53ce2f15a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -57,6 +57,8 @@ enum ImplicitArgOffsets {
HIDDEN_REMAINDER_X = 18,
HIDDEN_REMAINDER_Y = 20,
HIDDEN_REMAINDER_Z = 22,
+
+ GRID_DIMS = 64
};
class AMDGPULowerKernelAttributes : public ModulePass {
@@ -116,6 +118,45 @@ static bool annotateGroupSizeLoadWithRangeMD(LoadInst *Load, bool IsRemainder) {
return true;
}
+static bool annotateGridDimsLoadWithRangeMD(LoadInst *Load,
+ unsigned KnownNumGridDims) {
+ IntegerType *Ty = dyn_cast<IntegerType>(Load->getType());
+ if (!Ty || Ty->getBitWidth() < 3)
+ return false;
+
+ if (KnownNumGridDims != 0) {
+ Load->replaceAllUsesWith(
+ ConstantInt::get(Load->getType(), KnownNumGridDims));
+ return true;
+ }
+
+ // TODO: If there is existing range metadata, preserve it if it is stricter.
+ if (Load->hasMetadata(LLVMContext::MD_range))
+ return false;
+
+ MDBuilder MDB(Load->getContext());
+ MDNode *Range =
+ MDB.createRange(APInt(Ty->getBitWidth(), 1), APInt(Ty->getBitWidth(), 4));
+ Load->setMetadata(LLVMContext::MD_range, Range);
+ return true;
+}
+
+/// Compute the number of grid dimensions based on !reqd_work_group_size
+/// metadata
+static unsigned computeNumGridDims(const MDNode *ReqdWorkGroupSize) {
+ ConstantInt *KnownZ =
+ mdconst::extract<ConstantInt>(ReqdWorkGroupSize->getOperand(2));
+ if (KnownZ->getZExtValue() != 1)
+ return 3;
+
+ ConstantInt *KnownY =
+ mdconst::extract<ConstantInt>(ReqdWorkGroupSize->getOperand(1));
+ if (KnownY->getZExtValue() != 1)
+ return 2;
+
+ return 1;
+}
+
static bool processUse(CallInst *CI, bool IsV5OrAbove) {
Function *F = CI->getFunction();
@@ -137,6 +178,8 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
const DataLayout &DL = F->getDataLayout();
bool MadeChange = false;
+ unsigned KnownNumGridDims = HasReqdWorkGroupSize ? computeNumGridDims(MD) : 0;
+
// We expect to see several GEP users, casted to the appropriate type and
// loaded.
for (User *U : CI->users()) {
@@ -224,6 +267,11 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
MadeChange |= annotateGroupSizeLoadWithRangeMD(Load, true);
}
break;
+
+ case GRID_DIMS:
+ if (LoadSize <= 2)
+ MadeChange |= annotateGridDimsLoadWithRangeMD(Load, KnownNumGridDims);
+ break;
default:
break;
}
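
An illustrative restatement of computeNumGridDims with sample values (hypothetical helper, not the pass's API): the grid needs as many dimensions as the highest work-group axis whose required size is not 1.

  static unsigned numGridDims(unsigned /*X*/, unsigned Y, unsigned Z) {
    if (Z != 1)
      return 3;
    if (Y != 1)
      return 2;
    return 1;
  }
  // e.g. numGridDims(256, 1, 1) == 1, numGridDims(64, 4, 1) == 2,
  //      numGridDims(8, 8, 2) == 3; without !reqd_work_group_size the pass
  //      falls back to the [1, 4) range metadata instead.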
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
index 3504c52884819..55a1c58f2bc30 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp
@@ -832,6 +832,64 @@ bool RegBankLegalizeHelper::lowerSplitTo32SExtInReg(MachineInstr &MI) {
return true;
}
+bool RegBankLegalizeHelper::lowerSplitBitCount64To32(MachineInstr &MI) {
+ // Split 64-bit find-first-bit operations into 32-bit halves:
+ // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
+ // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
+ // (ctlz_zero_undef hi:lo) -> umin(ffbh(hi), add(ffbh(lo), 32))
+ // (cttz_zero_undef hi:lo) -> umin(ffbl(lo), add(ffbl(hi), 32))
+ unsigned Opc = MI.getOpcode();
+
+ // FFBH/FFBL return 0xFFFFFFFF on zero input, so use uaddsat to avoid
+ // wrapping. CTLZ/CTTZ guarantee a non-zero input (zero_undef), so a plain
+ // add is fine.
+ unsigned FFBOpc;
+ unsigned AddOpc;
+ bool SearchFromMSB;
+ switch (Opc) {
+ case AMDGPU::G_AMDGPU_FFBH_U32:
+ FFBOpc = Opc;
+ AddOpc = AMDGPU::G_UADDSAT;
+ SearchFromMSB = true;
+ break;
+ case AMDGPU::G_AMDGPU_FFBL_B32:
+ FFBOpc = Opc;
+ AddOpc = AMDGPU::G_UADDSAT;
+ SearchFromMSB = false;
+ break;
+ case AMDGPU::G_CTLZ_ZERO_UNDEF:
+ FFBOpc = AMDGPU::G_AMDGPU_FFBH_U32;
+ AddOpc = AMDGPU::G_ADD;
+ SearchFromMSB = true;
+ break;
+ case AMDGPU::G_CTTZ_ZERO_UNDEF:
+ FFBOpc = AMDGPU::G_AMDGPU_FFBL_B32;
+ AddOpc = AMDGPU::G_ADD;
+ SearchFromMSB = false;
+ break;
+ default:
+ llvm_unreachable("unexpected opcode in lowerSplitBitCount64To32");
+ }
+
+ auto Unmerge = B.buildUnmerge(VgprRB_S32, MI.getOperand(1).getReg());
+ Register Lo = Unmerge.getReg(0);
+ Register Hi = Unmerge.getReg(1);
+
+ // MSB-first (FFBH/CTLZ) searches hi first; LSB-first (FFBL/CTTZ) searches
+ // lo first. The secondary half adds 32 to account for the primary half's
+ // width.
+ auto Primary = B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Hi : Lo});
+ auto Secondary =
+ B.buildInstr(FFBOpc, {VgprRB_S32}, {SearchFromMSB ? Lo : Hi});
+
+ auto Adjusted = B.buildInstr(AddOpc, {VgprRB_S32},
+ {Secondary, B.buildConstant(VgprRB_S32, 32)});
+ B.buildUMin(MI.getOperand(0).getReg(), Primary, Adjusted);
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool RegBankLegalizeHelper::lower(MachineInstr &MI,
const RegBankLLTMapping &Mapping,
WaterfallInfo &WFI) {
@@ -1101,27 +1159,8 @@ bool RegBankLegalizeHelper::lower(MachineInstr &MI,
}
case ApplyINTRIN_IMAGE:
return applyRegisterBanksINTRIN_IMAGE(MI);
- case SplitFFB64To32: {
- // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
- // (ffbl hi:lo) -> umin(ffbl(lo), uaddsat(ffbl(hi), 32))
- auto Unmerge = B.buildUnmerge({VgprRB, S32}, MI.getOperand(1).getReg());
- unsigned Opc = MI.getOpcode();
- auto Lo = B.buildInstr(Opc, {{VgprRB, S32}}, {Unmerge.getReg(0)});
- auto Hi = B.buildInstr(Opc, {{VgprRB, S32}}, {Unmerge.getReg(1)});
-
- // FFBH counts from MSB, FFBL counts from LSB. The secondary half adds 32 to
- // account for the primary half's width.
- bool IsFFBH = Opc == AMDGPU::G_AMDGPU_FFBH_U32;
- auto Primary = IsFFBH ? Hi : Lo;
- auto Secondary = IsFFBH ? Lo : Hi;
- auto Adjusted =
- B.buildInstr(AMDGPU::G_UADDSAT, {{VgprRB, S32}},
- {Secondary, B.buildConstant({VgprRB, S32}, 32)});
- B.buildUMin(MI.getOperand(0).getReg(), Primary, Adjusted);
-
- MI.eraseFromParent();
- return true;
- }
+ case SplitBitCount64To32:
+ return lowerSplitBitCount64To32(MI);
}
if (!WFI.SgprWaterfallOperandRegs.empty()) {
@@ -1192,6 +1231,8 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
return LLT::fixed_vector(2, 32);
case VgprV3S32:
return LLT::fixed_vector(3, 32);
+ case VgprV4S16:
+ return LLT::fixed_vector(4, 16);
case SgprV4S32:
case SgprV4S32_WF:
case VgprV4S32:
@@ -1358,6 +1399,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
case VgprV2S32:
case VgprV2S64:
case VgprV3S32:
+ case VgprV4S16:
case VgprV4S32:
case VgprV8S32:
case VgprB32:
@@ -1419,6 +1461,7 @@ bool RegBankLegalizeHelper::applyMappingDst(
case VgprV2S32:
case VgprV2S64:
case VgprV3S32:
+ case VgprV4S16:
case VgprV4S32:
case VgprV8S32: {
assert(Ty == getTyFromID(MethodIDs[OpIdx]));
@@ -1603,6 +1646,7 @@ bool RegBankLegalizeHelper::applyMappingSrc(
case VgprV2S32:
case VgprV2S64:
case VgprV3S32:
+ case VgprV4S16:
case VgprV4S32:
case VgprV8S32: {
assert(Ty == getTyFromID(MethodIDs[i]));
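
A reference model of the MSB-first split performed by lowerSplitBitCount64To32 (assuming a GCC/Clang-style __builtin_clz; ffbh32 mirrors the hardware FFBH, which returns 0xFFFFFFFF for a zero input). For the zero-undef CTLZ/CTTZ forms a plain add suffices in place of uaddsat because the input is guaranteed non-zero.

  #include <algorithm>
  #include <cstdint>

  static uint32_t ffbh32(uint32_t X) {
    return X ? static_cast<uint32_t>(__builtin_clz(X)) : 0xFFFFFFFFu;
  }

  static uint32_t uaddsat32(uint32_t A, uint32_t B) {
    uint64_t S = static_cast<uint64_t>(A) + B;
    return S > 0xFFFFFFFFu ? 0xFFFFFFFFu : static_cast<uint32_t>(S);
  }

  // (ffbh hi:lo) -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
  static uint32_t ffbh64Split(uint64_t V) {
    uint32_t Lo = static_cast<uint32_t>(V);
    uint32_t Hi = static_cast<uint32_t>(V >> 32);
    return std::min(ffbh32(Hi), uaddsat32(ffbh32(Lo), 32));
  }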
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
index e3c3079fe9552..577c26e4bf02a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h
@@ -137,6 +137,7 @@ class RegBankLegalizeHelper {
bool lowerSplitTo16(MachineInstr &MI);
bool lowerSplitTo32Select(MachineInstr &MI);
bool lowerSplitTo32SExtInReg(MachineInstr &MI);
+ bool lowerSplitBitCount64To32(MachineInstr &MI);
bool lowerUnpackMinMax(MachineInstr &MI);
bool lowerUnpackAExt(MachineInstr &MI);
bool applyRegisterBanksINTRIN_IMAGE(MachineInstr &MI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
index 22fe74e77a8ae..da52e3d2f851d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp
@@ -189,6 +189,10 @@ bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID,
return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg);
case DivV2S32:
return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isDivergent(Reg);
+ case DivV3S32:
+ return MRI.getType(Reg) == LLT::fixed_vector(3, 32) && MUI.isDivergent(Reg);
+ case DivV4S16:
+ return MRI.getType(Reg) == LLT::fixed_vector(4, 16) && MUI.isDivergent(Reg);
case DivB32:
return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg);
case DivB64:
@@ -799,7 +803,7 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN,
G_ATOMICRMW_UMAX, G_ATOMICRMW_UINC_WRAP,
- G_ATOMICRMW_UDEC_WRAP})
+ G_ATOMICRMW_UDEC_WRAP, G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
.Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}})
.Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}})
.Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}})
@@ -846,7 +850,8 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_SWAP, G_AMDGPU_BUFFER_ATOMIC_UMAX,
G_AMDGPU_BUFFER_ATOMIC_UMIN, G_AMDGPU_BUFFER_ATOMIC_SMAX,
- G_AMDGPU_BUFFER_ATOMIC_SMIN},
+ G_AMDGPU_BUFFER_ATOMIC_SMIN, G_AMDGPU_BUFFER_ATOMIC_FMAX,
+ G_AMDGPU_BUFFER_ATOMIC_FMIN},
Standard)
.Div(S32, {{Vgpr32}, {Vgpr32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}})
.Div(S64, {{Vgpr64}, {Vgpr64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}});
@@ -1184,11 +1189,12 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Uni(S64, {{Sgpr64}, {Sgpr64}})
.Div(S64, {{Vgpr64}, {Vgpr64}});
- addRulesForGOpcs({G_AMDGPU_FFBH_U32, G_AMDGPU_FFBL_B32})
+ addRulesForGOpcs({G_AMDGPU_FFBH_U32, G_AMDGPU_FFBL_B32, G_CTLZ_ZERO_UNDEF,
+ G_CTTZ_ZERO_UNDEF})
.Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
.Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
.Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
- .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}, SplitFFB64To32}});
+ .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}, SplitBitCount64To32}});
addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}});
@@ -1432,7 +1438,30 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForIOpcs({amdgcn_s_getreg}).Any({{}, {{Sgpr32}, {IntrId, Imm}}});
addRulesForIOpcs({amdgcn_groupstaticsize}).Any({{S32}, {{Sgpr32}, {IntrId}}});
- addRulesForIOpcs({amdgcn_endpgm}).Any({{}, {{}, {}}});
+
+ // Intrinsics with no register operands.
+ addRulesForIOpcs({amdgcn_endpgm,
+ amdgcn_s_barrier,
+ amdgcn_s_barrier_signal,
+ amdgcn_s_barrier_wait,
+ amdgcn_s_nop,
+ amdgcn_s_sethalt,
+ amdgcn_s_setprio,
+ amdgcn_s_sleep,
+ amdgcn_s_wait_asynccnt,
+ amdgcn_s_wait_bvhcnt,
+ amdgcn_s_wait_dscnt,
+ amdgcn_s_wait_event,
+ amdgcn_s_wait_event_export_ready,
+ amdgcn_s_wait_expcnt,
+ amdgcn_s_wait_kmcnt,
+ amdgcn_s_wait_loadcnt,
+ amdgcn_s_wait_samplecnt,
+ amdgcn_s_wait_storecnt,
+ amdgcn_s_wait_tensorcnt,
+ amdgcn_s_waitcnt,
+ amdgcn_wave_barrier})
+ .Any({{}, {{}, {}}});
// This is "intrinsic lane mask" it was set to i32/i64 in llvm-ir.
addRulesForIOpcs({amdgcn_end_cf})
@@ -1465,11 +1494,6 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
// readfirstlaning just in case register is not in sgpr.
.Any({{UniS32, _, UniS32}, {{}, {Sgpr32, None, Vgpr32}}});
- addRulesForIOpcs({amdgcn_s_setprio, amdgcn_s_sethalt, amdgcn_s_nop})
- .Any({{}, {{}, {IntrId, Imm}}});
-
- addRulesForIOpcs({amdgcn_s_sleep}).Any({{_, _}, {{}, {IntrId, Imm}}});
-
addRulesForIOpcs({amdgcn_bitop3}, Standard)
.Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
.Div(S16, {{Vgpr16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
@@ -1550,7 +1574,9 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
addRulesForIOpcs({amdgcn_struct_ptr_buffer_load_lds})
.Any({{_}, {{}, {IntrId, SgprP8, SgprP3, Imm, Vgpr32, Vgpr32, Sgpr32}}});
- addRulesForIOpcs({amdgcn_wwm, amdgcn_strict_wwm}, StandardB)
+ addRulesForIOpcs({amdgcn_wwm, amdgcn_strict_wwm, amdgcn_wqm, amdgcn_softwqm,
+ amdgcn_strict_wqm},
+ StandardB)
.Div(B32, {{VgprB32}, {IntrId, VgprB32}})
.Uni(B32, {{SgprB32}, {IntrId, SgprB32}})
.Div(B64, {{VgprB64}, {IntrId, VgprB64}})
@@ -1584,4 +1610,13 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST,
.Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
.Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
+ addRulesForIOpcs({amdgcn_ds_read_tr4_b64, amdgcn_ds_read_tr8_b64})
+ .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, VgprP3}}});
+
+ addRulesForIOpcs({amdgcn_ds_read_tr6_b96})
+ .Any({{DivV3S32}, {{VgprV3S32}, {IntrId, VgprP3}}});
+
+ addRulesForIOpcs({amdgcn_ds_read_tr16_b64})
+ .Any({{DivV4S16}, {{VgprV4S16}, {IntrId, VgprP3}}});
+
} // end initialize rules
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
index 8a6f487b026c7..ce126af4b4f68 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h
@@ -105,6 +105,8 @@ enum UniformityLLTOpPredicateID {
DivV2S16,
DivV2S32,
DivV2S64,
+ DivV3S32,
+ DivV4S16,
// B types
B32,
@@ -196,6 +198,7 @@ enum RegBankLLTMappingApplyID {
VgprB256,
VgprB512,
VgprBRC,
+ VgprV4S16,
VgprV4S32,
VgprV8S32,
VgprV2S64,
@@ -276,7 +279,7 @@ enum LoweringMethodID {
VerifyAllSgprGPHI,
VerifyAllSgprOrVgprGPHI,
ApplyINTRIN_IMAGE,
- SplitFFB64To32
+ SplitBitCount64To32
};
enum FastRulesTypes {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 963bb91b6a7df..d1252f4154713 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -315,6 +315,8 @@ def : SourceOfDivergence<int_amdgcn_live_mask>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>;
+def : SourceOfDivergence<int_amdgcn_ds_add_gs_reg_rtn>;
+def : SourceOfDivergence<int_amdgcn_ds_sub_gs_reg_rtn>;
def : SourceOfDivergence<int_amdgcn_permlane16>;
def : SourceOfDivergence<int_amdgcn_permlanex16>;
def : SourceOfDivergence<int_amdgcn_permlane16_var>;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 682ad74ea8af0..8e79abfb2f601 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -2039,8 +2039,8 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
}
static bool isCoexecutableVALUInst(const MachineInstr &MI) {
- return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isTRANS(MI) &&
- !SIInstrInfo::isWMMA(MI) && !SIInstrInfo::isSWMMAC(MI); // What else?
+ return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isWMMA(MI) &&
+ !SIInstrInfo::isSWMMAC(MI) && !SIInstrInfo::isLDSDMA(MI);
}
static bool IsWMMAHazardInstInCategory(const MachineInstr &MI,
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
index ade1b1518215c..9f8046391ebaf 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
@@ -23,7 +23,6 @@
namespace llvm {
class R600Subtarget;
-class AMDGPUTargetLowering;
class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
using BaseT = BasicTTIImplBase<R600TTIImpl>;
@@ -32,14 +31,14 @@ class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
friend BaseT;
const R600Subtarget *ST;
- const AMDGPUTargetLowering *TLI;
+ const TargetLowering *TLI;
AMDGPUTTIImpl CommonTTI;
public:
explicit R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
const R600Subtarget *getST() const { return ST; }
- const AMDGPUTargetLowering *getTLI() const { return TLI; }
+ const TargetLowering *getTLI() const { return TLI; }
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 83bb2410864ff..c3f69cdb3cbed 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -343,13 +343,13 @@ class WaitcntGenerator {
InstCounterType MaxCounter;
bool OptNone;
bool ExpandWaitcntProfiling = false;
- const AMDGPU::HardwareLimits *Limits = nullptr;
+ const AMDGPU::HardwareLimits &Limits;
public:
WaitcntGenerator() = delete;
WaitcntGenerator(const WaitcntGenerator &) = delete;
WaitcntGenerator(const MachineFunction &MF, InstCounterType MaxCounter,
- const AMDGPU::HardwareLimits *Limits)
+ const AMDGPU::HardwareLimits &Limits)
: ST(MF.getSubtarget<GCNSubtarget>()), TII(*ST.getInstrInfo()),
IV(AMDGPU::getIsaVersion(ST.getCPU())), MaxCounter(MaxCounter),
OptNone(MF.getFunction().hasOptNone() ||
@@ -362,7 +362,7 @@ class WaitcntGenerator {
// optimization.
bool isOptNone() const { return OptNone; }
- const AMDGPU::HardwareLimits &getLimits() const { return *Limits; }
+ const AMDGPU::HardwareLimits &getLimits() const { return Limits; }
// Edits an existing sequence of wait count instructions according
// to an incoming Waitcnt value, which is itself updated to reflect
@@ -467,7 +467,7 @@ class WaitcntGeneratorGFX12Plus final : public WaitcntGenerator {
WaitcntGeneratorGFX12Plus() = delete;
WaitcntGeneratorGFX12Plus(const MachineFunction &MF,
InstCounterType MaxCounter,
- const AMDGPU::HardwareLimits *Limits,
+ const AMDGPU::HardwareLimits &Limits,
bool IsExpertMode)
: WaitcntGenerator(MF, MaxCounter, Limits), IsExpertMode(IsExpertMode) {}
@@ -495,16 +495,6 @@ struct PreheaderFlushFlags {
};
class SIInsertWaitcnts {
-public:
- const GCNSubtarget *ST;
- const SIInstrInfo *TII = nullptr;
- const SIRegisterInfo *TRI = nullptr;
- const MachineRegisterInfo *MRI = nullptr;
- InstCounterType SmemAccessCounter;
- InstCounterType MaxCounter;
- bool IsExpertMode = false;
-
-private:
DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
DenseMap<MachineBasicBlock *, PreheaderFlushFlags> PreheadersToFlush;
MachineLoopInfo &MLI;
@@ -519,7 +509,7 @@ class SIInsertWaitcnts {
MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
- bool ForceEmitWaitcnt[NUM_INST_CNTS];
+ bool ForceEmitWaitcnt[NUM_INST_CNTS] = {};
std::unique_ptr<WaitcntGenerator> WCG;
@@ -535,9 +525,19 @@ class SIInsertWaitcnts {
AMDGPU::HardwareLimits Limits;
public:
+ const GCNSubtarget &ST;
+ const SIInstrInfo &TII;
+ const SIRegisterInfo &TRI;
+ const MachineRegisterInfo &MRI;
+ InstCounterType SmemAccessCounter;
+ InstCounterType MaxCounter;
+ bool IsExpertMode = false;
+
SIInsertWaitcnts(MachineLoopInfo &MLI, MachinePostDominatorTree &PDT,
AliasAnalysis *AA, MachineFunction &MF)
- : MLI(MLI), PDT(PDT), AA(AA), MF(MF) {
+ : MLI(MLI), PDT(PDT), AA(AA), MF(MF), ST(MF.getSubtarget<GCNSubtarget>()),
+ TII(*ST.getInstrInfo()), TRI(TII.getRegisterInfo()),
+ MRI(MF.getRegInfo()) {
(void)ForceExpCounter;
(void)ForceLgkmCounter;
(void)ForceVMCounter;
@@ -612,15 +612,15 @@ class SIInsertWaitcnts {
assert(SIInstrInfo::isVMEM(Inst));
// LDS DMA loads are also stores, but on the LDS side. On the VMEM side
// these should use VM_CNT.
- if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
+ if (!ST.hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
return VMEM_ACCESS;
if (Inst.mayStore() &&
(!Inst.mayLoad() || SIInstrInfo::isAtomicNoRet(Inst))) {
- if (TII->mayAccessScratch(Inst))
+ if (TII.mayAccessScratch(Inst))
return SCRATCH_WRITE_ACCESS;
return VMEM_WRITE_ACCESS;
}
- if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
+ if (!ST.hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
return VMEM_ACCESS;
return VmemReadMapping[getVmemType(Inst)];
}
@@ -634,7 +634,7 @@ class SIInsertWaitcnts {
if (SIInstrInfo::usesASYNC_CNT(MI))
return true;
const MachineOperand *Async =
- TII->getNamedOperand(MI, AMDGPU::OpName::IsAsync);
+ TII.getNamedOperand(MI, AMDGPU::OpName::IsAsync);
return Async && (Async->getImm());
}
@@ -689,7 +689,7 @@ class SIInsertWaitcnts {
class WaitcntBrackets {
public:
WaitcntBrackets(const SIInsertWaitcnts *Context) : Context(Context) {
- assert(Context->TRI->getNumRegUnits() < REGUNITS_END);
+ assert(Context->TRI.getNumRegUnits() < REGUNITS_END);
}
#ifndef NDEBUG
@@ -889,13 +889,13 @@ class WaitcntBrackets {
iterator_range<MCRegUnitIterator> regunits(MCPhysReg Reg) const {
assert(Reg != AMDGPU::SCC && "Shouldn't be used on SCC");
- if (!Context->TRI->isInAllocatableClass(Reg))
+ if (!Context->TRI.isInAllocatableClass(Reg))
return {{}, {}};
- const TargetRegisterClass *RC = Context->TRI->getPhysRegBaseClass(Reg);
- unsigned Size = Context->TRI->getRegSizeInBits(*RC);
- if (Size == 16 && Context->ST->hasD16Writes32BitVgpr())
- Reg = Context->TRI->get32BitRegister(Reg);
- return Context->TRI->regunits(Reg);
+ const TargetRegisterClass *RC = Context->TRI.getPhysRegBaseClass(Reg);
+ unsigned Size = Context->TRI.getRegSizeInBits(*RC);
+ if (Size == 16 && Context->ST.hasD16Writes32BitVgpr())
+ Reg = Context->TRI.get32BitRegister(Reg);
+ return Context->TRI.regunits(Reg);
}
void setScoreLB(InstCounterType T, unsigned Val) {
@@ -916,13 +916,13 @@ class WaitcntBrackets {
}
void setRegScore(MCPhysReg Reg, InstCounterType T, unsigned Val) {
- const SIRegisterInfo *TRI = Context->TRI;
+ const SIRegisterInfo &TRI = Context->TRI;
if (Reg == AMDGPU::SCC) {
SCCScore = Val;
- } else if (TRI->isVectorRegister(*Context->MRI, Reg)) {
+ } else if (TRI.isVectorRegister(Context->MRI, Reg)) {
for (MCRegUnit RU : regunits(Reg))
VMem[toVMEMID(RU)].Scores[T] = Val;
- } else if (TRI->isSGPRReg(*Context->MRI, Reg)) {
+ } else if (TRI.isSGPRReg(Context->MRI, Reg)) {
auto STy = getSgprScoresIdx(T);
for (MCRegUnit RU : regunits(Reg))
SGPRs[RU].Scores[STy] = Val;
@@ -1040,7 +1040,7 @@ void WaitcntBrackets::setScoreByOperand(const MachineOperand &Op,
// this at compile time, so we have to assume it might be applied if the
// instruction supports it).
bool WaitcntBrackets::hasPointSampleAccel(const MachineInstr &MI) const {
- if (!Context->ST->hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
+ if (!Context->ST.hasPointSampleAccel() || !SIInstrInfo::isMIMG(MI))
return false;
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
@@ -1076,78 +1076,78 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
PendingEvents.insert(E);
setScoreUB(T, CurrScore);
- const SIRegisterInfo *TRI = Context->TRI;
- const MachineRegisterInfo *MRI = Context->MRI;
- const SIInstrInfo *TII = Context->TII;
+ const SIRegisterInfo &TRI = Context->TRI;
+ const MachineRegisterInfo &MRI = Context->MRI;
+ const SIInstrInfo &TII = Context->TII;
if (T == EXP_CNT) {
// Put score on the source vgprs. If this is a store, just use those
// specific register(s).
- if (TII->isDS(Inst) && Inst.mayLoadOrStore()) {
+ if (TII.isDS(Inst) && Inst.mayLoadOrStore()) {
// All GDS operations must protect their address register (same as
// export.)
- if (const auto *AddrOp = TII->getNamedOperand(Inst, AMDGPU::OpName::addr))
+ if (const auto *AddrOp = TII.getNamedOperand(Inst, AMDGPU::OpName::addr))
setScoreByOperand(*AddrOp, EXP_CNT, CurrScore);
if (Inst.mayStore()) {
if (const auto *Data0 =
- TII->getNamedOperand(Inst, AMDGPU::OpName::data0))
+ TII.getNamedOperand(Inst, AMDGPU::OpName::data0))
setScoreByOperand(*Data0, EXP_CNT, CurrScore);
if (const auto *Data1 =
- TII->getNamedOperand(Inst, AMDGPU::OpName::data1))
+ TII.getNamedOperand(Inst, AMDGPU::OpName::data1))
setScoreByOperand(*Data1, EXP_CNT, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
Inst.getOpcode() != AMDGPU::DS_APPEND &&
Inst.getOpcode() != AMDGPU::DS_CONSUME &&
Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
for (const MachineOperand &Op : Inst.all_uses()) {
- if (TRI->isVectorRegister(*MRI, Op.getReg()))
+ if (TRI.isVectorRegister(MRI, Op.getReg()))
setScoreByOperand(Op, EXP_CNT, CurrScore);
}
}
- } else if (TII->isFLAT(Inst)) {
+ } else if (TII.isFLAT(Inst)) {
if (Inst.mayStore()) {
- setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
+ setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
EXP_CNT, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst)) {
- setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
+ setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
EXP_CNT, CurrScore);
}
- } else if (TII->isMIMG(Inst)) {
+ } else if (TII.isMIMG(Inst)) {
if (Inst.mayStore()) {
setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst)) {
- setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
+ setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
EXP_CNT, CurrScore);
}
- } else if (TII->isMTBUF(Inst)) {
+ } else if (TII.isMTBUF(Inst)) {
if (Inst.mayStore())
setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
- } else if (TII->isMUBUF(Inst)) {
+ } else if (TII.isMUBUF(Inst)) {
if (Inst.mayStore()) {
setScoreByOperand(Inst.getOperand(0), EXP_CNT, CurrScore);
} else if (SIInstrInfo::isAtomicRet(Inst)) {
- setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::data),
+ setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::data),
EXP_CNT, CurrScore);
}
- } else if (TII->isLDSDIR(Inst)) {
+ } else if (TII.isLDSDIR(Inst)) {
// LDSDIR instructions attach the score to the destination.
- setScoreByOperand(*TII->getNamedOperand(Inst, AMDGPU::OpName::vdst),
+ setScoreByOperand(*TII.getNamedOperand(Inst, AMDGPU::OpName::vdst),
EXP_CNT, CurrScore);
} else {
- if (TII->isEXP(Inst)) {
+ if (TII.isEXP(Inst)) {
// For export the destination registers are really temps that
// can be used as the actual source after export patching, so
// we need to treat them like sources and set the EXP_CNT
// score.
for (MachineOperand &DefMO : Inst.all_defs()) {
- if (TRI->isVGPR(*MRI, DefMO.getReg())) {
+ if (TRI.isVGPR(MRI, DefMO.getReg())) {
setScoreByOperand(DefMO, EXP_CNT, CurrScore);
}
}
}
for (const MachineOperand &Op : Inst.all_uses()) {
- if (TRI->isVectorRegister(*MRI, Op.getReg()))
+ if (TRI.isVectorRegister(MRI, Op.getReg()))
setScoreByOperand(Op, EXP_CNT, CurrScore);
}
}
@@ -1170,7 +1170,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
if (!Op.isReg() || (T == VA_VDST && Op.isUse()) ||
(T == VM_VSRC && Op.isDef()))
continue;
- if (TRI->isVectorRegister(*Context->MRI, Op.getReg()))
+ if (TRI.isVectorRegister(Context->MRI, Op.getReg()))
setScoreByOperand(Op, T, CurrScore);
}
} else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
@@ -1185,13 +1185,13 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
// but none with memory instructions.
for (const MachineOperand &Op : Inst.defs()) {
if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
- if (!TRI->isVectorRegister(*MRI, Op.getReg())) // TODO: add wrapper
+ if (!TRI.isVectorRegister(MRI, Op.getReg())) // TODO: add wrapper
continue;
if (updateVMCntOnly(Inst)) {
// updateVMCntOnly should only leave us with VGPRs
// MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
// defs. That's required for a sane index into `VgprMemTypes` below
- assert(TRI->isVectorRegister(*MRI, Op.getReg()));
+ assert(TRI.isVectorRegister(MRI, Op.getReg()));
VmemType V = getVmemType(Inst);
unsigned char TypesMask = 1 << V;
// If instruction can have Point Sample Accel applied, we have to flag
@@ -1205,7 +1205,7 @@ void WaitcntBrackets::updateByEvent(WaitEventType E, MachineInstr &Inst) {
setScoreByOperand(Op, T, CurrScore);
}
if (Inst.mayStore() &&
- (TII->isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) {
+ (TII.isDS(Inst) || Context->isNonAsyncLdsDmaWrite(Inst))) {
// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the data
// written to LDS can be accessed. A load from LDS to VMEM does not need a wait.
//
@@ -1283,24 +1283,24 @@ void WaitcntBrackets::recordAsyncMark(MachineInstr &Inst) {
}
void WaitcntBrackets::print(raw_ostream &OS) const {
- const GCNSubtarget *ST = Context->ST;
+ const GCNSubtarget &ST = Context->ST;
for (auto T : inst_counter_types(Context->MaxCounter)) {
unsigned SR = getScoreRange(T);
switch (T) {
case LOAD_CNT:
- OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
+ OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
<< SR << "):";
break;
case DS_CNT:
- OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
+ OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
<< SR << "):";
break;
case EXP_CNT:
OS << " EXP_CNT(" << SR << "):";
break;
case STORE_CNT:
- OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
+ OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
<< SR << "):";
break;
case SAMPLE_CNT:
@@ -1393,18 +1393,18 @@ void WaitcntBrackets::print(raw_ostream &OS) const {
unsigned MarkedScore = Mark[T];
switch (T) {
case LOAD_CNT:
- OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM")
+ OS << " " << (ST.hasExtendedWaitCounts() ? "LOAD" : "VM")
<< "_CNT: " << MarkedScore;
break;
case DS_CNT:
- OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM")
+ OS << " " << (ST.hasExtendedWaitCounts() ? "DS" : "LGKM")
<< "_CNT: " << MarkedScore;
break;
case EXP_CNT:
OS << " EXP_CNT: " << MarkedScore;
break;
case STORE_CNT:
- OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS")
+ OS << " " << (ST.hasExtendedWaitCounts() ? "STORE" : "VS")
<< "_CNT: " << MarkedScore;
break;
case SAMPLE_CNT:
@@ -1515,7 +1515,7 @@ void WaitcntBrackets::determineWaitForScore(InstCounterType T,
// If the score falls within the bracket, we need a waitcnt.
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
- !Context->ST->hasFlatLgkmVMemCountInOrder()) {
+ !Context->ST.hasFlatLgkmVMemCountInOrder()) {
// If there is a pending FLAT operation, and this is a VMem or LGKM
// waitcnt and the target can report early completion, then we need
// to force a waitcnt 0.
@@ -1583,7 +1583,7 @@ void WaitcntBrackets::determineWaitForPhysReg(InstCounterType T, MCPhysReg Reg,
if (Reg == AMDGPU::SCC) {
determineWaitForScore(T, SCCScore, Wait);
} else {
- bool IsVGPR = Context->TRI->isVectorRegister(*Context->MRI, Reg);
+ bool IsVGPR = Context->TRI.isVectorRegister(Context->MRI, Reg);
for (MCRegUnit RU : regunits(Reg))
determineWaitForScore(
T, IsVGPR ? getVMemScore(toVMEMID(RU), T) : getSGPRScore(RU, T),
@@ -2433,7 +2433,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// (e.g., only GLOBAL_INV is pending), we can skip waiting on loadcnt.
// GLOBAL_INV increments loadcnt but doesn't write to VGPRs, so there's
// no need to wait for it at function boundaries.
- if (ST->hasExtendedWaitCounts() &&
+ if (ST.hasExtendedWaitCounts() &&
!ScoreBrackets.hasPendingEvent(VMEM_ACCESS))
AllZeroWait.set(LOAD_CNT, ~0u);
Wait = AllZeroWait;
@@ -2455,7 +2455,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
}
case AMDGPU::S_SENDMSG:
case AMDGPU::S_SENDMSGHALT: {
- if (ST->hasLegacyGeometry() &&
+ if (ST.hasLegacyGeometry() &&
((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
// Resolve vm waits before gs-done.
@@ -2470,7 +2470,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// export is granted (which can occur well after the instruction is issued).
// The shader program must flush all EXP operations on the export-count
// before overwriting the EXEC mask.
- if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
+ if (MI.modifiesRegister(AMDGPU::EXEC, &TRI)) {
// Export and GDS are tracked individually, either may trigger a waitcnt
// for EXEC.
if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
@@ -2483,7 +2483,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// Wait for any pending GDS instruction to complete before any
// "Always GDS" instruction.
- if (TII->isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
+ if (TII.isAlwaysGDS(Opc) && ScoreBrackets.hasPendingGDS())
addWait(Wait, DS_CNT, ScoreBrackets.getPendingGDSWait());
if (MI.isCall()) {
@@ -2493,13 +2493,13 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
CallInsts.insert(&MI);
Wait = AMDGPU::Waitcnt();
- const MachineOperand &CallAddrOp = TII->getCalleeOperand(MI);
+ const MachineOperand &CallAddrOp = TII.getCalleeOperand(MI);
if (CallAddrOp.isReg()) {
ScoreBrackets.determineWaitForPhysReg(
SmemAccessCounter, CallAddrOp.getReg().asMCReg(), Wait);
if (const auto *RtnAddrOp =
- TII->getNamedOperand(MI, AMDGPU::OpName::dst)) {
+ TII.getNamedOperand(MI, AMDGPU::OpName::dst)) {
ScoreBrackets.determineWaitForPhysReg(
SmemAccessCounter, RtnAddrOp->getReg().asMCReg(), Wait);
}
@@ -2534,7 +2534,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
continue;
// No need to wait before load from VMEM to LDS.
- if (TII->mayWriteLDSThroughDMA(MI))
+ if (TII.mayWriteLDSThroughDMA(MI))
continue;
// LOAD_CNT is only relevant to vgpr or LDS.
@@ -2567,12 +2567,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
continue;
// If the instruction does not read tied source, skip the operand.
- if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
+ if (Op.isTied() && Op.isUse() && TII.doesNotReadTiedSource(MI))
continue;
MCPhysReg Reg = Op.getReg().asMCReg();
- const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
+ const bool IsVGPR = TRI.isVectorRegister(MRI, Op.getReg());
if (IsVGPR) {
// Implicit VGPR defs and uses are never a part of the memory
// instructions description and usually present to account for
@@ -2594,7 +2594,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
if (Op.isUse() || !updateVMCntOnly(MI) ||
ScoreBrackets.hasOtherPendingVmemTypes(Reg, getVmemType(MI)) ||
ScoreBrackets.hasPointSamplePendingVmemTypes(MI, Reg) ||
- !ST->hasVmemWriteVgprInOrder()) {
+ !ST.hasVmemWriteVgprInOrder()) {
ScoreBrackets.determineWaitForPhysReg(LOAD_CNT, Reg, Wait);
ScoreBrackets.determineWaitForPhysReg(SAMPLE_CNT, Reg, Wait);
ScoreBrackets.determineWaitForPhysReg(BVH_CNT, Reg, Wait);
@@ -2611,7 +2611,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
ScoreBrackets.determineWaitForPhysReg(SmemAccessCounter, Reg, Wait);
}
- if (ST->hasWaitXcnt() && Op.isDef())
+ if (ST.hasWaitXcnt() && Op.isDef())
ScoreBrackets.determineWaitForPhysReg(X_CNT, Reg, Wait);
}
}
@@ -2630,15 +2630,15 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
//
// In all other cases, ensure safety by ensuring that there are no outstanding
// memory operations.
- if (Opc == AMDGPU::S_BARRIER && !ST->hasAutoWaitcntBeforeBarrier() &&
- !ST->hasBackOffBarrier()) {
+ if (Opc == AMDGPU::S_BARRIER && !ST.hasAutoWaitcntBeforeBarrier() &&
+ !ST.hasBackOffBarrier()) {
Wait = Wait.combined(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/true));
}
// TODO: Remove this work-around, enable the assert for Bug 457939
// after fixing the scheduler. Also, the Shader Compiler code is
// independent of target.
- if (SIInstrInfo::isCBranchVCCZRead(MI) && ST->hasReadVCCZBug() &&
+ if (SIInstrInfo::isCBranchVCCZRead(MI) && ST.hasReadVCCZBug() &&
ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
Wait.set(DS_CNT, 0);
}
@@ -2650,7 +2650,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// waits on VA_VDST if the instruction it would precede is not a VALU
// instruction, since hardware handles VALU->VGPR->VALU hazards in
// expert scheduling mode.
- if (TII->isVALU(MI))
+ if (TII.isVALU(MI))
Wait.set(VA_VDST, ~0u);
// Since the translation for VMEM addresses occur in-order, we can apply the
@@ -2704,8 +2704,7 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
// ExpCnt can be merged into VINTERP.
if (Wait.get(EXP_CNT) != ~0u && It != Block.instr_end() &&
SIInstrInfo::isVINTERP(*It)) {
- MachineOperand *WaitExp =
- TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
+ MachineOperand *WaitExp = TII.getNamedOperand(*It, AMDGPU::OpName::waitexp);
if (Wait.get(EXP_CNT) < WaitExp->getImm()) {
WaitExp->setImm(Wait.get(EXP_CNT));
Modified = true;
@@ -2730,15 +2729,15 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
std::optional<WaitEventType>
SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
- if (TII->isVALU(Inst)) {
+ if (TII.isVALU(Inst)) {
// Core/Side-, DP-, XDL- and TRANS-MACC VALU instructions complete
// out-of-order with respect to each other, so each of these classes
// has its own event.
- if (TII->isXDL(Inst))
+ if (TII.isXDL(Inst))
return VGPR_XDL_WRITE;
- if (TII->isTRANS(Inst))
+ if (TII.isTRANS(Inst))
return VGPR_TRANS_WRITE;
if (AMDGPU::isDPMACCInstruction(Inst.getOpcode()))
@@ -2751,13 +2750,13 @@ SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
// with respect to each other and all other VMEM instructions, so
// each of these also has a separate event.
- if (TII->isFLAT(Inst))
+ if (TII.isFLAT(Inst))
return VGPR_FLAT_READ;
- if (TII->isDS(Inst))
+ if (TII.isDS(Inst))
return VGPR_LDS_READ;
- if (TII->isVMEM(Inst) || TII->isVIMAGE(Inst) || TII->isVSAMPLE(Inst))
+ if (TII.isVMEM(Inst) || TII.isVIMAGE(Inst) || TII.isVSAMPLE(Inst))
return VGPR_VMEM_READ;
// Otherwise, no hazard.
@@ -2766,8 +2765,8 @@ SIInsertWaitcnts::getExpertSchedulingEventType(const MachineInstr &Inst) const {
}
bool SIInsertWaitcnts::isVmemAccess(const MachineInstr &MI) const {
- return (TII->isFLAT(MI) && TII->mayAccessVMEMThroughFlat(MI)) ||
- (TII->isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
+ return (TII.isFLAT(MI) && TII.mayAccessVMEMThroughFlat(MI)) ||
+ (TII.isVMEM(MI) && !AMDGPU::getMUBUFIsBufferInv(MI.getOpcode()));
}
// Return true if the next instruction is S_ENDPGM, following fallthrough
@@ -2805,11 +2804,11 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
AMDGPU::Waitcnt Wait;
bool NeedsEndPGMCheck = false;
- if (ST->isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
+ if (ST.isPreciseMemoryEnabled() && Inst.mayLoadOrStore())
Wait = WCG->getAllZeroWaitcnt(Inst.mayStore() &&
!SIInstrInfo::isAtomicRet(Inst));
- if (TII->isAlwaysGDS(Inst.getOpcode())) {
+ if (TII.isAlwaysGDS(Inst.getOpcode())) {
Wait.set(DS_CNT, 0);
NeedsEndPGMCheck = true;
}
@@ -2821,7 +2820,7 @@ bool SIInsertWaitcnts::insertForcedWaitAfter(MachineInstr &Inst,
/*OldWaitcntInstr=*/nullptr);
if (Result && NeedsEndPGMCheck && isNextENDPGM(SuccessorIt, &Block)) {
- BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII->get(AMDGPU::S_NOP))
+ BuildMI(Block, SuccessorIt, Inst.getDebugLoc(), TII.get(AMDGPU::S_NOP))
.addImm(0);
}
@@ -2835,25 +2834,25 @@ WaitEventSet SIInsertWaitcnts::getEventsFor(const MachineInstr &Inst) const {
Events.insert(*ET);
}
- if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
- if (TII->isAlwaysGDS(Inst.getOpcode()) ||
- TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
+ if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
+ if (TII.isAlwaysGDS(Inst.getOpcode()) ||
+ TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
Events.insert(GDS_ACCESS);
Events.insert(GDS_GPR_LOCK);
} else {
Events.insert(LDS_ACCESS);
}
- } else if (TII->isFLAT(Inst)) {
+ } else if (TII.isFLAT(Inst)) {
if (SIInstrInfo::isGFX12CacheInvOrWBInst(Inst.getOpcode())) {
Events.insert(getVmemWaitEventType(Inst));
} else {
assert(Inst.mayLoadOrStore());
- if (TII->mayAccessVMEMThroughFlat(Inst)) {
- if (ST->hasWaitXcnt())
+ if (TII.mayAccessVMEMThroughFlat(Inst)) {
+ if (ST.hasWaitXcnt())
Events.insert(VMEM_GROUP);
Events.insert(getVmemWaitEventType(Inst));
}
- if (TII->mayAccessLDSThroughFlat(Inst))
+ if (TII.mayAccessLDSThroughFlat(Inst))
Events.insert(LDS_ACCESS);
}
} else if (SIInstrInfo::isVMEM(Inst) &&
@@ -2862,21 +2861,21 @@ WaitEventSet SIInsertWaitcnts::getEventsFor(const MachineInstr &Inst) const {
// BUFFER_WBL2 is included here because, unlike invalidates, it has to be
// followed by an "S_WAITCNT vmcnt(0)" to ensure the writeback has
// completed.
- if (ST->hasWaitXcnt())
+ if (ST.hasWaitXcnt())
Events.insert(VMEM_GROUP);
Events.insert(getVmemWaitEventType(Inst));
- if (ST->vmemWriteNeedsExpWaitcnt() &&
+ if (ST.vmemWriteNeedsExpWaitcnt() &&
(Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
Events.insert(VMW_GPR_LOCK);
}
- } else if (TII->isSMRD(Inst)) {
- if (ST->hasWaitXcnt())
+ } else if (TII.isSMRD(Inst)) {
+ if (ST.hasWaitXcnt())
Events.insert(SMEM_GROUP);
Events.insert(SMEM_ACCESS);
} else if (SIInstrInfo::isLDSDIR(Inst)) {
Events.insert(EXP_LDS_ACCESS);
} else if (SIInstrInfo::isEXP(Inst)) {
- unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
+ unsigned Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
Events.insert(EXP_PARAM_ACCESS);
else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
@@ -2913,14 +2912,14 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
ScoreBrackets->updateByEvent(E, Inst);
}
- if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
- if (TII->isAlwaysGDS(Inst.getOpcode()) ||
- TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
+ if (TII.isDS(Inst) && TII.usesLGKM_CNT(Inst)) {
+ if (TII.isAlwaysGDS(Inst.getOpcode()) ||
+ TII.hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
ScoreBrackets->setPendingGDS();
}
- } else if (TII->isFLAT(Inst)) {
- if (Inst.mayLoadOrStore() && TII->mayAccessVMEMThroughFlat(Inst) &&
- TII->mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst))
+ } else if (TII.isFLAT(Inst)) {
+ if (Inst.mayLoadOrStore() && TII.mayAccessVMEMThroughFlat(Inst) &&
+ TII.mayAccessLDSThroughFlat(Inst) && !SIInstrInfo::isLDSDMA(Inst))
// Async/LDSDMA operations have FLAT encoding but do not actually use flat
// pointers. They do have two operands that each access global and LDS,
// thus making it appear at this point that they are using a flat pointer.
@@ -2931,8 +2930,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// Act as a wait on everything
ScoreBrackets->applyWaitcnt(WCG->getAllZeroWaitcnt(/*IncludeVSCnt=*/false));
ScoreBrackets->setStateOnFunctionEntryOrReturn();
- } else if (TII->isVINTERP(Inst)) {
- int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
+ } else if (TII.isVINTERP(Inst)) {
+ int64_t Imm = TII.getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
}
}
@@ -3116,15 +3115,15 @@ void SIInsertWaitcnts::setSchedulingMode(MachineBasicBlock &MBB,
bool ExpertMode) const {
const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
AMDGPU::Hwreg::ID_SCHED_MODE, AMDGPU::Hwreg::HwregOffset::Default, 2);
- BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_SETREG_IMM32_B32))
+ BuildMI(MBB, I, DebugLoc(), TII.get(AMDGPU::S_SETREG_IMM32_B32))
.addImm(ExpertMode ? 2 : 0)
.addImm(EncodedReg);
}
namespace {
// TODO: Remove this work-around after fixing the scheduler.
-// There are two reasons why vccz might be incorrect; see ST->hasReadVCCZBug()
-// and ST->partialVCCWritesUpdateVCCZ().
+// There are two reasons why vccz might be incorrect; see ST.hasReadVCCZBug()
+// and ST.partialVCCWritesUpdateVCCZ().
// i. VCCZBug: There is a hardware bug on CI/SI where SMRD instruction may
// corrupt vccz bit, so when we detect that an instruction may read from
// a corrupt vccz bit, we need to:
@@ -3235,7 +3234,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
Block.printName(dbgs());
ScoreBrackets.dump();
});
- VCCZWorkaround VCCZW(ScoreBrackets, *ST, *TII, *TRI);
+ VCCZWorkaround VCCZW(ScoreBrackets, ST, TII, TRI);
// Walk over the instructions.
MachineInstr *OldWaitcntInstr = nullptr;
@@ -3271,12 +3270,12 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// Asyncmarks record the current wait state and so should not allow
// waitcnts that occur after them to be merged into waitcnts that occur
// before.
- assert(ST->getGeneration() < AMDGPUSubtarget::GFX12);
+ assert(ST.getGeneration() < AMDGPUSubtarget::GFX12);
ScoreBrackets.recordAsyncMark(Inst);
continue;
}
- if (TII->isSMRD(Inst)) {
+ if (TII.isSMRD(Inst)) {
for (const MachineMemOperand *Memop : Inst.memoperands()) {
// No need to handle invariant loads when avoiding WAR conflicts, as
// there cannot be a vector store to the same memory location.
@@ -3347,7 +3346,7 @@ bool SIInsertWaitcnts::removeRedundantSoftXcnts(MachineBasicBlock &Block) {
for (MachineInstr &MI : drop_begin(Block)) {
// Ignore last atomic if non-LDS VMEM and SMEM.
bool IsLDS =
- TII->isDS(MI) || (TII->isFLAT(MI) && TII->mayAccessLDSThroughFlat(MI));
+ TII.isDS(MI) || (TII.isFLAT(MI) && TII.mayAccessLDSThroughFlat(MI));
if (!IsLDS && (MI.mayLoad() ^ MI.mayStore()))
LastAtomicWithSoftXcnt = nullptr;
@@ -3395,7 +3394,7 @@ SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
if (SIInstrInfo::isFLAT(MI))
- return TII->mayAccessVMEMThroughFlat(MI);
+ return TII.mayAccessVMEMThroughFlat(MI);
return SIInstrInfo::isVMEM(MI);
}
@@ -3445,7 +3444,7 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
// DS optimization only applies to GFX12+ where DS_CNT is separate.
// Tracking status for "no DS read in loop" or "pure DS prefetch
// (use only in next iteration)".
- bool TrackSimpleDSOpt = ST->hasExtendedWaitCounts();
+ bool TrackSimpleDSOpt = ST.hasExtendedWaitCounts();
DenseSet<MCRegUnit> VgprUse;
DenseSet<MCRegUnit> VgprDefVMEM;
DenseSet<MCRegUnit> VgprDefDS;
@@ -3457,7 +3456,7 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
DenseMap<MCRegUnit, unsigned> LastDSReadPositionMap;
unsigned DSReadPosition = 0;
bool IsSingleBlock = ML->getNumBlocks() == 1;
- bool TrackDSFlushPoint = ST->hasExtendedWaitCounts() && IsSingleBlock;
+ bool TrackDSFlushPoint = ST.hasExtendedWaitCounts() && IsSingleBlock;
unsigned LastDSFlushPosition = 0;
for (MachineBasicBlock *MBB : ML->blocks()) {
@@ -3494,10 +3493,10 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
};
for (const MachineOperand &Op : MI.all_uses()) {
- if (Op.isDebug() || !TRI->isVectorRegister(*MRI, Op.getReg()))
+ if (Op.isDebug() || !TRI.isVectorRegister(MRI, Op.getReg()))
continue;
// Vgpr use
- for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
+ for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
// If we find a register that is loaded inside the loop, 1. and 2.
// are invalidated.
if (VgprDefVMEM.contains(RU))
@@ -3533,7 +3532,7 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
// VMem load vgpr def
if (isVMEMOrFlatVMEM(MI) && MI.mayLoad()) {
for (const MachineOperand &Op : MI.all_defs()) {
- for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
+ for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
// If we find a register that is loaded inside the loop, 1. and 2.
// are invalidated.
if (VgprUse.contains(RU))
@@ -3555,9 +3554,9 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
// checked above when processing uses).
if (IsDSRead || TrackDSFlushPoint) {
for (const MachineOperand &Op : MI.all_defs()) {
- if (!TRI->isVectorRegister(*MRI, Op.getReg()))
+ if (!TRI.isVectorRegister(MRI, Op.getReg()))
continue;
- for (MCRegUnit RU : TRI->regunits(Op.getReg().asMCReg())) {
+ for (MCRegUnit RU : TRI.regunits(Op.getReg().asMCReg())) {
// Check for overwrite of pending DS read (flush point) by any
// instruction
updateDSReadFlushTracking(RU);
@@ -3574,8 +3573,8 @@ SIInsertWaitcnts::getPreheaderFlushFlags(MachineLoop *ML,
// VMEM flush decision
if (!VMemInvalidated && UsesVgprVMEMLoadedOutside &&
- ((!ST->hasVscnt() && HasVMemStore && !HasVMemLoad) ||
- (HasVMemLoad && ST->hasVmemWriteVgprInOrder())))
+ ((!ST.hasVscnt() && HasVMemStore && !HasVMemLoad) ||
+ (HasVMemLoad && ST.hasVmemWriteVgprInOrder())))
Flags.FlushVmCnt = true;
// DS flush decision:
@@ -3624,19 +3623,15 @@ SIInsertWaitcntsPass::run(MachineFunction &MF,
}
bool SIInsertWaitcnts::run() {
- ST = &MF.getSubtarget<GCNSubtarget>();
- TII = ST->getInstrInfo();
- TRI = &TII->getRegisterInfo();
- MRI = &MF.getRegInfo();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
+ AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
// Initialize hardware limits first, as they're needed by the generators.
Limits = AMDGPU::HardwareLimits(IV);
- if (ST->hasExtendedWaitCounts()) {
- IsExpertMode = ST->hasExpertSchedulingMode() &&
+ if (ST.hasExtendedWaitCounts()) {
+ IsExpertMode = ST.hasExpertSchedulingMode() &&
(ExpertSchedulingModeFlag.getNumOccurrences()
? ExpertSchedulingModeFlag
: MF.getFunction()
@@ -3644,21 +3639,17 @@ bool SIInsertWaitcnts::run() {
.getValueAsBool());
MaxCounter = IsExpertMode ? NUM_EXPERT_INST_CNTS : NUM_EXTENDED_INST_CNTS;
// Initialize WCG per MF. It contains state that depends on MF attributes.
- WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, &Limits,
+ WCG = std::make_unique<WaitcntGeneratorGFX12Plus>(MF, MaxCounter, Limits,
IsExpertMode);
} else {
MaxCounter = NUM_NORMAL_INST_CNTS;
// Initialize WCG per MF. It contains state that depends on MF attributes.
WCG = std::make_unique<WaitcntGeneratorPreGFX12>(MF, NUM_NORMAL_INST_CNTS,
- &Limits);
+ Limits);
}
- for (auto T : inst_counter_types())
- ForceEmitWaitcnt[T] = false;
-
SmemAccessCounter = getCounterFromEvent(SMEM_ACCESS);
- BlockInfos.clear();
bool Modified = false;
MachineBasicBlock &EntryBB = MF.front();
@@ -3674,29 +3665,29 @@ bool SIInsertWaitcnts::run() {
while (I != EntryBB.end() && I->isMetaInstruction())
++I;
- if (ST->hasExtendedWaitCounts()) {
- BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
+ if (ST.hasExtendedWaitCounts()) {
+ BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
.addImm(0);
for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT || CT == X_CNT)
continue;
- if (!ST->hasImageInsts() &&
+ if (!ST.hasImageInsts() &&
(CT == EXP_CNT || CT == SAMPLE_CNT || CT == BVH_CNT))
continue;
BuildMI(EntryBB, I, DebugLoc(),
- TII->get(instrsForExtendedCounterTypes[CT]))
+ TII.get(instrsForExtendedCounterTypes[CT]))
.addImm(0);
}
if (IsExpertMode) {
- unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, *ST);
+ unsigned Enc = AMDGPU::DepCtr::encodeFieldVaVdst(0, ST);
Enc = AMDGPU::DepCtr::encodeFieldVmVsrc(Enc, 0);
- BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_DEPCTR))
+ BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT_DEPCTR))
.addImm(Enc);
}
} else {
- BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
+ BuildMI(EntryBB, I, DebugLoc(), TII.get(AMDGPU::S_WAITCNT)).addImm(0);
}
auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(this);
@@ -3740,7 +3731,7 @@ bool SIInsertWaitcnts::run() {
}
}
- if (ST->hasWaitXcnt())
+ if (ST.hasWaitXcnt())
Modified |= removeRedundantSoftXcnts(*MBB);
Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
BI.Dirty = false;
@@ -3784,13 +3775,13 @@ bool SIInsertWaitcnts::run() {
}
} while (Repeat);
- if (ST->hasScalarStores()) {
+ if (ST.hasScalarStores()) {
SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
bool HaveScalarStores = false;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
- if (!HaveScalarStores && TII->isScalarStore(MI))
+ if (!HaveScalarStores && TII.isScalarStore(MI))
HaveScalarStores = true;
if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
@@ -3815,7 +3806,7 @@ bool SIInsertWaitcnts::run() {
I != E; ++I) {
if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
SeenDCacheWB = true;
- else if (TII->isScalarStore(*I))
+ else if (TII.isScalarStore(*I))
SeenDCacheWB = false;
// FIXME: It would be better to insert this before a waitcnt if any.
@@ -3823,7 +3814,7 @@ bool SIInsertWaitcnts::run() {
I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
!SeenDCacheWB) {
Modified = true;
- BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
+ BuildMI(*MBB, I, I->getDebugLoc(), TII.get(AMDGPU::S_DCACHE_WB));
}
}
}
@@ -3860,37 +3851,31 @@ bool SIInsertWaitcnts::run() {
if (!WCG->isOptNone() && MFI->isDynamicVGPREnabled()) {
for (auto [MI, _] : EndPgmInsts) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(AMDGPU::S_ALLOC_VGPR))
+ TII.get(AMDGPU::S_ALLOC_VGPR))
.addImm(0);
Modified = true;
}
} else if (!WCG->isOptNone() &&
- ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
+ ST.getGeneration() >= AMDGPUSubtarget::GFX11 &&
(MF.getFrameInfo().hasCalls() ||
- ST->getOccupancyWithNumVGPRs(
- TRI->getNumUsedPhysRegs(*MRI, AMDGPU::VGPR_32RegClass),
+ ST.getOccupancyWithNumVGPRs(
+ TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass),
/*IsDynamicVGPR=*/false) <
- AMDGPU::IsaInfo::getMaxWavesPerEU(ST))) {
+ AMDGPU::IsaInfo::getMaxWavesPerEU(&ST))) {
for (auto [MI, Flag] : EndPgmInsts) {
if (Flag) {
- if (ST->requiresNopBeforeDeallocVGPRs()) {
+ if (ST.requiresNopBeforeDeallocVGPRs()) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(AMDGPU::S_NOP))
+ TII.get(AMDGPU::S_NOP))
.addImm(0);
}
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
- TII->get(AMDGPU::S_SENDMSG))
+ TII.get(AMDGPU::S_SENDMSG))
.addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
Modified = true;
}
}
}
- CallInsts.clear();
- ReturnInsts.clear();
- EndPgmInsts.clear();
- PreheadersToFlush.clear();
- SLoadAddresses.clear();
-
return Modified;
}
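
A quick aside on the SIInsertWaitcnts change above: the per-run pointer members (ST, TII, TRI, MRI) become const references bound once in the constructor's init list, so call sites drop the '->' style and run() no longer re-derives them. A minimal standalone sketch of that pattern; all class names here are illustrative, not LLVM's:

  #include <iostream>

  struct InstrInfo { int latency(int op) const { return op + 1; } };

  struct Subtarget {
    InstrInfo II;
    const InstrInfo &getInstrInfo() const { return II; }
    bool hasFeature() const { return true; }
  };

  struct Func {
    Subtarget ST;
    const Subtarget &getSubtarget() const { return ST; }
  };

  // Before: pointer members filled in lazily inside run().
  // After: reference members bound exactly once, in declaration order.
  class Pass {
    const Subtarget &ST;   // bound from the function in the ctor
    const InstrInfo &TII;  // bound from ST, so ST must be declared first

  public:
    explicit Pass(Func &MF)
        : ST(MF.getSubtarget()), TII(ST.getInstrInfo()) {}

    int run() const {
      // No null checks and no '->'; the members are always valid.
      return ST.hasFeature() ? TII.latency(2) : 0;
    }
  };

  int main() {
    Func F;
    std::cout << Pass(F).run() << '\n'; // prints 3
    return 0;
  }

The only subtlety is that members are initialized in declaration order, so dependent references (TII from ST) must be declared after the members they read, which the patch's member ordering respects.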
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0ab3f079c4299..de118f3dbbf14 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4628,7 +4628,8 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
// given the typical code patterns.
if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
isEXP(Opcode) || Opcode == AMDGPU::DS_ORDERED_COUNT ||
- Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT)
+ Opcode == AMDGPU::S_TRAP || Opcode == AMDGPU::S_WAIT_EVENT ||
+ Opcode == AMDGPU::S_SETHALT)
return true;
if (MI.isCall() || MI.isInlineAsm())
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 0b8c71a4a2453..ee8465697cb54 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -442,30 +442,11 @@ bool Vreg1LoweringHelper::lowerCopiesFromI1() {
PhiLoweringHelper::PhiLoweringHelper(MachineFunction *MF,
MachineDominatorTree *DT,
MachinePostDominatorTree *PDT)
- : MF(MF), DT(DT), PDT(PDT) {
+ : MF(MF), DT(DT), PDT(PDT), ST(&MF->getSubtarget<GCNSubtarget>()),
+ LMC(&AMDGPU::LaneMaskConstants::get(*ST)) {
MRI = &MF->getRegInfo();
- ST = &MF->getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
- IsWave32 = ST->isWave32();
-
- if (IsWave32) {
- ExecReg = AMDGPU::EXEC_LO;
- MovOp = AMDGPU::S_MOV_B32;
- AndOp = AMDGPU::S_AND_B32;
- OrOp = AMDGPU::S_OR_B32;
- XorOp = AMDGPU::S_XOR_B32;
- AndN2Op = AMDGPU::S_ANDN2_B32;
- OrN2Op = AMDGPU::S_ORN2_B32;
- } else {
- ExecReg = AMDGPU::EXEC;
- MovOp = AMDGPU::S_MOV_B64;
- AndOp = AMDGPU::S_AND_B64;
- OrOp = AMDGPU::S_OR_B64;
- XorOp = AMDGPU::S_XOR_B64;
- AndN2Op = AMDGPU::S_ANDN2_B64;
- OrN2Op = AMDGPU::S_ORN2_B64;
- }
}
bool PhiLoweringHelper::lowerPhis() {
@@ -677,7 +658,7 @@ bool PhiLoweringHelper::isConstantLaneMask(Register Reg, bool &Val) const {
return false;
}
- if (MI->getOpcode() != MovOp)
+ if (MI->getOpcode() != LMC->MovOpc)
return false;
if (!MI->getOperand(1).isImm())
@@ -795,10 +776,10 @@ void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
if (PrevVal == CurVal) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
} else if (CurVal) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(ExecReg);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(LMC->ExecReg);
} else {
- BuildMI(MBB, I, DL, TII->get(XorOp), DstReg)
- .addReg(ExecReg)
+ BuildMI(MBB, I, DL, TII->get(LMC->XorOpc), DstReg)
+ .addReg(LMC->ExecReg)
.addImm(-1);
}
return;
@@ -811,9 +792,9 @@ void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
PrevMaskedReg = PrevReg;
} else {
PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
- BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
+ BuildMI(MBB, I, DL, TII->get(LMC->AndN2Opc), PrevMaskedReg)
.addReg(PrevReg)
- .addReg(ExecReg);
+ .addReg(LMC->ExecReg);
}
}
if (!CurConstant) {
@@ -822,9 +803,9 @@ void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
CurMaskedReg = CurReg;
} else {
CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
- BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
+ BuildMI(MBB, I, DL, TII->get(LMC->AndOpc), CurMaskedReg)
.addReg(CurReg)
- .addReg(ExecReg);
+ .addReg(LMC->ExecReg);
}
}
@@ -835,13 +816,13 @@ void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
.addReg(PrevMaskedReg);
} else if (PrevConstant && PrevVal) {
- BuildMI(MBB, I, DL, TII->get(OrN2Op), DstReg)
+ BuildMI(MBB, I, DL, TII->get(LMC->OrN2Opc), DstReg)
.addReg(CurMaskedReg)
- .addReg(ExecReg);
+ .addReg(LMC->ExecReg);
} else {
- BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
+ BuildMI(MBB, I, DL, TII->get(LMC->OrOpc), DstReg)
.addReg(PrevMaskedReg)
- .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
+ .addReg(CurMaskedReg ? CurMaskedReg : LMC->ExecReg);
}
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.h b/llvm/lib/Target/AMDGPU/SILowerI1Copies.h
index fd90328c2b926..8eb587cc025d6 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.h
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.h
@@ -12,6 +12,7 @@
//
//===----------------------------------------------------------------------===//
+#include "AMDGPULaneMaskUtils.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachinePostDominators.h"
@@ -20,7 +21,7 @@
namespace llvm {
-/// Incoming for lane maks phi as machine instruction, incoming register \p Reg
+/// Incoming for lane mask phi as machine instruction, incoming register \p Reg
/// and incoming block \p Block are taken from machine instruction.
/// \p UpdatedReg (if valid) is \p Reg lane mask merged with another lane mask.
struct Incoming {
@@ -50,19 +51,12 @@ class PhiLoweringHelper {
const GCNSubtarget *ST = nullptr;
const SIInstrInfo *TII = nullptr;
MachineRegisterInfo::VRegAttrs LaneMaskRegAttrs;
+ const AMDGPU::LaneMaskConstants *LMC = nullptr;
#ifndef NDEBUG
DenseSet<Register> PhiRegisters;
#endif
- Register ExecReg;
- unsigned MovOp;
- unsigned AndOp;
- unsigned OrOp;
- unsigned XorOp;
- unsigned AndN2Op;
- unsigned OrN2Op;
-
public:
bool lowerPhis();
bool isConstantLaneMask(Register Reg, bool &Val) const;
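
For the PhiLoweringHelper change above, the seven per-instance fields move into a shared const AMDGPU::LaneMaskConstants object selected from the subtarget. The header defining it is not part of this hunk, so the sketch below only assumes the shape implied by the call sites (ExecReg, MovOpc, AndOpc, OrOpc, XorOpc, AndN2Opc, OrN2Opc) and swaps the subtarget argument for a plain IsWave32 flag; the opcode and register numbers are placeholders:

  #include <iostream>

  // Assumed shape only: two static tables selected by wave size, so every
  // user shares one const object instead of copying seven opcode fields.
  struct LaneMaskConstantsSketch {
    unsigned ExecReg, MovOpc, AndOpc, OrOpc, XorOpc, AndN2Opc, OrN2Opc;

    static const LaneMaskConstantsSketch &get(bool IsWave32) {
      static const LaneMaskConstantsSketch Wave32 = {1, 10, 11, 12, 13, 14, 15};
      static const LaneMaskConstantsSketch Wave64 = {2, 20, 21, 22, 23, 24, 25};
      return IsWave32 ? Wave32 : Wave64;
    }
  };

  int main() {
    const LaneMaskConstantsSketch &LMC =
        LaneMaskConstantsSketch::get(/*IsWave32=*/true);
    // Call sites in the patch go through a pointer (LMC->XorOpc); here the
    // same fields are read through a reference.
    std::cout << LMC.XorOpc << ' ' << LMC.ExecReg << '\n';
    return 0;
  }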
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 9160a42b7b37c..dc56d746e1a8e 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1495,6 +1495,14 @@ unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize) {
assert(WavesPerEU != 0);
+ // In dynamic VGPR mode, (static) occupancy does not depend on VGPR usage,
+ // so getMaxNumVGPRs does not depend on WavesPerEU, and thus we need to return
+ // zero because there is no nonzero VGPR usage N where going below N
+ // achieves higher (static) occupancy.
+ bool DynamicVGPREnabled = (DynamicVGPRBlockSize != 0);
+ if (DynamicVGPREnabled)
+ return 0;
+
unsigned MaxWavesPerEU = getMaxWavesPerEU(STI);
if (WavesPerEU >= MaxWavesPerEU)
return 0;
@@ -1522,9 +1530,13 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
unsigned DynamicVGPRBlockSize) {
assert(WavesPerEU != 0);
+ // In dynamic VGPR mode, WavesPerEU does not imply a VGPR limit.
+ bool DynamicVGPREnabled = (DynamicVGPRBlockSize != 0);
unsigned MaxNumVGPRs =
- alignDown(getTotalNumVGPRs(STI) / WavesPerEU,
- getVGPRAllocGranule(STI, DynamicVGPRBlockSize));
+ DynamicVGPREnabled
+ ? getTotalNumVGPRs(STI)
+ : alignDown(getTotalNumVGPRs(STI) / WavesPerEU,
+ getVGPRAllocGranule(STI, DynamicVGPRBlockSize));
unsigned AddressableNumVGPRs =
getAddressableNumVGPRs(STI, DynamicVGPRBlockSize);
return std::min(MaxNumVGPRs, AddressableNumVGPRs);
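
For the AMDGPUBaseInfo change above, a host-side sketch of the new getMaxNumVGPRs shape: when dynamic VGPRs are enabled (block size != 0), WavesPerEU no longer caps the budget and only the addressable limit applies. The totals, granule and addressable limit below are made-up placeholders; only the control flow mirrors the patch:

  #include <algorithm>
  #include <cassert>
  #include <iostream>

  unsigned alignDownTo(unsigned V, unsigned A) { return V - V % A; }

  unsigned maxNumVGPRsSketch(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize,
                             unsigned TotalVGPRs = 1536, unsigned Granule = 8,
                             unsigned Addressable = 256) {
    assert(WavesPerEU != 0);
    bool DynamicVGPREnabled = DynamicVGPRBlockSize != 0;
    unsigned Max = DynamicVGPREnabled
                       ? TotalVGPRs
                       : alignDownTo(TotalVGPRs / WavesPerEU, Granule);
    return std::min(Max, Addressable);
  }

  int main() {
    std::cout << maxNumVGPRsSketch(8, 0) << '\n';  // 192: capped by WavesPerEU
    std::cout << maxNumVGPRsSketch(8, 16) << '\n'; // 256: only the addressable cap
    return 0;
  }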
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 228ed68e386b4..970c962197ac0 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -5199,6 +5199,12 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
}
+
+ // (SELECT_CC setlt, x, 0, 1, 0) -> SRL(x, bw-1)
+ if (CC == ISD::SETLT && isNullConstant(RHS) && isOneConstant(TrueVal) &&
+ isNullConstant(FalseVal) && LHS.getValueType() == VT)
+ return DAG.getNode(ISD::SRL, dl, VT, LHS,
+ DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
}
if (LHS.getValueType() == MVT::i32) {
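
The ARMISelLowering fold above rewrites (SELECT_CC setlt, x, 0, 1, 0) as a logical right shift that moves the sign bit into bit 0. A tiny standalone check of the identity it relies on for i32:

  #include <cstdint>
  #include <iostream>

  int main() {
    // (x < 0) ? 1 : 0  ==  (unsigned)x >> 31 for 32-bit x.
    const int32_t Tests[] = {0, 1, -1, 42, -42, INT32_MIN, INT32_MAX};
    for (int32_t X : Tests) {
      uint32_t ViaSelect = (X < 0) ? 1u : 0u;
      uint32_t ViaShift = static_cast<uint32_t>(X) >> 31;
      std::cout << X << " select=" << ViaSelect << " srl=" << ViaShift << '\n';
    }
    return 0;
  }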
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 5b655962685d3..f8dbc7907f64b 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -530,7 +530,7 @@ void ARMPassConfig::addPreEmitPass() {
// Unpack bundles for:
// - Thumb2: Constant island pass requires unbundled instructions
// - KCFI: KCFI_CHECK pseudo instructions need to be unbundled for AsmPrinter
- addPass(createUnpackMachineBundles([](const MachineFunction &MF) {
+ addPass(createUnpackMachineBundlesLegacy([](const MachineFunction &MF) {
return MF.getSubtarget<ARMSubtarget>().isThumb2() ||
MF.getFunction().getParent()->getModuleFlag("kcfi");
}));
diff --git a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
index 7717490b3e78d..393b556c9b210 100644
--- a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
@@ -2496,26 +2496,36 @@ template <> bool AVRExpandPseudo::expand<AVR::ZEXT>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::SPREAD>(Block &MBB, BlockIt MBBI) {
+ const AVRSubtarget &STI = MBB.getParent()->getSubtarget<AVRSubtarget>();
MachineInstr &MI = *MBBI;
Register DstLoReg, DstHiReg;
Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
unsigned Flags = MI.getFlags();
- unsigned OpLo = AVR::INRdA;
- unsigned OpHi = AVR::INRdA;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
// Low part
- buildMI(MBB, MBBI, OpLo)
+ buildMI(MBB, MBBI, AVR::INRdA)
.addReg(DstLoReg, RegState::Define | getDeadRegState(DstIsDead))
- .addImm(0x3d)
+ .addImm(STI.getIORegSPL())
.setMIFlags(Flags);
// High part
- buildMI(MBB, MBBI, OpHi)
- .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
- .addImm(0x3e)
- .setMIFlags(Flags);
+ if (STI.getIORegSPH() != -1) {
+ buildMI(MBB, MBBI, AVR::INRdA)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addImm(STI.getIORegSPH())
+ .setMIFlags(Flags);
+ } else {
+ // Clear the upper byte if there is no SPH.
+ auto MI0 =
+ buildMI(MBB, MBBI, AVR::EORRdRr)
+ .addReg(DstHiReg, RegState::Define | getDeadRegState(DstIsDead))
+ .addReg(DstHiReg, RegState::Kill)
+ .addReg(DstHiReg);
+ // SREG is implicitly dead.
+ MI0->getOperand(3).setIsDead();
+ }
MI.eraseFromParent();
return true;
@@ -2536,7 +2546,6 @@ bool AVRExpandPseudo::expand<AVR::SPWRITE>(Block &MBB, BlockIt MBBI) {
// a write to SPL will automatically disable interrupts
// for up to four instructions or until the next I/O memory write.
if (STI.getELFArch() >= 102) { // An XMEGA device
-
buildMI(MBB, MBBI, AVR::OUTARr)
.addImm(STI.getIORegSPL())
.addReg(SrcLoReg, getKillRegState(SrcIsKill))
@@ -2546,26 +2555,26 @@ bool AVRExpandPseudo::expand<AVR::SPWRITE>(Block &MBB, BlockIt MBBI) {
.addImm(STI.getIORegSPH())
.addReg(SrcHiReg, getKillRegState(SrcIsKill))
.setMIFlags(Flags);
+ } else { // Disable interrupts for older devices with SPH (3 extra
+ // instructions)
+ if (STI.getIORegSPH() != -1) {
+ buildMI(MBB, MBBI, AVR::INRdA)
+ .addReg(STI.getTmpRegister(), RegState::Define)
+ .addImm(STI.getIORegSREG())
+ .setMIFlags(Flags);
- } else { // Disable interrupts for older devices (3 extra instructions)
-
- buildMI(MBB, MBBI, AVR::INRdA)
- .addReg(STI.getTmpRegister(), RegState::Define)
- .addImm(STI.getIORegSREG())
- .setMIFlags(Flags);
-
- buildMI(MBB, MBBI, AVR::BCLRs).addImm(0x07).setMIFlags(Flags);
+ buildMI(MBB, MBBI, AVR::BCLRs).addImm(0x07).setMIFlags(Flags);
- if (STI.getIORegSPH() != -1)
buildMI(MBB, MBBI, AVR::OUTARr)
.addImm(STI.getIORegSPH())
.addReg(SrcHiReg, getKillRegState(SrcIsKill))
.setMIFlags(Flags);
- buildMI(MBB, MBBI, AVR::OUTARr)
- .addImm(STI.getIORegSREG())
- .addReg(STI.getTmpRegister(), RegState::Kill)
- .setMIFlags(Flags);
+ buildMI(MBB, MBBI, AVR::OUTARr)
+ .addImm(STI.getIORegSREG())
+ .addReg(STI.getTmpRegister(), RegState::Kill)
+ .setMIFlags(Flags);
+ }
buildMI(MBB, MBBI, AVR::OUTARr)
.addImm(STI.getIORegSPL())
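
For the AVR::SPREAD expansion above, a host-side model of the new behaviour: SPL is always read, SPH only when the device defines one (getIORegSPH() == -1 otherwise), and the high byte is cleared in that case, matching the EOR DstHi, DstHi sequence. The 0x3d/0x3e addresses are just the classic values, used here for illustration:

  #include <cstdint>
  #include <cstdio>

  static uint8_t IOSpace[0x40] = {};

  uint16_t readSPSketch(int SPLAddr, int SPHAddr) {
    uint8_t Lo = IOSpace[SPLAddr];                   // IN DstLo, SPL
    uint8_t Hi = (SPHAddr != -1) ? IOSpace[SPHAddr]  // IN DstHi, SPH
                                 : uint8_t(0);       // EOR DstHi, DstHi
    return uint16_t(uint16_t(Hi) << 8 | Lo);
  }

  int main() {
    IOSpace[0x3d] = 0x5f; // SPL
    IOSpace[0x3e] = 0x08; // SPH, on devices that have it
    std::printf("with SPH:    0x%04x\n", readSPSketch(0x3d, 0x3e)); // 0x085f
    std::printf("without SPH: 0x%04x\n", readSPSketch(0x3d, -1));   // 0x005f
    return 0;
  }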
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
index b5913b8575f5d..255ad3315365f 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -25,7 +25,7 @@ class BPFMCAsmInfo : public MCAsmInfoELF {
IsLittleEndian = false;
InternalSymbolPrefix = ".L";
- PrivateLabelPrefix = "L";
+ PrivateLabelPrefix = ".L";
WeakRefDirective = "\t.weak\t";
UsesELFSectionDirectiveForBSS = true;
diff --git a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
index 95577dd668e1e..5fdc498db57e1 100644
--- a/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
+++ b/llvm/lib/Target/DirectX/DXContainerGlobals.cpp
@@ -194,10 +194,10 @@ void DXContainerGlobals::addResourcesForPSV(Module &M, PSVRuntimeInfo &PSV) {
BindInfo.Type = Type;
BindInfo.LowerBound = Binding.LowerBound;
assert(
- (Binding.Size == UINT32_MAX ||
+ (Binding.Size == 0 ||
(uint64_t)Binding.LowerBound + Binding.Size - 1 <= UINT32_MAX) &&
"Resource range is too large");
- BindInfo.UpperBound = (Binding.Size == UINT32_MAX)
+ BindInfo.UpperBound = (Binding.Size == 0)
? UINT32_MAX
: Binding.LowerBound + Binding.Size - 1;
BindInfo.Space = Binding.Space;
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 0c0830cc92aa7..794bae8fd3a34 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -354,9 +354,8 @@ class OpLowerer {
// For `CreateHandleFromBinding` we need the upper bound rather than the
// size, so we need to be careful about the difference for "unbounded".
- uint32_t Unbounded = std::numeric_limits<uint32_t>::max();
- uint32_t UpperBound = Binding.Size == Unbounded
- ? Unbounded
+ uint32_t UpperBound = Binding.Size == 0
+ ? std::numeric_limits<uint32_t>::max()
: Binding.LowerBound + Binding.Size - 1;
Constant *ResBind = OpBuilder.getResBind(Binding.LowerBound, UpperBound,
Binding.Space, RC);
diff --git a/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp b/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp
index 9da3bdb8d59b2..12e54ad2cc73e 100644
--- a/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp
+++ b/llvm/lib/Target/DirectX/DXILPrettyPrinter.cpp
@@ -214,7 +214,7 @@ struct FormatBindingSize
void format(llvm::raw_ostream &OS, StringRef Style) override {
uint32_t Size = Item.getBinding().Size;
- if (Size == std::numeric_limits<uint32_t>::max())
+ if (Size == 0)
OS << "unbounded";
else
OS << Size;
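
The three DirectX hunks above switch the sentinel for "unbounded" resource ranges from Size == UINT32_MAX to Size == 0. A small sketch of the upper-bound computation under the new convention:

  #include <cstdint>
  #include <iostream>
  #include <limits>

  // Size == 0 means "unbounded"; anything else is the finite range
  // [LowerBound, LowerBound + Size - 1].
  uint32_t upperBoundSketch(uint32_t LowerBound, uint32_t Size) {
    return Size == 0 ? std::numeric_limits<uint32_t>::max()
                     : LowerBound + Size - 1;
  }

  int main() {
    std::cout << upperBoundSketch(4, 8) << '\n'; // 11
    std::cout << upperBoundSketch(4, 0) << '\n'; // 4294967295, i.e. unbounded
    return 0;
  }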
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index 0c1b0aea41f41..0212dffa38645 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -316,6 +316,7 @@ unsigned HexagonTargetObjectFile::getSmallestAddressableSize(const Type *Ty,
case Type::HalfTyID:
case Type::FloatTyID:
case Type::DoubleTyID:
+ case Type::ByteTyID:
case Type::IntegerTyID: {
const DataLayout &DL = GV->getDataLayout();
// It is unfortunate that DL's function take non-const Type*.
diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
index 0696813040654..6a8869c5d708a 100644
--- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
+++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp
@@ -136,8 +136,6 @@ static void fixupLeb128(MCContext &Ctx, const MCFixup &Fixup, uint8_t *Data,
unsigned I;
for (I = 0; Value; ++I, Value >>= 7)
Data[I] |= uint8_t(Value & 0x7f);
- if (Value)
- Ctx.reportError(Fixup.getLoc(), "Invalid uleb128 value!");
}
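
fixupLeb128 above patches the value into an already-emitted uleb128 field 7 bits at a time, and the deleted reportError could never fire since the loop only exits once Value reaches zero. A standalone sketch of the same loop, assuming the destination field was pre-encoded wide enough:

  #include <cstdint>
  #include <cstdio>

  // Each destination byte already has its continuation bit set as needed,
  // so the fixup only ORs in the payload bits.
  void fixupLeb128Sketch(uint8_t *Data, uint64_t Value) {
    for (unsigned I = 0; Value; ++I, Value >>= 7)
      Data[I] |= uint8_t(Value & 0x7f);
  }

  int main() {
    // A 2-byte field pre-encoded as uleb128(0): 0x80 0x00.
    uint8_t Field[2] = {0x80, 0x00};
    fixupLeb128Sketch(Field, 300); // uleb128(300) is 0xac 0x02
    std::printf("%02x %02x\n", unsigned(Field[0]), unsigned(Field[1]));
    return 0;
  }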
void LoongArchAsmBackend::applyFixup(const MCFragment &F, const MCFixup &Fixup,
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 26f5f3f5160f5..a942989dbe5dd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -6164,8 +6164,8 @@ class Tcgen05MMAInst<bit IsSparse, string ASpace, string Kind, int CtaGroup,
let AsmString = Prefix
# SpCtaKindStr
- # ".collector::a::" # CollectorUsage
# !if(IsAShift, ".ashift", "")
+ # ".collector::a::" # CollectorUsage
# BaseOperandsStr
# InputDStr
# ScaleInpStr
diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index b07f95018ca90..7b07c0881d453 100644
--- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -458,7 +458,7 @@ struct PPCOperand : public MCParsedAsmOperand {
Inst.addOperand(MCOperand::createReg(RRegs[getRegNum()]));
}
- void addRegGPRCNoR0Operands(MCInst &Inst, unsigned N) const {
+ void addRegGPRC_NOR0Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(RRegsNoR0[getRegNum()]));
}
@@ -468,7 +468,7 @@ struct PPCOperand : public MCParsedAsmOperand {
Inst.addOperand(MCOperand::createReg(XRegs[getRegNum()]));
}
- void addRegG8RCNoX0Operands(MCInst &Inst, unsigned N) const {
+ void addRegG8RC_NOX0Operands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
Inst.addOperand(MCOperand::createReg(XRegsNoX0[getRegNum()]));
}
@@ -487,9 +487,9 @@ struct PPCOperand : public MCParsedAsmOperand {
void addRegGxRCNoR0Operands(MCInst &Inst, unsigned N) const {
if (isPPC64())
- addRegG8RCNoX0Operands(Inst, N);
+ addRegG8RC_NOX0Operands(Inst, N);
else
- addRegGPRCNoR0Operands(Inst, N);
+ addRegGPRC_NOR0Operands(Inst, N);
}
void addRegF4RCOperands(MCInst &Inst, unsigned N) const {
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index c0abbf6f50804..7b4bae60f7e74 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -292,6 +292,10 @@ def FeatureP10Vector : SubtargetFeature<"power10-vector", "HasP10Vector",
"true",
"Enable POWER10 vector instructions",
[FeatureISA3_1, FeatureP9Vector]>;
+def FeatureFutureVector : SubtargetFeature<"future-vector", "HasFutureVector",
+ "true",
+ "Enable Future vector instructions",
+ [FeatureISAFuture, FeatureP10Vector]>;
// A separate feature for this even though it is equivalent to P9Vector
// because this is a feature of the implementation rather than the architecture
// and may go away with future CPUs.
@@ -310,7 +314,7 @@ def FeaturePCRelativeMemops :
def FeaturePairedVectorMemops:
SubtargetFeature<"paired-vector-memops", "PairedVectorMemops", "true",
"32Byte load and store instructions",
- [FeatureISA3_0]>;
+ [FeatureISA3_0, FeatureVSX]>;
def FeatureMMA : SubtargetFeature<"mma", "HasMMA", "true",
"Enable MMA instructions",
[FeatureP8Vector, FeatureP9Altivec,
@@ -400,6 +404,7 @@ def HasP9Altivec : Predicate<"Subtarget->hasP9Altivec()">;
def HasOnlySwappingMemOps : Predicate<"!Subtarget->hasP9Vector()">;
def NoP10Vector : Predicate<"!Subtarget->hasP10Vector()">;
def HasP10Vector : Predicate<"Subtarget->hasP10Vector()">;
+def HasFutureVector : Predicate<"Subtarget->hasFutureVector()">;
// Predicates used to differentiate between different ISAs.
def IsISA2_06 : Predicate<"Subtarget->isISA2_06()">;
@@ -554,7 +559,8 @@ def ProcessorFeatures {
// For future CPU we assume that all of the existing features from Power11
// still exist with the exception of those we know are Power11 specific.
list<SubtargetFeature> FutureAdditionalFeatures = [DirectivePwrFuture,
- FeatureISAFuture];
+ FeatureISAFuture,
+ FeatureFutureVector];
list<SubtargetFeature> FutureSpecificFeatures = [];
list<SubtargetFeature> FutureInheritableFeatures =
!listconcat(P11InheritableFeatures, FutureAdditionalFeatures);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 200d026dadd7b..1515ff2e13b85 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12165,12 +12165,18 @@ SDValue PPCTargetLowering::LowerVectorLoad(SDValue Op,
return Op;
// Type v256i1 is used for pairs and v512i1 is used for accumulators.
- // Here we create 2 or 4 v16i8 loads to load the pair or accumulator value in
- // 2 or 4 vsx registers.
assert((VT != MVT::v512i1 || Subtarget.hasMMA()) &&
"Type unsupported without MMA");
assert((VT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
"Type unsupported without paired vector support");
+
+ // For v256i1 on ISA Future, let the load go through to instruction selection
+ // where it will be matched to lxvp/plxvp by the instruction patterns.
+ if (VT == MVT::v256i1 && Subtarget.isISAFuture())
+ return Op;
+
+ // For other cases, create 2 or 4 v16i8 loads to load the pair or accumulator
+ // value in 2 or 4 vsx registers.
Align Alignment = LN->getAlign();
SmallVector<SDValue, 4> Loads;
SmallVector<SDValue, 4> LoadChains;
@@ -12333,12 +12339,19 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
return Op;
// Type v256i1 is used for pairs and v512i1 is used for accumulators.
- // Here we create 2 or 4 v16i8 stores to store the pair or accumulator
- // underlying registers individually.
assert((StoreVT != MVT::v512i1 || Subtarget.hasMMA()) &&
"Type unsupported without MMA");
assert((StoreVT != MVT::v256i1 || Subtarget.pairedVectorMemops()) &&
"Type unsupported without paired vector support");
+
+ // For v256i1 on ISA Future, let the store go through to instruction selection
+ // where it will be matched to stxvp/pstxvp by the instruction patterns.
+ if (StoreVT == MVT::v256i1 && Subtarget.isISAFuture() &&
+ !DisableAutoPairedVecSt)
+ return Op;
+
+ // For other cases, create 2 or 4 v16i8 stores to store the pair or
+ // accumulator underlying registers individually.
Align Alignment = SN->getAlign();
SmallVector<SDValue, 4> Stores;
unsigned NumVecs = 2;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFuture.td b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
index 717454f78e2a4..0cd63a88cb96b 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFuture.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFuture.td
@@ -390,7 +390,7 @@ let Predicates = [IsISAFuture, PrefixInstrs] in {
"paddis $RT, $RA, $SI", IIC_LdStLFD>;
}
-let Predicates = [HasVSX, IsISAFuture] in {
+let Predicates = [HasFutureVector] in {
let mayLoad = 1 in {
def LXVRL : XX1Form_memOp<31, 525, (outs vsrc:$XT),
(ins (memr $RA):$addr, g8rc:$RB),
@@ -571,7 +571,7 @@ let Predicates = [HasVSX, IsISAFuture] in {
"xsrebase3t3uqm $XT, $XA, $XB", []>;
}
-let Predicates = [HasVSX, PrefixInstrs, IsISAFuture] in {
+let Predicates = [HasFutureVector, PrefixInstrs] in {
def XXSSUMUDMCEXT
: 8RR_XX4Form_XTABC6_P<
34, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB, vsrc:$XC, u1imm:$P),
@@ -596,7 +596,18 @@ def : Pat<(int_ppc_vsx_stxvprl v256i1:$XTp, addr:$RA, i64:$RB), (STXVPRL $XTp,
$RA, $RB)>;
def : Pat<(int_ppc_vsx_stxvprll v256i1:$XTp, addr:$RA, i64:$RB), (STXVPRLL $XTp,
$RA, $RB)>;
-let Predicates = [HasVSX, IsISAFuture] in {
+
+// Regular load/store patterns for v256i1 (for ISA Future)
+let Predicates = [HasFutureVector, PairedVectorMemops] in {
+ def : Pat<(v256i1 (load iaddrX16:$src)), (LXVP iaddrX16:$src)>;
+ def : Pat<(v256i1 (load PDForm:$src)), (PLXVP memri34:$src)>;
+ def : Pat<(v256i1 (load xoaddr:$src)), (LXVPX xoaddr:$src)>;
+ def : Pat<(store v256i1:$XSp, iaddrX16:$dst), (STXVP $XSp, iaddrX16:$dst)>;
+ def : Pat<(store v256i1:$XSp, PDForm:$dst), (PSTXVP $XSp, memri34:$dst)>;
+ def : Pat<(store v256i1:$XSp, xoaddr:$dst), (STXVPX $XSp, xoaddr:$dst)>;
+}
+
+let Predicates = [HasFutureVector] in {
def : Pat<(v4i32 (rotl v4i32:$vA, v4i32:$vB)), (v4i32 (XVRLW v4i32:$vA,
v4i32:$vB))>;
}
@@ -605,7 +616,7 @@ let Predicates = [HasVSX, IsISAFuture] in {
// Predicate combinations available:
// [HasVSX, IsISAFuture]
-let Predicates = [HasVSX, IsISAFuture] in {
+let Predicates = [HasFutureVector] in {
def : InstAlias<"xxaes128encp $XTp, $XAp, $XBp",
(XXAESENCP vsrprc:$XTp, vsrprc:$XAp, vsrprc:$XBp, 0)>;
def : InstAlias<"xxaes192encp $XTp, $XAp, $XBp",
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index cc5b7d90fd189..3271e4d279f56 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -733,6 +733,7 @@ def nzFPImmExactInti5 : PatLeaf<(fpimm), [{
return IsExact && IntResult <= 15 && IntResult >= -16 && !FloatValue.isZero();
}]>;
+// Floating point zero immediates (positive and negative)
def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>;
def fpimm0neg : PatLeaf<(fpimm), [{return N->isExactlyValue(-0.0);}]>;
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterClasses.td b/llvm/lib/Target/PowerPC/PPCRegisterClasses.td
new file mode 100644
index 0000000000000..dac74eb352d8c
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCRegisterClasses.td
@@ -0,0 +1,101 @@
+//===-- PPCRegisterClasses.td - Register Class Definitions -*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines base classes for PowerPC register classes to reduce
+// repetition and make it easier to define new register classes.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Base Register Class Definitions
+//===----------------------------------------------------------------------===//
+
+// Base class for all PPC register classes - sets namespace to "PPC"
+class PPCRegisterClass<list<ValueType> regTypes, int alignment, dag regList>
+ : RegisterClass<"PPC", regTypes, alignment, regList>;
+
+//===----------------------------------------------------------------------===//
+// Variant Register Class Definitions
+//===----------------------------------------------------------------------===//
+
+// Register class that is not allocatable
+class PPCNonAllocatableRegisterClass<list<ValueType> regTypes, int alignment,
+ dag regList>
+ : PPCRegisterClass<regTypes, alignment, regList> {
+ let isAllocatable = 0;
+}
+
+// Register class with explicit size
+class PPCRegisterClassWithSize<list<ValueType> regTypes, int alignment,
+ dag regList, int size>
+ : PPCRegisterClass<regTypes, alignment, regList> {
+ let Size = size;
+}
+
+// Register class with allocation priority and size
+class PPCRegisterClassWithPriority<list<ValueType> regTypes, int alignment,
+ dag regList, int allocPriority,
+ bit globalPriority, int size>
+ : PPCRegisterClass<regTypes, alignment, regList> {
+ let AllocationPriority = allocPriority;
+ let GlobalPriority = globalPriority;
+ let Size = size;
+}
+
+// GPR-style register class with alternative orders for different ABIs
+// Merged PPCRegisterClassWithAltOrders into this class since it was only used
+// here.
+class PPCGPRRegisterClass<list<ValueType> regTypes, int alignment, dag regList,
+ dag altOrder1, dag altOrder2>
+ : PPCRegisterClass<regTypes, alignment, regList> {
+ let AltOrders = [altOrder1, altOrder2];
+ let AltOrderSelect = [{
+ return MF.getSubtarget<PPCSubtarget>().getGPRAllocationOrderIdx();
+ }];
+}
+
+//===----------------------------------------------------------------------===//
+// Classes for Generating RegisterOperand for Existing RegisterClass
+//===----------------------------------------------------------------------===//
+
+// Creates a RegisterOperand for an already-defined RegisterClass, assuming the
+// AsmOperandClass already exists with the standard naming convention:
+// (PPCReg<regClassName>AsmOperand).
+// Usage: def spe4rc : PPCRegOperandOnly<"GPRC">;
+// Creates: spe4rc RegisterOperand wrapping GPRC, using PPCRegGPRCAsmOperand.
+class PPCRegOperandOnly<string regClassName>
+ : RegisterOperand<!cast<RegisterClass>(regClassName)> {
+ let ParserMatchClass =
+ !cast<AsmOperandClass>(!strconcat("PPCReg", regClassName, "AsmOperand"));
+}
+
+// Multiclass that generates both the AsmOperandClass and RegisterOperand for an
+// already-defined RegisterClass. This eliminates the repetitive pattern of
+// manually defining these for each register class.
+//
+// Usage examples:
+// defm GPRC : PPCRegOperand<"isRegNumber">;
+// Creates: PPCRegGPRCAsmOperand and gprc (lowercase of GPRC)
+// defm FpRC : PPCRegOperand<"isEvenRegNumber", "fpairrc">;
+// Creates: PPCRegFpRCAsmOperand and fpairrc (custom name)
+//
+// The NAME in the defm statement must match the RegisterClass name.
+multiclass PPCRegOperand<string predicate, string operandName = ""> {
+ // Define the AsmOperandClass.
+ def "PPCReg"#NAME#"AsmOperand" : AsmOperandClass {
+ let Name = "Reg"#NAME;
+ let PredicateMethod = predicate;
+ }
+
+ // Define the RegisterOperand with custom name if provided,
+ // otherwise use lowercase of NAME.
+ def !if(!eq(operandName, ""), !tolower(NAME), operandName)
+ : RegisterOperand<!cast<RegisterClass>(NAME)> {
+ let ParserMatchClass = !cast<AsmOperandClass>("PPCReg"#NAME#"AsmOperand");
+ }
+}
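
To make the multiclass above concrete, here is roughly what a default-named and a custom-named instantiation expand to, following the file's own documentation and the hand-written definitions being replaced below (a sketch, not TableGen output):

  // defm GPRC : PPCRegOperand<"isRegNumber">; expands to approximately:
  def PPCRegGPRCAsmOperand : AsmOperandClass {
    let Name = "RegGPRC";
    let PredicateMethod = "isRegNumber";
  }
  def gprc : RegisterOperand<GPRC> {
    let ParserMatchClass = PPCRegGPRCAsmOperand;
  }

  // defm FpRC : PPCRegOperand<"isEvenRegNumber", "fpairrc">; expands to approximately:
  def PPCRegFpRCAsmOperand : AsmOperandClass {
    let Name = "RegFpRC";
    let PredicateMethod = "isEvenRegNumber";
  }
  def fpairrc : RegisterOperand<FpRC> {
    let ParserMatchClass = PPCRegFpRCAsmOperand;
  }
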
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
index 28cfbc9991c3c..90c7be4297935 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -10,6 +10,7 @@
//===----------------------------------------------------------------------===//
include "PPCOperands.td"
+include "PPCRegisterClasses.td"
let Namespace = "PPC" in {
def sub_lt : SubRegIndex<1>;
@@ -167,9 +168,9 @@ let isArtificial = 1 in {
}
}
-let isAllocatable = 0, CopyCost = -1 in {
- def VFHRC : RegisterClass<"PPC", [f64], 64, (sequence "VFH%u", 0, 31)>;
- def FHRC : RegisterClass<"PPC", [f64], 64, (sequence "FH%u", 0, 31)>;
+let CopyCost = -1 in {
+ def VFHRC : PPCNonAllocatableRegisterClass<[f64], 64, (sequence "VFH%u", 0, 31)>;
+ def FHRC : PPCNonAllocatableRegisterClass<[f64], 64, (sequence "FH%u", 0, 31)>;
}
// Floating-point pair registers
@@ -296,71 +297,60 @@ def CARRY: SPR<1, "xer">, DwarfRegNum<[76]> {
// that do nothing but change RM will not get deleted.
def RM: PPCReg<"**ROUNDING MODE**">;
-let isAllocatable = 0 in
-def GPRC32 : RegisterClass<"PPC", [i32,f32], 32, (add (sequence "H%u", 2, 12),
- (sequence "H%u", 30, 13),
- H31, H0, H1)>;
+def GPRC32 : PPCNonAllocatableRegisterClass<[i32,f32], 32,
+ (add (sequence "H%u", 2, 12),
+ (sequence "H%u", 30, 13),
+ H31, H0, H1)>;
/// Register classes
// Allocate volatiles first
// then nonvolatiles in reverse order since stmw/lmw save from rN to r31
-def GPRC : RegisterClass<"PPC", [i32,f32], 32, (add (sequence "R%u", 2, 12),
- (sequence "R%u", 30, 13),
- R31, R0, R1, FP, BP)> {
- // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
- // put it at the end of the list.
- // On AIX, CSRs are allocated starting from R31 according to:
- // https://www.ibm.com/docs/en/ssw_aix_72/assembler/assembler_pdf.pdf.
- // This also helps setting the correct `NumOfGPRsSaved' in traceback table.
- let AltOrders = [(add (sub GPRC, R2), R2),
- (add (sequence "R%u", 2, 12),
- (sequence "R%u", 31, 13), R0, R1, FP, BP)];
- let AltOrderSelect = [{
- return MF.getSubtarget<PPCSubtarget>().getGPRAllocationOrderIdx();
- }];
-}
-
-def G8RC : RegisterClass<"PPC", [i64], 64, (add (sequence "X%u", 2, 12),
- (sequence "X%u", 30, 14),
- X31, X13, X0, X1, FP8, BP8)> {
- // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
- // put it at the end of the list.
- let AltOrders = [(add (sub G8RC, X2), X2),
- (add (sequence "X%u", 2, 12),
- (sequence "X%u", 31, 13), X0, X1, FP8, BP8)];
- let AltOrderSelect = [{
- return MF.getSubtarget<PPCSubtarget>().getGPRAllocationOrderIdx();
- }];
-}
+def GPRC : PPCGPRRegisterClass<[i32,f32], 32,
+ (add (sequence "R%u", 2, 12),
+ (sequence "R%u", 30, 13),
+ R31, R0, R1, FP, BP),
+ // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
+ // put it at the end of the list.
+ (add (sub GPRC, R2), R2),
+ // On AIX, CSRs are allocated starting from R31 according to:
+ // https://www.ibm.com/docs/en/ssw_aix_72/assembler/assembler_pdf.pdf.
+ // This also helps setting the correct `NumOfGPRsSaved' in traceback table.
+ (add (sequence "R%u", 2, 12),
+ (sequence "R%u", 31, 13), R0, R1, FP, BP)>;
+
+def G8RC : PPCGPRRegisterClass<[i64], 64,
+ (add (sequence "X%u", 2, 12),
+ (sequence "X%u", 30, 14),
+ X31, X13, X0, X1, FP8, BP8),
+ // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
+ // put it at the end of the list.
+ (add (sub G8RC, X2), X2),
+ (add (sequence "X%u", 2, 12),
+ (sequence "X%u", 31, 13), X0, X1, FP8, BP8)>;
// For some instructions r0 is special (representing the value 0 instead of
// the value in the r0 register), and we use these register subclasses to
// prevent r0 from being allocated for use by those instructions.
-def GPRC_NOR0 : RegisterClass<"PPC", [i32,f32], 32, (add (sub GPRC, R0), ZERO)> {
- // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
- // put it at the end of the list.
- let AltOrders = [(add (sub GPRC_NOR0, R2), R2),
- (add (sequence "R%u", 2, 12),
- (sequence "R%u", 31, 13), R1, FP, BP, ZERO)];
- let AltOrderSelect = [{
- return MF.getSubtarget<PPCSubtarget>().getGPRAllocationOrderIdx();
- }];
-}
-
-def G8RC_NOX0 : RegisterClass<"PPC", [i64], 64, (add (sub G8RC, X0), ZERO8)> {
- // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
- // put it at the end of the list.
- let AltOrders = [(add (sub G8RC_NOX0, X2), X2),
- (add (sequence "X%u", 2, 12),
- (sequence "X%u", 31, 13), X1, FP8, BP8, ZERO8)];
- let AltOrderSelect = [{
- return MF.getSubtarget<PPCSubtarget>().getGPRAllocationOrderIdx();
- }];
-}
-
-def SPERC : RegisterClass<"PPC", [f64], 64, (add (sequence "S%u", 2, 12),
- (sequence "S%u", 30, 13),
- S31, S0, S1)>;
+def GPRC_NOR0 : PPCGPRRegisterClass<[i32,f32], 32,
+ (add (sub GPRC, R0), ZERO),
+ // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
+ // put it at the end of the list.
+ (add (sub GPRC_NOR0, R2), R2),
+ (add (sequence "R%u", 2, 12),
+ (sequence "R%u", 31, 13), R1, FP, BP, ZERO)>;
+
+def G8RC_NOX0 : PPCGPRRegisterClass<[i64], 64,
+ (add (sub G8RC, X0), ZERO8),
+ // On non-Darwin PPC64 systems, R2 can be allocated, but must be restored, so
+ // put it at the end of the list.
+ (add (sub G8RC_NOX0, X2), X2),
+ (add (sequence "X%u", 2, 12),
+ (sequence "X%u", 31, 13), X1, FP8, BP8, ZERO8)>;
+
+def SPERC : PPCRegisterClass<[f64], 64,
+ (add (sequence "S%u", 2, 12),
+ (sequence "S%u", 30, 13),
+ S31, S0, S1)>;
// Allocate volatiles first, then non-volatiles in reverse order. With the SVR4
// ABI the size of the Floating-point register save area is determined by the
@@ -369,9 +359,9 @@ def SPERC : RegisterClass<"PPC", [f64], 64, (add (sequence "S%u", 2, 12),
// previous stack frame. By allocating non-volatiles in reverse order we make
// sure that the Floating-point register save area is always as small as
// possible because there aren't any unused spill slots.
-def F8RC : RegisterClass<"PPC", [f64], 64, (add (sequence "F%u", 0, 13),
- (sequence "F%u", 31, 14))>;
-def F4RC : RegisterClass<"PPC", [f32], 32, (add F8RC)>;
+def F8RC : PPCRegisterClass<[f64], 64, (add (sequence "F%u", 0, 13),
+ (sequence "F%u", 31, 14))>;
+def F4RC : PPCRegisterClass<[f32], 32, (add F8RC)>;
// Floating point pair registers.
// Note that the type used for this register class is ppcf128. This is not
@@ -380,47 +370,45 @@ def F4RC : RegisterClass<"PPC", [f32], 32, (add F8RC)>;
// scheduling any of these instructions it should be safe to do this.
// The reason we didn't use the correct type (Decimal Floating Point) is that
// at the time of this implementation the correct type was not available.
-def FpRC :
- RegisterClass<"PPC", [ppcf128], 128,
- (add Fpair0, Fpair2, Fpair4, Fpair6, Fpair8, Fpair10, Fpair12,
- Fpair14, Fpair16, Fpair18, Fpair20, Fpair22, Fpair24,
- Fpair26, Fpair28, Fpair30)> {
- let Size = 128;
-}
+def FpRC : PPCRegisterClassWithSize<[ppcf128], 128,
+ (add Fpair0, Fpair2, Fpair4, Fpair6, Fpair8, Fpair10, Fpair12,
+ Fpair14, Fpair16, Fpair18, Fpair20, Fpair22, Fpair24,
+ Fpair26, Fpair28, Fpair30), 128>;
-def VRRC : RegisterClass<"PPC",
- [v16i8,v8i16,v4i32,v2i64,v1i128,v4f32,v2f64, f128],
- 128,
- (add V2, V3, V4, V5, V0, V1, V6, V7, V8, V9, V10, V11,
- V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
- V29, V28, V27, V26, V25, V24, V23, V22, V21, V20)>;
+def VRRC : PPCRegisterClassWithSize<[v16i8,v8i16,v4i32,v2i64,v1i128,v4f32,v2f64, f128],
+ 128,
+ (add V2, V3, V4, V5, V0, V1, V6, V7, V8, V9, V10, V11,
+ V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
+ V29, V28, V27, V26, V25, V24, V23, V22, V21, V20), 128>;
// VSX register classes (the allocation order mirrors that of the corresponding
// subregister classes).
-def VSLRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128,
- (add (sequence "VSL%u", 0, 13),
- (sequence "VSL%u", 31, 14))>;
-def VSRC : RegisterClass<"PPC", [v4i32,v4f32,v2f64,v2i64], 128,
- (add VSLRC, VRRC)>;
+def VSLRC : PPCRegisterClassWithSize<[v4i32,v4f32,v2f64,v2i64], 128,
+ (add (sequence "VSL%u", 0, 13),
+ (sequence "VSL%u", 31, 14)), 128>;
+
+def VSRC : PPCRegisterClassWithSize<[v4i32,v4f32,v2f64,v2i64], 128,
+ (add VSLRC, VRRC), 128>;
// Register classes for the 64-bit "scalar" VSX subregisters.
-def VFRC : RegisterClass<"PPC", [f64], 64,
- (add VF2, VF3, VF4, VF5, VF0, VF1, VF6, VF7,
- VF8, VF9, VF10, VF11, VF12, VF13, VF14,
- VF15, VF16, VF17, VF18, VF19, VF31, VF30,
- VF29, VF28, VF27, VF26, VF25, VF24, VF23,
- VF22, VF21, VF20)>;
-def VSFRC : RegisterClass<"PPC", [f64], 64, (add F8RC, VFRC)>;
+def VFRC : PPCRegisterClass<[f64], 64,
+ (add VF2, VF3, VF4, VF5, VF0, VF1, VF6, VF7,
+ VF8, VF9, VF10, VF11, VF12, VF13, VF14,
+ VF15, VF16, VF17, VF18, VF19, VF31, VF30,
+ VF29, VF28, VF27, VF26, VF25, VF24, VF23,
+ VF22, VF21, VF20)>;
+
+def VSFRC : PPCRegisterClass<[f64], 64, (add F8RC, VFRC)>;
// Allow spilling GPR's into caller-saved VSR's.
-def SPILLTOVSRRC : RegisterClass<"PPC", [i64, f64], 64, (add G8RC, (sub VSFRC,
- (sequence "VF%u", 31, 20),
- (sequence "F%u", 31, 14)))>;
+def SPILLTOVSRRC : PPCRegisterClass<[i64, f64], 64,
+ (add G8RC, (sub VSFRC, (sequence "VF%u", 31, 20),
+ (sequence "F%u", 31, 14)))>;
// Register class for single precision scalars in VSX registers
-def VSSRC : RegisterClass<"PPC", [f32], 32, (add VSFRC)>;
+def VSSRC : PPCRegisterClass<[f32], 32, (add VSFRC)>;
-def CRBITRC : RegisterClass<"PPC", [i1], 32,
+def CRBITRC : PPCRegisterClassWithSize<[i1], 32,
(add CR2LT, CR2GT, CR2EQ, CR2UN,
CR3LT, CR3GT, CR3EQ, CR3UN,
CR4LT, CR4GT, CR4EQ, CR4UN,
@@ -428,8 +416,7 @@ def CRBITRC : RegisterClass<"PPC", [i1], 32,
CR6LT, CR6GT, CR6EQ, CR6UN,
CR7LT, CR7GT, CR7EQ, CR7UN,
CR1LT, CR1GT, CR1EQ, CR1UN,
- CR0LT, CR0GT, CR0EQ, CR0UN)> {
- let Size = 32;
+ CR0LT, CR0GT, CR0EQ, CR0UN), 32> {
let AltOrders = [(sub CRBITRC, CR2LT, CR2GT, CR2EQ, CR2UN, CR3LT, CR3GT,
CR3EQ, CR3UN, CR4LT, CR4GT, CR4EQ, CR4UN)];
let AltOrderSelect = [{
@@ -438,7 +425,7 @@ def CRBITRC : RegisterClass<"PPC", [i1], 32,
}];
}
-def CRRC : RegisterClass<"PPC", [i32], 32,
+def CRRC : PPCRegisterClass<[i32], 32,
(add CR0, CR1, CR5, CR6,
CR7, CR2, CR3, CR4)> {
let AltOrders = [(sub CRRC, CR2, CR3, CR4)];
@@ -447,47 +434,34 @@ def CRRC : RegisterClass<"PPC", [i32], 32,
MF.getInfo<PPCFunctionInfo>()->isNonVolatileCRDisabled();
}];
}
+
// The CTR registers are not allocatable because they're used by the
// decrement-and-branch instructions, and thus need to stay live across
// multiple basic blocks.
-def CTRRC : RegisterClass<"PPC", [i32], 32, (add CTR)> {
- let isAllocatable = 0;
-}
-def CTRRC8 : RegisterClass<"PPC", [i64], 64, (add CTR8)> {
- let isAllocatable = 0;
-}
+def CTRRC : PPCNonAllocatableRegisterClass<[i32], 32, (add CTR)>;
+def CTRRC8 : PPCNonAllocatableRegisterClass<[i64], 64, (add CTR8)>;
-def LRRC : RegisterClass<"PPC", [i32], 32, (add LR)> {
- let isAllocatable = 0;
-}
-def LR8RC : RegisterClass<"PPC", [i64], 64, (add LR8)> {
- let isAllocatable = 0;
-}
+def LRRC : PPCNonAllocatableRegisterClass<[i32], 32, (add LR)>;
+def LR8RC : PPCNonAllocatableRegisterClass<[i64], 64, (add LR8)>;
-def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>;
-def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY, XER)> {
+def VRSAVERC : PPCRegisterClass<[i32], 32, (add VRSAVE)>;
+def CARRYRC : PPCNonAllocatableRegisterClass<[i32], 32, (add CARRY, XER)> {
let CopyCost = -1;
- let isAllocatable = 0;
}
// Make AllocationOrder as similar as G8RC's to avoid potential spilling.
// Similarly, we have an AltOrder for 64-bit ELF ABI which r2 is allocated
// at last.
-def G8pRC :
- RegisterClass<"PPC", [untyped], 128,
- (add (sequence "G8p%u", 1, 5),
- (sequence "G8p%u", 14, 7),
- G8p15, G8p6, G8p0)> {
+def G8pRC : PPCRegisterClassWithSize<[untyped], 128,
+ (add (sequence "G8p%u", 1, 5),
+ (sequence "G8p%u", 14, 7),
+ G8p15, G8p6, G8p0), 128> {
let AltOrders = [(add (sub G8pRC, G8p1), G8p1)];
let AltOrderSelect = [{
return MF.getSubtarget<PPCSubtarget>().is64BitELFABI();
}];
- let Size = 128;
}
-include "PPCRegisterInfoMMA.td"
-include "PPCRegisterInfoDMR.td"
-
//===----------------------------------------------------------------------===//
// PowerPC Register Operand Definitions.
@@ -502,90 +476,26 @@ include "PPCRegisterInfoDMR.td"
// For this purpose, we define one RegisterOperand for each RegisterClass,
// using the same name as the class, just in lower case.
-def PPCRegGPRCAsmOperand : AsmOperandClass {
- let Name = "RegGPRC"; let PredicateMethod = "isRegNumber";
-}
-def gprc : RegisterOperand<GPRC> {
- let ParserMatchClass = PPCRegGPRCAsmOperand;
-}
-def PPCRegG8RCAsmOperand : AsmOperandClass {
- let Name = "RegG8RC"; let PredicateMethod = "isRegNumber";
-}
-def g8rc : RegisterOperand<G8RC> {
- let ParserMatchClass = PPCRegG8RCAsmOperand;
-}
-def PPCRegG8pRCAsmOperand : AsmOperandClass {
- let Name = "RegG8pRC"; let PredicateMethod = "isEvenRegNumber";
-}
-def g8prc : RegisterOperand<G8pRC> {
- let ParserMatchClass = PPCRegG8pRCAsmOperand;
-}
-def PPCRegGPRCNoR0AsmOperand : AsmOperandClass {
- let Name = "RegGPRCNoR0"; let PredicateMethod = "isRegNumber";
-}
-def gprc_nor0 : RegisterOperand<GPRC_NOR0> {
- let ParserMatchClass = PPCRegGPRCNoR0AsmOperand;
-}
-def PPCRegG8RCNoX0AsmOperand : AsmOperandClass {
- let Name = "RegG8RCNoX0"; let PredicateMethod = "isRegNumber";
-}
-def g8rc_nox0 : RegisterOperand<G8RC_NOX0> {
- let ParserMatchClass = PPCRegG8RCNoX0AsmOperand;
-}
-def PPCRegF8RCAsmOperand : AsmOperandClass {
- let Name = "RegF8RC"; let PredicateMethod = "isRegNumber";
-}
-def f8rc : RegisterOperand<F8RC> {
- let ParserMatchClass = PPCRegF8RCAsmOperand;
-}
-def PPCRegF4RCAsmOperand : AsmOperandClass {
- let Name = "RegF4RC"; let PredicateMethod = "isRegNumber";
-}
-def f4rc : RegisterOperand<F4RC> {
- let ParserMatchClass = PPCRegF4RCAsmOperand;
-}
-def PPCRegFpRCAsmOperand : AsmOperandClass {
- let Name = "RegFpRC"; let PredicateMethod = "isEvenRegNumber";
-}
-def fpairrc : RegisterOperand<FpRC> {
- let ParserMatchClass = PPCRegFpRCAsmOperand;
-}
-def PPCRegVRRCAsmOperand : AsmOperandClass {
- let Name = "RegVRRC"; let PredicateMethod = "isRegNumber";
-}
-def vrrc : RegisterOperand<VRRC> {
- let ParserMatchClass = PPCRegVRRCAsmOperand;
-}
-def PPCRegVFRCAsmOperand : AsmOperandClass {
- let Name = "RegVFRC"; let PredicateMethod = "isRegNumber";
-}
-def vfrc : RegisterOperand<VFRC> {
- let ParserMatchClass = PPCRegVFRCAsmOperand;
-}
-def PPCRegCRBITRCAsmOperand : AsmOperandClass {
- let Name = "RegCRBITRC"; let PredicateMethod = "isCRBitNumber";
-}
-def crbitrc : RegisterOperand<CRBITRC> {
- let ParserMatchClass = PPCRegCRBITRCAsmOperand;
-}
-def PPCRegCRRCAsmOperand : AsmOperandClass {
- let Name = "RegCRRC"; let PredicateMethod = "isCCRegNumber";
-}
-def crrc : RegisterOperand<CRRC> {
- let ParserMatchClass = PPCRegCRRCAsmOperand;
-}
-def PPCRegSPERCAsmOperand : AsmOperandClass {
- let Name = "RegSPERC"; let PredicateMethod = "isRegNumber";
-}
-def sperc : RegisterOperand<SPERC> {
- let ParserMatchClass = PPCRegSPERCAsmOperand;
-}
-def PPCRegSPE4RCAsmOperand : AsmOperandClass {
- let Name = "RegSPE4RC"; let PredicateMethod = "isRegNumber";
-}
-def spe4rc : RegisterOperand<GPRC> {
- let ParserMatchClass = PPCRegSPE4RCAsmOperand;
-}
+defm GPRC : PPCRegOperand<"isRegNumber">;
+defm G8RC : PPCRegOperand<"isRegNumber">;
+defm GPRC_NOR0 : PPCRegOperand<"isRegNumber">;
+defm G8RC_NOX0 : PPCRegOperand<"isRegNumber">;
+defm SPERC : PPCRegOperand<"isRegNumber">;
+defm F8RC : PPCRegOperand<"isRegNumber">;
+defm F4RC : PPCRegOperand<"isRegNumber">;
+defm FpRC : PPCRegOperand<"isEvenRegNumber", "fpairrc">;
+defm VRRC : PPCRegOperand<"isRegNumber">;
+defm VSRC : PPCRegOperand<"isVSRegNumber">;
+defm VFRC : PPCRegOperand<"isRegNumber">;
+defm VSFRC : PPCRegOperand<"isVSRegNumber">;
+defm SPILLTOVSRRC : PPCRegOperand<"isVSRegNumber">;
+defm VSSRC : PPCRegOperand<"isVSRegNumber">;
+defm CRBITRC : PPCRegOperand<"isCRBitNumber">;
+defm CRRC : PPCRegOperand<"isCCRegNumber">;
+defm G8pRC : PPCRegOperand<"isEvenRegNumber">;
+
+// Special case: spe4rc uses GPRC register class and its AsmOperandClass
+def spe4rc : PPCRegOperandOnly<"GPRC">;
// Address operands
// A version of ptr_rc which excludes R0 (or X0 in 64-bit mode).
@@ -617,6 +527,9 @@ def ptr_rc_idx : RegisterOperand<ptr_rc_idx_by_hwmode> {
let ParserMatchClass = PPCRegGxRCOperand;
}
+include "PPCRegisterInfoMMA.td"
+include "PPCRegisterInfoDMR.td"
+
//===----------------------------------------------------------------------===//
// Memory operands (depend on register operands defined above)
//===----------------------------------------------------------------------===//
@@ -691,107 +604,3 @@ def pred : Operand<OtherVT> {
let PrintMethod = "printPredicateOperand";
let MIOperandInfo = (ops i32imm:$bibo, crrc:$reg);
}
-
-def PPCRegVSRCAsmOperand : AsmOperandClass {
- let Name = "RegVSRC"; let PredicateMethod = "isVSRegNumber";
-}
-def vsrc : RegisterOperand<VSRC> {
- let ParserMatchClass = PPCRegVSRCAsmOperand;
-}
-
-def PPCRegVSFRCAsmOperand : AsmOperandClass {
- let Name = "RegVSFRC"; let PredicateMethod = "isVSRegNumber";
-}
-def vsfrc : RegisterOperand<VSFRC> {
- let ParserMatchClass = PPCRegVSFRCAsmOperand;
-}
-
-def PPCRegVSSRCAsmOperand : AsmOperandClass {
- let Name = "RegVSSRC"; let PredicateMethod = "isVSRegNumber";
-}
-def vssrc : RegisterOperand<VSSRC> {
- let ParserMatchClass = PPCRegVSSRCAsmOperand;
-}
-
-def PPCRegSPILLTOVSRRCAsmOperand : AsmOperandClass {
- let Name = "RegSPILLTOVSRRC"; let PredicateMethod = "isVSRegNumber";
-}
-
-def spilltovsrrc : RegisterOperand<SPILLTOVSRRC> {
- let ParserMatchClass = PPCRegSPILLTOVSRRCAsmOperand;
-}
-
-def PPCRegVSRpRCAsmOperand : AsmOperandClass {
- let Name = "RegVSRpRC"; let PredicateMethod = "isVSRpEvenRegNumber";
-}
-
-def vsrprc : RegisterOperand<VSRpRC> {
- let ParserMatchClass = PPCRegVSRpRCAsmOperand;
-}
-
-def PPCRegVSRpEvenRCAsmOperand : AsmOperandClass {
- let Name = "RegVSRpEvenRC"; let PredicateMethod = "isVSRpEvenRegNumber";
-}
-
-def vsrpevenrc : RegisterOperand<VSRpRC> {
- let ParserMatchClass = PPCRegVSRpEvenRCAsmOperand;
- let EncoderMethod = "getVSRpEvenEncoding";
- let DecoderMethod = "decodeVSRpEvenOperands";
-}
-
-def PPCRegACCRCAsmOperand : AsmOperandClass {
- let Name = "RegACCRC"; let PredicateMethod = "isACCRegNumber";
-}
-
-def acc : RegisterOperand<ACCRC> {
- let ParserMatchClass = PPCRegACCRCAsmOperand;
-}
-
-def uacc : RegisterOperand<UACCRC> {
- let ParserMatchClass = PPCRegACCRCAsmOperand;
-}
-
-// DMR Register Operands
-def PPCRegDMRROWRCAsmOperand : AsmOperandClass {
- let Name = "RegDMRROWRC";
- let PredicateMethod = "isDMRROWRegNumber";
-}
-
-def dmrrow : RegisterOperand<DMRROWRC> {
- let ParserMatchClass = PPCRegDMRROWRCAsmOperand;
-}
-
-def PPCRegDMRROWpRCAsmOperand : AsmOperandClass {
- let Name = "RegDMRROWpRC";
- let PredicateMethod = "isDMRROWpRegNumber";
-}
-
-def dmrrowp : RegisterOperand<DMRROWpRC> {
- let ParserMatchClass = PPCRegDMRROWpRCAsmOperand;
-}
-
-def wacc : RegisterOperand<WACCRC> {
- let ParserMatchClass = PPCRegACCRCAsmOperand;
-}
-
-def wacc_hi : RegisterOperand<WACC_HIRC> {
- let ParserMatchClass = PPCRegACCRCAsmOperand;
-}
-
-def PPCRegDMRRCAsmOperand : AsmOperandClass {
- let Name = "RegDMRRC";
- let PredicateMethod = "isDMRRegNumber";
-}
-
-def dmr : RegisterOperand<DMRRC> {
- let ParserMatchClass = PPCRegDMRRCAsmOperand;
-}
-
-def PPCRegDMRpRCAsmOperand : AsmOperandClass {
- let Name = "RegDMRpRC";
- let PredicateMethod = "isDMRpRegNumber";
-}
-
-def dmrp : RegisterOperand<DMRpRC> {
- let ParserMatchClass = PPCRegDMRpRCAsmOperand;
-}
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfoDMR.td b/llvm/lib/Target/PowerPC/PPCRegisterInfoDMR.td
index 1c3e7621825bd..1b8ecbf043142 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfoDMR.td
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfoDMR.td
@@ -133,32 +133,35 @@ let SubRegIndices = [sub_dmr0, sub_dmr1] in {
def DMRp3 : DMRp<3, "dmrp3", [DMR6, DMR7]>, DwarfRegNum<[-1, -1]>;
}
-def DMRROWRC : RegisterClass<"PPC", [v128i1], 128,
- (add (sequence "DMRROW%u", 0, 63))> {
- let Size = 128;
-}
+def DMRROWRC : PPCRegisterClassWithSize<[v128i1], 128,
+ (add (sequence "DMRROW%u", 0, 63)), 128>;
-def DMRROWpRC : RegisterClass<"PPC", [v256i1], 128,
- (add (sequence "DMRROWp%u", 0, 31))> {
- let Size = 256;
-}
+def DMRROWpRC : PPCRegisterClassWithSize<[v256i1], 128,
+ (add (sequence "DMRROWp%u", 0, 31)), 256>;
-def WACCRC : RegisterClass<"PPC", [v512i1], 128,
- (add (sequence "WACC%u", 0, 7))> {
- let Size = 512;
-}
+def WACCRC : PPCRegisterClassWithSize<[v512i1], 128,
+ (add (sequence "WACC%u", 0, 7)), 512>;
-def WACC_HIRC : RegisterClass<"PPC", [v512i1], 128,
- (add (sequence "WACC_HI%u", 0, 7))> {
- let Size = 512;
-}
+def WACC_HIRC : PPCRegisterClassWithSize<[v512i1], 128,
+ (add (sequence "WACC_HI%u", 0, 7)), 512>;
-def DMRRC : RegisterClass<"PPC", [v1024i1], 128,
- (add (sequence "DMR%u", 0, 7))> {
- let Size = 1024;
-}
+def DMRRC : PPCRegisterClassWithSize<[v1024i1], 128,
+ (add (sequence "DMR%u", 0, 7)), 1024>;
-def DMRpRC : RegisterClass<"PPC", [v2048i1], 128,
- (add DMRp0, DMRp1, DMRp2, DMRp3)> {
- let Size = 2048;
+def DMRpRC : PPCRegisterClassWithSize<[v2048i1], 128,
+ (add DMRp0, DMRp1, DMRp2, DMRp3), 2048>;
+
+// Register Operand Definitions.
+
+defm DMRROWRC : PPCRegOperand<"isDMRROWRegNumber", "dmrrow">;
+defm DMRROWpRC : PPCRegOperand<"isDMRROWpRegNumber", "dmrrowp">;
+defm DMRRC : PPCRegOperand<"isDMRRegNumber", "dmr">;
+defm DMRpRC : PPCRegOperand<"isDMRpRegNumber", "dmrp">;
+
+// ACC Register Operands share the same AsmOperandClass.
+def wacc : RegisterOperand<WACCRC> {
+ let ParserMatchClass = PPCRegACCRCAsmOperand;
+}
+def wacc_hi : RegisterOperand<WACC_HIRC> {
+ let ParserMatchClass = PPCRegACCRCAsmOperand;
}
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfoMMA.td b/llvm/lib/Target/PowerPC/PPCRegisterInfoMMA.td
index 57f4545516a00..e4d773e92051b 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfoMMA.td
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfoMMA.td
@@ -45,20 +45,15 @@ let SubRegIndices = [sub_pair0, sub_pair1] in {
def ACC6 : ACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[-1, -1]>;
def ACC7 : ACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[-1, -1]>;
}
-def ACCRC : RegisterClass<"PPC", [v512i1], 128, (add ACC0, ACC1, ACC2, ACC3,
- ACC4, ACC5, ACC6, ACC7)> {
- // The AllocationPriority is in the range [0, 31]. Assigned the ACC registers
- // the highest possible priority in this range to force the register allocator
- // to assign these registers first. This is done because the ACC registers
- // must represent 4 advacent vector registers. For example ACC1 must be
- // VS4 - VS7.
- let AllocationPriority = 31;
-
- // We want to allocate these registers even before we allocate
- // global ranges.
- let GlobalPriority = true;
- let Size = 512;
-}
+// The AllocationPriority is in the range [0, 31]. Assign the ACC registers
+// the highest possible priority in this range to force the register allocator
+// to assign these registers first. This is done because the ACC registers
+// must represent 4 adjacent vector registers. For example ACC1 must be
+// VS4 - VS7.
+// We want to allocate these registers even before we allocate global ranges.
+def ACCRC : PPCRegisterClassWithPriority<[v512i1], 128,
+ (add ACC0, ACC1, ACC2, ACC3, ACC4, ACC5, ACC6, ACC7),
+ 31, 1, 512>;
let SubRegIndices = [sub_pair0, sub_pair1] in {
def UACC0 : UACC<0, "acc0", [VSRp0, VSRp1]>, DwarfRegNum<[-1, -1]>;
@@ -70,17 +65,14 @@ let SubRegIndices = [sub_pair0, sub_pair1] in {
def UACC6 : UACC<6, "acc6", [VSRp12, VSRp13]>, DwarfRegNum<[-1, -1]>;
def UACC7 : UACC<7, "acc7", [VSRp14, VSRp15]>, DwarfRegNum<[-1, -1]>;
}
-def UACCRC : RegisterClass<"PPC", [v512i1], 128,
- (add UACC0, UACC1, UACC2, UACC3,
- UACC4, UACC5, UACC6, UACC7)> {
- // The AllocationPriority for the UACC registers is still high and must be at
- // least 32 as we want to allocate these registers before we allocate other
- // global ranges. The value must be less than the AllocationPriority of the
- // ACC registers.
- let AllocationPriority = 4;
- let GlobalPriority = true;
- let Size = 512;
-}
+// The AllocationPriority for the UACC registers is still high and must be at
+// least 32 as we want to allocate these registers before we allocate other
+// global ranges. The value must be less than the AllocationPriority of the
+// ACC registers.
+def UACCRC : PPCRegisterClassWithPriority<[v512i1], 128,
+ (add UACC0, UACC1, UACC2, UACC3,
+ UACC4, UACC5, UACC6, UACC7),
+ 4, 1, 512>;
// FIXME: This allocation order may increase stack frame size when allocating
// non-volatile registers.
@@ -89,22 +81,35 @@ def UACCRC : RegisterClass<"PPC", [v512i1], 128,
// ones, to reduce interference with accumulator registers (lower 32 VSRs).
// This reduces copies when loading for accumulators, which is common use for
// paired VSX registers.
-def VSRpRC :
- RegisterClass<"PPC", [v256i1], 128,
- (add VSRp17, VSRp18, VSRp16, VSRp19, VSRp20, VSRp21,
- VSRp22, VSRp23, VSRp24, VSRp25, VSRp31, VSRp30,
- VSRp29, VSRp28, VSRp27, VSRp26,
- (sequence "VSRp%u", 0, 6),
- (sequence "VSRp%u", 15, 7))> {
- // Give the VSRp registers a non-zero AllocationPriority. The value is less
- // than 32 as these registers should not always be allocated before global
- // ranges and the value should be less than the AllocationPriority - 32 for
- // the UACC registers. Even global VSRp registers should be allocated after
- // the UACC registers have been chosen.
- let AllocationPriority = 2;
- let Size = 256;
-}
+// Give the VSRp registers a non-zero AllocationPriority. The value is less
+// than 32 as these registers should not always be allocated before global
+// ranges and the value should be less than the AllocationPriority - 32 for
+// the UACC registers. Even global VSRp registers should be allocated after
+// the UACC registers have been chosen.
+def VSRpRC : PPCRegisterClassWithPriority<[v256i1], 128,
+ (add VSRp17, VSRp18, VSRp16, VSRp19, VSRp20, VSRp21,
+ VSRp22, VSRp23, VSRp24, VSRp25, VSRp31, VSRp30,
+ VSRp29, VSRp28, VSRp27, VSRp26,
+ (sequence "VSRp%u", 0, 6),
+ (sequence "VSRp%u", 15, 7)),
+ 2, 0, 256>;
+// Register Operand Definitions.
+// ACC Register Operands share the same AsmOperandClass.
+defm ACCRC : PPCRegOperand<"isACCRegNumber", "acc">;
+def uacc : RegisterOperand<UACCRC> {
+ let ParserMatchClass = PPCRegACCRCAsmOperand;
+}
+defm VSRpRC : PPCRegOperand<"isVSRpEvenRegNumber">;
+// VSRpRC with encoder/decoder methods (uses VSRpRC class but different operand name)
+def PPCRegVSRpEvenRCAsmOperand : AsmOperandClass {
+ let Name = "RegVSRpEvenRC"; let PredicateMethod = "isVSRpEvenRegNumber";
+}
+def vsrpevenrc : RegisterOperand<VSRpRC> {
+ let ParserMatchClass = PPCRegVSRpEvenRCAsmOperand;
+ let EncoderMethod = "getVSRpEvenEncoding";
+ let DecoderMethod = "decodeVSRpEvenOperands";
+}
diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP10.td b/llvm/lib/Target/PowerPC/PPCScheduleP10.td
index f922f8a7d9852..09b0affb719d8 100644
--- a/llvm/lib/Target/PowerPC/PPCScheduleP10.td
+++ b/llvm/lib/Target/PowerPC/PPCScheduleP10.td
@@ -30,7 +30,7 @@ def P10Model : SchedMachineModel {
let CompleteModel = 1;
// Power 10 does not support instructions from SPE, Book E and HTM.
- let UnsupportedFeatures = [HasSPE, IsE500, IsBookE, IsISAFuture, HasHTM];
+ let UnsupportedFeatures = [HasSPE, IsE500, IsBookE, IsISAFuture, HasFutureVector, HasHTM];
}
let SchedModel = P10Model in {
diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/llvm/lib/Target/PowerPC/PPCScheduleP9.td
index 36befceef4ac1..306bbf8e4e276 100644
--- a/llvm/lib/Target/PowerPC/PPCScheduleP9.td
+++ b/llvm/lib/Target/PowerPC/PPCScheduleP9.td
@@ -43,7 +43,8 @@ def P9Model : SchedMachineModel {
// instructions introduced after ISA 3.0.
let UnsupportedFeatures = [HasSPE, PrefixInstrs, MMA,
PairedVectorMemops, IsBookE,
- PCRelativeMemops, IsISA3_1, IsISAFuture];
+ PCRelativeMemops, IsISA3_1, IsISAFuture,
+ HasFutureVector];
}
let SchedModel = P9Model in {
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 76ecd4fccfd85..cd3dcfd0aaaf1 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1443,6 +1443,38 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, SRAI);
return;
}
+ case ISD::SIGN_EXTEND_INREG: {
+ // Optimize (sext_inreg (srl X, C), i8/i16) ->
+ // (srai (slli X, XLen-ExtSize-C), XLen-ExtSize)
+ // This is a bitfield extract pattern where we're extracting a signed
+ // 8-bit or 16-bit field from position C.
+ SDValue N0 = Node->getOperand(0);
+ if (N0.getOpcode() != ISD::SRL || !N0.hasOneUse())
+ break;
+
+ auto *ShAmtC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!ShAmtC)
+ break;
+
+ unsigned ExtSize =
+ cast<VTSDNode>(Node->getOperand(1))->getVT().getSizeInBits();
+ unsigned ShAmt = ShAmtC->getZExtValue();
+ unsigned XLen = Subtarget->getXLen();
+
+    // Only handle extract sizes below 32 bits and validate the shift amount.
+ if (ExtSize >= 32 || ShAmt >= XLen - ExtSize)
+ break;
+
+ unsigned LShAmt = XLen - ExtSize - ShAmt;
+ SDNode *SLLI =
+ CurDAG->getMachineNode(RISCV::SLLI, DL, VT, N0.getOperand(0),
+ CurDAG->getTargetConstant(LShAmt, DL, VT));
+ SDNode *SRAI = CurDAG->getMachineNode(
+ RISCV::SRAI, DL, VT, SDValue(SLLI, 0),
+ CurDAG->getTargetConstant(XLen - ExtSize, DL, VT));
+ ReplaceNode(Node, SRAI);
+ return;
+ }
case ISD::OR: {
if (tryShrinkShlLogicImm(Node))
return;
@@ -1842,8 +1874,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
}
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI:
- case RISCVISD::WMULSU: {
- // Custom select (S/U)MUL_LOHI to WMUL(U) for RV32P.
+ case RISCVISD::WMULSU:
+ case RISCVISD::WADDU:
+ case RISCVISD::WSUBU: {
assert(Subtarget->hasStdExtP() && !Subtarget->is64Bit() && VT == MVT::i32 &&
"Unexpected opcode");
@@ -1860,12 +1893,18 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
case RISCVISD::WMULSU:
Opc = RISCV::WMULSU;
break;
+ case RISCVISD::WADDU:
+ Opc = RISCV::WADDU;
+ break;
+ case RISCVISD::WSUBU:
+ Opc = RISCV::WSUBU;
+ break;
}
- SDNode *WMUL = CurDAG->getMachineNode(
+ SDNode *Result = CurDAG->getMachineNode(
Opc, DL, MVT::Untyped, Node->getOperand(0), Node->getOperand(1));
- auto [Lo, Hi] = extractGPRPair(CurDAG, DL, SDValue(WMUL, 0));
+ auto [Lo, Hi] = extractGPRPair(CurDAG, DL, SDValue(Result, 0));
ReplaceUses(SDValue(Node, 0), Lo);
ReplaceUses(SDValue(Node, 1), Hi);
CurDAG->RemoveDeadNode(Node);
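
The sext_inreg combine added above is the classic shift-pair bitfield extract. A minimal standalone C++ check of the identity it relies on, for an 8-bit field on XLen=64 (variable names are illustrative, not from the patch; it assumes arithmetic right shift on signed integers, which is guaranteed from C++20 and true on all relevant hosts):

  #include <cassert>
  #include <cstdint>

  // Extract a signed 8-bit field starting at bit C of X on a 64-bit target:
  // sext_inreg(srl(X, C), i8) == srai(slli(X, 64 - 8 - C), 64 - 8)
  int64_t extract_via_sext_of_srl(uint64_t X, unsigned C) {
    return (int64_t)(int8_t)(X >> C);                 // srl + sext_inreg
  }
  int64_t extract_via_slli_srai(uint64_t X, unsigned C) {
    return (int64_t)(X << (64 - 8 - C)) >> (64 - 8);  // slli + srai
  }

  int main() {
    // The combine requires ShAmt < XLen - ExtSize, i.e. C < 56 here.
    for (unsigned C = 0; C < 56; ++C)
      assert(extract_via_sext_of_srl(0x8877665544332211ULL, C) ==
             extract_via_slli_srai(0x8877665544332211ULL, C));
  }
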
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 5156145e35aa2..7c1eacbce3701 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -943,8 +943,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
Expand);
setOperationAction(ISD::VP_MERGE, VT, Custom);
- setOperationAction({ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_UNDEF}, VT,
- Custom);
+ setOperationAction({ISD::CTTZ_ELTS, ISD::CTTZ_ELTS_ZERO_POISON,
+ ISD::VP_CTTZ_ELTS, ISD::VP_CTTZ_ELTS_ZERO_UNDEF},
+ VT, Custom);
setOperationAction({ISD::VP_AND, ISD::VP_OR, ISD::VP_XOR}, VT, Custom);
@@ -1567,6 +1568,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXPERIMENTAL_VP_SPLICE, VT, Custom);
setOperationAction(ISD::EXPERIMENTAL_VP_REVERSE, VT, Custom);
+
+ setOperationAction({ISD::CTTZ_ELTS, ISD::CTTZ_ELTS_ZERO_POISON}, VT,
+ Custom);
continue;
}
@@ -1881,8 +1885,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setTargetDAGCombine({ISD::INTRINSIC_VOID, ISD::INTRINSIC_W_CHAIN,
ISD::INTRINSIC_WO_CHAIN, ISD::ADD, ISD::SUB, ISD::MUL,
- ISD::AND, ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT});
- setTargetDAGCombine(ISD::SRA);
+ ISD::AND, ISD::OR, ISD::XOR, ISD::SETCC, ISD::SELECT,
+ ISD::SRA});
setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
if (Subtarget.hasStdExtFOrZfinx())
@@ -1902,16 +1906,31 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setTargetDAGCombine({ISD::ZERO_EXTEND, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
ISD::FP_TO_SINT_SAT, ISD::FP_TO_UINT_SAT});
if (Subtarget.hasVInstructions())
- setTargetDAGCombine(
- {ISD::FCOPYSIGN, ISD::MGATHER, ISD::MSCATTER,
- ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA,
- ISD::SRL, ISD::SHL, ISD::STORE,
- ISD::SPLAT_VECTOR, ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS,
- ISD::VP_STORE, ISD::VP_TRUNCATE, ISD::EXPERIMENTAL_VP_REVERSE,
- ISD::MUL, ISD::SDIV, ISD::UDIV,
- ISD::SREM, ISD::UREM, ISD::INSERT_VECTOR_ELT,
- ISD::ABS, ISD::CTPOP, ISD::VECTOR_SHUFFLE,
- ISD::FMA, ISD::VSELECT, ISD::VECREDUCE_ADD});
+ setTargetDAGCombine({ISD::FCOPYSIGN,
+ ISD::MGATHER,
+ ISD::MSCATTER,
+ ISD::VP_GATHER,
+ ISD::VP_SCATTER,
+ ISD::SRL,
+ ISD::SHL,
+ ISD::STORE,
+ ISD::SPLAT_VECTOR,
+ ISD::BUILD_VECTOR,
+ ISD::CONCAT_VECTORS,
+ ISD::VP_STORE,
+ ISD::VP_TRUNCATE,
+ ISD::EXPERIMENTAL_VP_REVERSE,
+ ISD::SDIV,
+ ISD::UDIV,
+ ISD::SREM,
+ ISD::UREM,
+ ISD::INSERT_VECTOR_ELT,
+ ISD::ABS,
+ ISD::CTPOP,
+ ISD::VECTOR_SHUFFLE,
+ ISD::FMA,
+ ISD::VSELECT,
+ ISD::VECREDUCE_ADD});
if (Subtarget.hasVendorXTHeadMemPair())
setTargetDAGCombine({ISD::LOAD, ISD::STORE});
@@ -7874,6 +7893,9 @@ RISCVTargetLowering::lowerXAndesBfHCvtBFloat16Store(SDValue Op,
ST->getMemOperand());
}
+static SDValue lowerCttzElts(SDValue Op, SelectionDAG &DAG,
+ const RISCVSubtarget &Subtarget);
+
SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -9183,6 +9205,9 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
case ISD::PARTIAL_REDUCE_SMLA:
case ISD::PARTIAL_REDUCE_SUMLA:
return lowerPARTIAL_REDUCE_MLA(Op, DAG);
+ case ISD::CTTZ_ELTS:
+ case ISD::CTTZ_ELTS_ZERO_POISON:
+ return lowerCttzElts(Op, DAG, Subtarget);
}
}
@@ -11518,9 +11543,9 @@ static SDValue lowerGetVectorLength(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Res);
}
-static SDValue lowerCttzElts(SDNode *N, SelectionDAG &DAG,
+static SDValue lowerCttzElts(SDValue Op, SelectionDAG &DAG,
const RISCVSubtarget &Subtarget) {
- SDValue Op0 = N->getOperand(1);
+ SDValue Op0 = Op.getOperand(0);
MVT OpVT = Op0.getSimpleValueType();
MVT ContainerVT = OpVT;
if (OpVT.isFixedLengthVector()) {
@@ -11528,10 +11553,10 @@ static SDValue lowerCttzElts(SDNode *N, SelectionDAG &DAG,
Op0 = convertToScalableVector(ContainerVT, Op0, DAG, Subtarget);
}
MVT XLenVT = Subtarget.getXLenVT();
- SDLoc DL(N);
+ SDLoc DL(Op);
auto [Mask, VL] = getDefaultVLOps(OpVT, ContainerVT, DL, DAG, Subtarget);
SDValue Res = DAG.getNode(RISCVISD::VFIRST_VL, DL, XLenVT, Op0, Mask, VL);
- if (isOneConstant(N->getOperand(2)))
+ if (Op.getOpcode() == ISD::CTTZ_ELTS_ZERO_POISON)
return Res;
// Convert -1 to VL.
@@ -11686,8 +11711,6 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::experimental_get_vector_length:
return lowerGetVectorLength(Op.getNode(), DAG, Subtarget);
- case Intrinsic::experimental_cttz_elts:
- return lowerCttzElts(Op.getNode(), DAG, Subtarget);
case Intrinsic::riscv_vmv_x_s: {
SDValue Res = DAG.getNode(RISCVISD::VMV_X_S, DL, XLenVT, Op.getOperand(1));
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Res);
@@ -15888,11 +15911,6 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Res));
return;
}
- case Intrinsic::experimental_cttz_elts: {
- SDValue Res = lowerCttzElts(N, DAG, Subtarget);
- Results.push_back(DAG.getZExtOrTrunc(Res, DL, N->getValueType(0)));
- return;
- }
case Intrinsic::riscv_orc_b:
case Intrinsic::riscv_brev8:
case Intrinsic::riscv_sha256sig0:
@@ -21613,6 +21631,27 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
SDValue Op1 = N->getOperand(2);
SDValue Op2 = N->getOperand(3);
+ // (WADDAU lo, 0, rs1, 0) -> (WADDU lo, rs1)
+ if (isNullConstant(Op0Hi) && isNullConstant(Op2)) {
+ SDValue Result = DAG.getNode(
+ RISCVISD::WADDU, DL, DAG.getVTList(MVT::i32, MVT::i32), Op0Lo, Op1);
+ return DCI.CombineTo(N, Result.getValue(0), Result.getValue(1));
+ }
+
+ // (WADDAU -C, -1, rs1, 0) -> (WSUBU rs1, C) where C > 0
+ if (isNullConstant(Op2) && isAllOnesConstant(Op0Hi)) {
+ if (auto *C0 = dyn_cast<ConstantSDNode>(Op0Lo)) {
+ int64_t Val = C0->getSExtValue();
+ if (Val < 0) {
+ SDValue PosConst = DAG.getConstant(-Val, DL, MVT::i32);
+ SDValue Result =
+ DAG.getNode(RISCVISD::WSUBU, DL,
+ DAG.getVTList(MVT::i32, MVT::i32), Op1, PosConst);
+ return DCI.CombineTo(N, Result.getValue(0), Result.getValue(1));
+ }
+ }
+ }
+
// FIXME: Canonicalize zero Op1 to Op2.
if (isNullConstant(Op2) && Op0Lo.getNode() == Op0Hi.getNode() &&
Op0Lo.getResNo() == 0 && Op0Hi.getResNo() == 1 && Op0Lo.hasOneUse() &&
@@ -21644,6 +21683,13 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
SDValue Op1 = N->getOperand(2);
SDValue Op2 = N->getOperand(3);
+ // (WSUBAU lo, 0, 0, rs2) -> (WSUBU lo, rs2)
+ if (isNullConstant(Op0Hi) && isNullConstant(Op1)) {
+ SDValue Result = DAG.getNode(
+ RISCVISD::WSUBU, DL, DAG.getVTList(MVT::i32, MVT::i32), Op0Lo, Op2);
+ return DCI.CombineTo(N, Result.getValue(0), Result.getValue(1));
+ }
+
// (WSUBAU (WADDAU lo, hi, a, 0), 0, b) -> (WSUBAU lo, hi, a, b)
if (isNullConstant(Op1) && Op0Lo.getOpcode() == RISCVISD::WADDAU &&
Op0Lo.getNode() == Op0Hi.getNode() && Op0Lo.getResNo() == 0 &&
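
For reference, the new combines encode simple widening-arithmetic identities, assuming (per the node comments in RISCVInstrInfoP.td below) that WADDAU accumulates rd + zext(rs1) + zext(rs2) into a 64-bit hi:lo pair and that WADDU/WSUBU produce the full 64-bit unsigned sum/difference. A small C++ sanity check of the two rewrites, with illustrative names only:

  #include <cassert>
  #include <cstdint>

  // 64-bit accumulator value formed from a {hi, lo} GPR pair.
  static uint64_t pack(uint32_t hi, uint32_t lo) {
    return ((uint64_t)hi << 32) | lo;
  }

  int main() {
    uint32_t lo = 0x12345678, rs1 = 0xdeadbeef, C = 42;

    // (WADDAU lo, 0, rs1, 0) -> (WADDU lo, rs1):
    // {0, lo} + zext(rs1) is just the widening unsigned add of lo and rs1.
    assert(pack(0, lo) + rs1 == (uint64_t)lo + rs1);

    // (WADDAU -C, -1, rs1, 0) -> (WSUBU rs1, C) for C > 0:
    // {-1, -C} is the 64-bit value -C, so adding zext(rs1) yields rs1 - C.
    assert(pack(UINT32_MAX, (uint32_t)-C) + rs1 == (uint64_t)rs1 - C);
  }
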
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
index 23950c4478a1b..788ecda2a1df1 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoP.td
@@ -1631,6 +1631,9 @@ def riscv_waddau : RVSDNode<"WADDAU", SDT_RISCVWideningAddSubAccumulate>;
// Widening sub accumulate unsigned: rd = rd + zext(rs1) - zext(rs2)
def riscv_wsubau : RVSDNode<"WSUBAU", SDT_RISCVWideningAddSubAccumulate>;
+def riscv_waddu : RVSDNode<"WADDU", SDTIntBinHiLoOp, [SDNPCommutative]>;
+def riscv_wsubu : RVSDNode<"WSUBU", SDTIntBinHiLoOp>;
+
def riscv_wmulsu : RVSDNode<"WMULSU", SDTIntBinHiLoOp>;
def SDT_RISCVWideningShiftLeft : SDTypeProfile<2, 2, [SDTCisVT<0, i32>,
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index 74b1fad703989..99dcc05db105a 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -350,6 +350,100 @@ def SIFIVE_X390 : RISCVProcessorModel<"sifive-x390",
let ConfigurableTuneFeatures = [TuneHasSingleElementVecFP64];
}
+def SIFIVE_X160 : RISCVProcessorModel<"sifive-x160",
+ NoSchedModel,
+ [Feature32Bit,
+ FeatureStdExtI,
+ FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtF,
+ FeatureStdExtC,
+ FeatureStdExtB,
+ FeatureStdExtZicbom,
+ FeatureStdExtZicbop,
+ FeatureStdExtZicboz,
+ FeatureStdExtZicond,
+ FeatureStdExtZifencei,
+ FeatureStdExtZihintntl,
+ FeatureStdExtZihintpause,
+ FeatureStdExtZihpm,
+ FeatureStdExtZimop,
+ FeatureStdExtZawrs,
+ FeatureStdExtZfa,
+ FeatureStdExtZfbfmin,
+ FeatureStdExtZfh,
+ FeatureStdExtZcb,
+ FeatureStdExtZce,
+ FeatureStdExtZcf,
+ FeatureStdExtZcmop,
+ FeatureStdExtZcmp,
+ FeatureStdExtZcmt,
+ FeatureStdExtZkt,
+ FeatureStdExtZvbb,
+ FeatureStdExtZve32f,
+ FeatureStdExtZve32x,
+ FeatureStdExtZvfbfmin,
+ FeatureStdExtZvfbfwma,
+ FeatureStdExtZvfh,
+ FeatureStdExtZvkt,
+ FeatureStdExtZvl128b,
+ FeatureVendorXSfcease,
+ FeatureStdExtZicfilp,
+ FeatureStdExtZvdot4a8i,
+ FeatureStdExtZvfbfa],
+ SiFiveIntelligenceTuneFeatures>;
+
+def SIFIVE_X180 : RISCVProcessorModel<"sifive-x180",
+ NoSchedModel,
+ [Feature64Bit,
+ FeatureStdExtI,
+ FeatureStdExtM,
+ FeatureStdExtA,
+ FeatureStdExtF,
+ FeatureStdExtD,
+ FeatureStdExtC,
+ FeatureStdExtB,
+ FeatureStdExtV,
+ FeatureStdExtZic64b,
+ FeatureStdExtZicbom,
+ FeatureStdExtZicbop,
+ FeatureStdExtZicboz,
+ FeatureStdExtZiccamoa,
+ FeatureStdExtZiccif,
+ FeatureStdExtZiccrse,
+ FeatureStdExtZicond,
+ FeatureStdExtZifencei,
+ FeatureStdExtZihintntl,
+ FeatureStdExtZihintpause,
+ FeatureStdExtZihpm,
+ FeatureStdExtZimop,
+ FeatureStdExtZa64rs,
+ FeatureStdExtZawrs,
+ FeatureStdExtZfa,
+ FeatureStdExtZfbfmin,
+ FeatureStdExtZfh,
+ FeatureStdExtZcb,
+ FeatureStdExtZcd,
+ FeatureStdExtZcmop,
+ FeatureStdExtZkt,
+ FeatureStdExtZvbb,
+ FeatureStdExtZvfbfmin,
+ FeatureStdExtZvfbfwma,
+ FeatureStdExtZvfh,
+ FeatureStdExtZvkb,
+ FeatureStdExtZvkt,
+ FeatureStdExtZvl128b,
+ FeatureVendorXSfcease,
+ FeatureVendorXSfvfbfexp16e,
+ FeatureVendorXSfvfexp16e,
+ FeatureVendorXSfvfexp32e,
+ FeatureVendorXSfvfexpa,
+ FeatureVendorXSiFivecflushdlone,
+ FeatureStdExtZicfilp,
+ FeatureStdExtZvdot4a8i,
+ FeatureStdExtZvfbfa],
+ SiFiveIntelligenceTuneFeatures>;
+
defvar SiFiveP400TuneFeatures = [TuneNoDefaultUnroll,
TuneConditionalCompressedMoveFusion,
TuneLUIADDIFusion,
@@ -811,8 +905,12 @@ def SPACEMIT_X60 : RISCVProcessorModel<"spacemit-x60",
def SPACEMIT_X100 : RISCVProcessorModel<"spacemit-x100",
SpacemitX100Model,
- !listconcat(RVA23U64Features,
- [FeatureStdExtZbc,
+ !listconcat(RVA23S64Features,
+ [FeatureStdExtSmepmp,
+ FeatureStdExtSmnpm,
+ FeatureStdExtSmstateen,
+ FeatureStdExtSspm,
+ FeatureStdExtZbc,
FeatureStdExtZbkc,
FeatureStdExtZfbfmin,
FeatureStdExtZfh,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 523507cd174f3..54b286d128dfa 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -600,7 +600,7 @@ void RISCVPassConfig::addPreEmitPass2() {
addPass(createRISCVExpandAtomicPseudoPass());
// KCFI indirect call checks are lowered to a bundle.
- addPass(createUnpackMachineBundles([&](const MachineFunction &MF) {
+ addPass(createUnpackMachineBundlesLegacy([&](const MachineFunction &MF) {
return MF.getFunction().getParent()->getModuleFlag("kcfi");
}));
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
index 66500f5626fd1..6a3bc39b8e73b 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp
@@ -51,10 +51,22 @@ void SPIRVInstPrinter::printOpConstantVarOps(const MCInst *MI,
const unsigned NumVarOps = MI->getNumOperands() - StartIndex;
if (MI->getOpcode() == SPIRV::OpConstantI && NumVarOps > 2) {
+ // Look up the bitwidth of this int type register from
+ // IntTypeBitwidths map.
+ MCRegister IntTypeReg = MI->getOperand(1).getReg();
+ unsigned Bitwidth = IntTypeBitwidths.at(IntTypeReg);
+
// SPV_ALTERA_arbitrary_precision_integers allows for integer widths greater
// than 64, which will be encoded via multiple operands.
- for (unsigned I = StartIndex; I != MI->getNumOperands(); ++I)
- O << ' ' << MI->getOperand(I).getImm();
+ const unsigned TotalBits = NumVarOps * 32;
+ APInt Val(TotalBits, 0);
+ for (unsigned i = 0; i < NumVarOps; ++i) {
+ uint64_t Word = MI->getOperand(StartIndex + i).getImm();
+ Val |= APInt(TotalBits, Word) << (i * 32);
+ }
+ APInt ActualVal = Val.trunc(Bitwidth);
+ O << ' ';
+ ActualVal.print(O, /*isSigned=*/false);
return;
}
@@ -101,6 +113,12 @@ void SPIRVInstPrinter::printOpConstantVarOps(const MCInst *MI,
O << Imm;
}
+void SPIRVInstPrinter::recordIntType(const MCInst *MI) {
+ MCRegister IntTypeReg = MI->getOperand(0).getReg();
+ unsigned Bitwidth = MI->getOperand(1).getImm();
+ IntTypeBitwidths[IntTypeReg] = Bitwidth;
+}
+
void SPIRVInstPrinter::recordOpExtInstImport(const MCInst *MI) {
MCRegister Reg = MI->getOperand(0).getReg();
auto Name = getSPIRVStringOperand(*MI, 1);
@@ -113,6 +131,9 @@ void SPIRVInstPrinter::printInst(const MCInst *MI, uint64_t Address,
raw_ostream &OS) {
const unsigned OpCode = MI->getOpcode();
printInstruction(MI, Address, OS);
+ if (OpCode == SPIRV::OpTypeInt) {
+ recordIntType(MI);
+ }
if (OpCode == SPIRV::OpDecorate) {
printOpDecorate(MI, OS);
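
The multi-word constant printing above reassembles the 32-bit operand words (least-significant word first) into one arbitrary-precision value and then truncates to the width recorded for the constant's OpTypeInt. A standalone sketch of that assembly using LLVM's APInt, mirroring but not replacing the printer code (the 48-bit example value is illustrative):

  #include "llvm/ADT/APInt.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  int main() {
    // A constant of an OpTypeInt of width 48, encoded as two 32-bit words.
    const unsigned Bitwidth = 48;
    const uint64_t Words[] = {0x9abcdef0, 0x00005678};

    const unsigned TotalBits = 2 * 32;
    APInt Val(TotalBits, 0);
    for (unsigned I = 0; I < 2; ++I)
      Val |= APInt(TotalBits, Words[I]) << (I * 32);

    // Drop the padding bits above the declared width before printing.
    APInt Actual = Val.trunc(Bitwidth);
    Actual.print(outs(), /*isSigned=*/false); // prints the reassembled 48-bit value
    outs() << '\n';
  }
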
diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h
index 8f2ad48efa9d7..13d69c0dcd042 100644
--- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h
+++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.h
@@ -22,7 +22,9 @@ namespace llvm {
class SPIRVInstPrinter : public MCInstPrinter {
private:
SmallDenseMap<MCRegister, SPIRV::InstructionSet::InstructionSet> ExtInstSetIDs;
+ SmallDenseMap<MCRegister, unsigned> IntTypeBitwidths;
void recordOpExtInstImport(const MCInst *MI);
+ void recordIntType(const MCInst *MI);
public:
using MCInstPrinter::MCInstPrinter;
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 87bb871dda80c..131b56e92b8be 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -255,6 +255,7 @@ class SPIRVEmitIntrinsics
bool shouldTryToAddMemAliasingDecoration(Instruction *Inst);
void insertSpirvDecorations(Instruction *I, IRBuilder<> &B);
void insertConstantsForFPFastMathDefault(Module &M);
+ Value *buildSpvUndefComposite(Type *AggrTy, IRBuilder<> &B);
void processGlobalValue(GlobalVariable &GV, IRBuilder<> &B);
void processParamTypes(Function *F, IRBuilder<> &B);
void processParamTypesByFunHeader(Function *F, IRBuilder<> &B);
@@ -422,9 +423,11 @@ static bool isMemInstrToReplace(Instruction *I) {
}
static bool isAggrConstForceInt32(const Value *V) {
+ bool IsAggrZero =
+ isa<ConstantAggregateZero>(V) && !V->getType()->isVectorTy();
+ bool IsUndefAggregate = isa<UndefValue>(V) && V->getType()->isAggregateType();
return isa<ConstantArray>(V) || isa<ConstantStruct>(V) ||
- isa<ConstantDataArray>(V) ||
- (isa<ConstantAggregateZero>(V) && !V->getType()->isVectorTy());
+ isa<ConstantDataArray>(V) || IsAggrZero || IsUndefAggregate;
}
static void setInsertPointSkippingPhis(IRBuilder<> &B, Instruction *I) {
@@ -2269,6 +2272,36 @@ shouldEmitIntrinsicsForGlobalValue(const GlobalVariableUsers &GVUsers,
return F == &FirstDefinition;
}
+Value *SPIRVEmitIntrinsics::buildSpvUndefComposite(Type *AggrTy,
+ IRBuilder<> &B) {
+ SmallVector<Value *, 4> Elems;
+ if (auto *ArrTy = dyn_cast<ArrayType>(AggrTy)) {
+ Type *ElemTy = ArrTy->getElementType();
+ auto *UI = B.CreateIntrinsic(Intrinsic::spv_undef, {});
+ AggrConsts[UI] = PoisonValue::get(ElemTy);
+ AggrConstTypes[UI] = ElemTy;
+ Elems.assign(ArrTy->getNumElements(), UI);
+ } else {
+ auto *StructTy = cast<StructType>(AggrTy);
+ DenseMap<Type *, Instruction *> UndefByType;
+ for (unsigned I = 0; I < StructTy->getNumElements(); ++I) {
+ Type *ElemTy = StructTy->getContainedType(I);
+ auto &Entry = UndefByType[ElemTy];
+ if (!Entry) {
+ Entry = B.CreateIntrinsic(Intrinsic::spv_undef, {});
+ AggrConsts[Entry] = PoisonValue::get(ElemTy);
+ AggrConstTypes[Entry] = ElemTy;
+ }
+ Elems.push_back(Entry);
+ }
+ }
+ auto *Composite = B.CreateIntrinsic(Intrinsic::spv_const_composite,
+ {B.getInt32Ty()}, Elems);
+ AggrConsts[Composite] = PoisonValue::get(AggrTy);
+ AggrConstTypes[Composite] = AggrTy;
+ return Composite;
+}
+
void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV,
IRBuilder<> &B) {
@@ -2282,11 +2315,14 @@ void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV,
// by llvm IR general logic.
deduceElementTypeHelper(&GV, false);
Init = GV.getInitializer();
+ Value *InitOp = Init;
+ if (isa<UndefValue>(Init) && Init->getType()->isAggregateType())
+ InitOp = buildSpvUndefComposite(Init->getType(), B);
Type *Ty = isAggrConstForceInt32(Init) ? B.getInt32Ty() : Init->getType();
Constant *Const = isAggrConstForceInt32(Init) ? B.getInt32(1) : Init;
auto *InitInst = B.CreateIntrinsic(Intrinsic::spv_init_global,
{GV.getType(), Ty}, {&GV, Const});
- InitInst->setArgOperand(1, Init);
+ InitInst->setArgOperand(1, InitOp);
}
if (!Init && GV.use_empty())
B.CreateIntrinsic(Intrinsic::spv_unref_global, GV.getType(), &GV);
@@ -3306,7 +3342,7 @@ bool SPIRVEmitIntrinsics::processMaskedMemIntrinsic(IntrinsicInst &I) {
Value *Mask = I.getArgOperand(1);
Value *Passthru = I.getArgOperand(2);
- // Alignment is stored as a parameter attribute, not as a regular parameter
+ // Alignment is stored as a parameter attribute, not as a regular parameter.
uint32_t Alignment = I.getParamAlign(0).valueOrOne().value();
SmallVector<Value *, 4> Args = {Ptrs, B.getInt32(Alignment), Mask,
@@ -3338,7 +3374,7 @@ bool SPIRVEmitIntrinsics::processMaskedMemIntrinsic(IntrinsicInst &I) {
Value *Mask = I.getArgOperand(2);
// Alignment is stored as a parameter attribute on the ptrs parameter (arg
- // 1)
+ // 1).
uint32_t Alignment = I.getParamAlign(1).valueOrOne().value();
SmallVector<Value *, 4> Args = {Values, Ptrs, B.getInt32(Alignment), Mask};
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index cf4ab00a4f3b3..c04daffa4bacf 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -316,7 +316,6 @@ SPIRVTypeInst
SPIRVGlobalRegistry::getOpTypeVector(uint32_t NumElems, SPIRVTypeInst ElemType,
MachineIRBuilder &MIRBuilder) {
auto EleOpc = ElemType->getOpcode();
- (void)EleOpc;
assert(NumElems >= 2 && "SPIR-V OpTypeVector requires at least 2 components");
if (EleOpc == SPIRV::OpTypePointer) {
@@ -1122,6 +1121,11 @@ SPIRVTypeInst SPIRVGlobalRegistry::findSPIRVType(
const Type *Ty, MachineIRBuilder &MIRBuilder,
SPIRV::AccessQualifier::AccessQualifier AccQual,
bool ExplicitLayoutRequired, bool EmitIR) {
+ // Treat <1 x T> as T.
+ if (auto *FVT = dyn_cast<FixedVectorType>(Ty);
+ FVT && FVT->getNumElements() == 1)
+ return findSPIRVType(FVT->getElementType(), MIRBuilder, AccQual,
+ ExplicitLayoutRequired, EmitIR);
Ty = adjustIntTypeByWidth(Ty);
// TODO: findMI needs to know if a layout is required.
if (const MachineInstr *MI =
@@ -1943,6 +1947,9 @@ SPIRVTypeInst SPIRVGlobalRegistry::getOrCreateSPIRVVectorType(
SPIRVTypeInst SPIRVGlobalRegistry::getOrCreateSPIRVVectorType(
SPIRVTypeInst BaseType, unsigned NumElements, MachineInstr &I,
const SPIRVInstrInfo &TII) {
+  // At this point all 1-element vectors have already been resolved. Assert so
+  // this fires if anything changes.
+ assert(NumElements >= 2 && "SPIR-V vectors must have at least 2 components");
Type *Ty = FixedVectorType::get(
const_cast<Type *>(getTypeForSPIRVType(BaseType)), NumElements);
if (const MachineInstr *MI = findMI(Ty, false, CurMF))
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 7cf039e03430a..29beb0c3c3653 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -553,11 +553,14 @@ static bool isConstReg(MachineRegisterInfo *MRI, MachineInstr *OpDef) {
switch (MI->getOpcode()) {
case TargetOpcode::G_INTRINSIC:
case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
- case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
- if (cast<GIntrinsic>(*OpDef).getIntrinsicID() !=
- Intrinsic::spv_const_composite)
+ case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
+ GIntrinsic *GIntr = cast<GIntrinsic>(MI);
+ unsigned IntrID = GIntr->getIntrinsicID();
+ if (IntrID != Intrinsic::spv_const_composite &&
+ IntrID != Intrinsic::spv_undef)
return false;
continue;
+ }
case TargetOpcode::G_BUILD_VECTOR:
case TargetOpcode::G_SPLAT_VECTOR:
for (unsigned i = OpDef->getNumExplicitDefs();
@@ -1760,11 +1763,11 @@ bool SPIRVInstructionSelector::selectMaskedGather(Register ResVReg,
// 3: alignment (i32 immediate)
// 4: mask (vector of i1)
// 5: passthru/fill value
- Register PtrsReg = I.getOperand(2).getReg();
- uint32_t Alignment = I.getOperand(3).getImm();
- Register MaskReg = I.getOperand(4).getReg();
- Register PassthruReg = I.getOperand(5).getReg();
- Register AlignmentReg = buildI32Constant(Alignment, I);
+ const Register PtrsReg = I.getOperand(2).getReg();
+ const uint32_t Alignment = I.getOperand(3).getImm();
+ const Register MaskReg = I.getOperand(4).getReg();
+ const Register PassthruReg = I.getOperand(5).getReg();
+ const Register AlignmentReg = buildI32Constant(Alignment, I);
MachineBasicBlock &BB = *I.getParent();
auto MIB =
@@ -1787,11 +1790,11 @@ bool SPIRVInstructionSelector::selectMaskedScatter(MachineInstr &I) const {
// 2: vector of pointers
// 3: alignment (i32 immediate)
// 4: mask (vector of i1)
- Register ValuesReg = I.getOperand(1).getReg();
- Register PtrsReg = I.getOperand(2).getReg();
- uint32_t Alignment = I.getOperand(3).getImm();
- Register MaskReg = I.getOperand(4).getReg();
- Register AlignmentReg = buildI32Constant(Alignment, I);
+ const Register ValuesReg = I.getOperand(1).getReg();
+ const Register PtrsReg = I.getOperand(2).getReg();
+ const uint32_t Alignment = I.getOperand(3).getImm();
+ const Register MaskReg = I.getOperand(4).getReg();
+ const Register AlignmentReg = buildI32Constant(Alignment, I);
MachineBasicBlock &BB = *I.getParent();
auto MIB =
@@ -6015,6 +6018,10 @@ bool SPIRVInstructionSelector::loadHandleBeforePosition(
SC = GR.getPointerStorageClass(ResType);
}
+ if (ResType->getOpcode() == SPIRV::OpTypeImage && ArraySize == 0)
+ MIRBuilder.buildInstr(SPIRV::OpCapability)
+ .addImm(SPIRV::Capability::RuntimeDescriptorArrayEXT);
+
Register VarReg =
buildPointerToResource(SPIRVTypeInst(VarType), SC, Set, Binding,
ArraySize, IndexReg, Name, MIRBuilder);
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
index c1bfd986bede7..925b1b00336b5 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizePointerCast.cpp
@@ -165,20 +165,66 @@ class SPIRVLegalizePointerCast : public FunctionPass {
buildAssignType(B, ElementType, LI);
return LI;
}
+ Value *
+ buildVectorFromLoadedElements(IRBuilder<> &B, FixedVectorType *TargetType,
+ SmallVector<Value *, 4> &LoadedElements) {
+ // Build the vector from the loaded elements.
+ Value *NewVector = PoisonValue::get(TargetType);
+ buildAssignType(B, TargetType, NewVector);
+
+ for (unsigned I = 0; I < TargetType->getNumElements(); ++I) {
+ Value *Index = B.getInt32(I);
+ SmallVector<Type *, 4> Types = {TargetType, TargetType,
+ TargetType->getElementType(),
+ Index->getType()};
+ SmallVector<Value *> Args = {NewVector, LoadedElements[I], Index};
+ NewVector = B.CreateIntrinsic(Intrinsic::spv_insertelt, {Types}, {Args});
+ buildAssignType(B, TargetType, NewVector);
+ }
+ return NewVector;
+ }
+ // Loads elements from a matrix laid out in memory as an array of vectors
+ // and constructs a result vector.
+ Value *loadVectorFromMatrixArray(IRBuilder<> &B, FixedVectorType *TargetType,
+ Value *Source,
+ FixedVectorType *ArrElemVecTy) {
+ Type *TargetElemTy = TargetType->getElementType();
+ unsigned ScalarsPerArrayElement = ArrElemVecTy->getNumElements();
+ // Load each element of the array.
+ SmallVector<Value *, 4> LoadedElements;
+ SmallVector<Type *, 2> Types = {Source->getType(), Source->getType()};
+ for (unsigned I = 0; I < TargetType->getNumElements(); ++I) {
+ unsigned ArrayIndex = I / ScalarsPerArrayElement;
+ unsigned ElementIndexInArrayElem = I % ScalarsPerArrayElement;
+ // Create a GEP to access the i-th element of the array.
+ SmallVector<Value *, 4> Args;
+ Args.push_back(B.getInt1(/*Inbounds=*/false));
+ Args.push_back(Source);
+ Args.push_back(B.getInt32(0));
+ Args.push_back(ConstantInt::get(B.getInt32Ty(), ArrayIndex));
+ auto *ElementPtr = B.CreateIntrinsic(Intrinsic::spv_gep, {Types}, {Args});
+ GR->buildAssignPtr(B, ArrElemVecTy, ElementPtr);
+ Value *LoadVec = B.CreateLoad(ArrElemVecTy, ElementPtr);
+ buildAssignType(B, ArrElemVecTy, LoadVec);
+ LoadedElements.push_back(makeExtractElement(B, TargetElemTy, LoadVec,
+ ElementIndexInArrayElem));
+ }
+ return buildVectorFromLoadedElements(B, TargetType, LoadedElements);
+ }
// Loads elements from an array and constructs a vector.
Value *loadVectorFromArray(IRBuilder<> &B, FixedVectorType *TargetType,
Value *Source) {
// Load each element of the array.
SmallVector<Value *, 4> LoadedElements;
- for (unsigned i = 0; i < TargetType->getNumElements(); ++i) {
+ SmallVector<Type *, 2> Types = {Source->getType(), Source->getType()};
+ for (unsigned I = 0; I < TargetType->getNumElements(); ++I) {
// Create a GEP to access the i-th element of the array.
- SmallVector<Type *, 2> Types = {Source->getType(), Source->getType()};
SmallVector<Value *, 4> Args;
- Args.push_back(B.getInt1(false));
+ Args.push_back(B.getInt1(/*Inbounds=*/false));
Args.push_back(Source);
Args.push_back(B.getInt32(0));
- Args.push_back(ConstantInt::get(B.getInt32Ty(), i));
+ Args.push_back(ConstantInt::get(B.getInt32Ty(), I));
auto *ElementPtr = B.CreateIntrinsic(Intrinsic::spv_gep, {Types}, {Args});
GR->buildAssignPtr(B, TargetType->getElementType(), ElementPtr);
@@ -187,21 +233,7 @@ class SPIRVLegalizePointerCast : public FunctionPass {
buildAssignType(B, TargetType->getElementType(), Load);
LoadedElements.push_back(Load);
}
-
- // Build the vector from the loaded elements.
- Value *NewVector = PoisonValue::get(TargetType);
- buildAssignType(B, TargetType, NewVector);
-
- for (unsigned i = 0; i < TargetType->getNumElements(); ++i) {
- Value *Index = B.getInt32(i);
- SmallVector<Type *, 4> Types = {TargetType, TargetType,
- TargetType->getElementType(),
- Index->getType()};
- SmallVector<Value *> Args = {NewVector, LoadedElements[i], Index};
- NewVector = B.CreateIntrinsic(Intrinsic::spv_insertelt, {Types}, {Args});
- buildAssignType(B, TargetType, NewVector);
- }
- return NewVector;
+ return buildVectorFromLoadedElements(B, TargetType, LoadedElements);
}
// Stores elements from a vector into an array.
@@ -256,6 +288,8 @@ class SPIRVLegalizePointerCast : public FunctionPass {
auto *SAT = dyn_cast<ArrayType>(FromTy);
auto *SVT = dyn_cast<FixedVectorType>(FromTy);
auto *DVT = dyn_cast<FixedVectorType>(ToTy);
+ auto *MAT =
+ SAT ? dyn_cast<FixedVectorType>(SAT->getElementType()) : nullptr;
B.SetInsertPoint(LI);
@@ -271,6 +305,8 @@ class SPIRVLegalizePointerCast : public FunctionPass {
Output = loadVectorFromVector(B, SVT, DVT, OriginalOperand);
else if (SAT && DVT && SAT->getElementType() == DVT->getElementType())
Output = loadVectorFromArray(B, DVT, OriginalOperand);
+ else if (MAT && DVT && MAT->getElementType() == DVT->getElementType())
+ Output = loadVectorFromMatrixArray(B, DVT, OriginalOperand, MAT);
else
llvm_unreachable("Unimplemented implicit down-cast from load.");
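
For reference, a minimal standalone sketch of the index arithmetic the new
loadVectorFromMatrixArray helper performs: flat lane I of the target vector is
read from array element I / ScalarsPerArrayElement, at lane
I % ScalarsPerArrayElement within that element. Names below are illustrative
only, not taken from the patch.

  #include <cassert>
  #include <cstdio>
  #include <utility>

  // Map a flat lane index onto (array element, lane within that element),
  // mirroring the ArrayIndex / ElementIndexInArrayElem computation above.
  static std::pair<unsigned, unsigned> mapLane(unsigned FlatLane,
                                               unsigned ScalarsPerArrayElem) {
    assert(ScalarsPerArrayElem != 0 && "array elements must hold scalars");
    return {FlatLane / ScalarsPerArrayElem, FlatLane % ScalarsPerArrayElem};
  }

  int main() {
    // E.g. a float4x4 stored as [4 x <4 x float>]: lanes 0..15 of a
    // <16 x float> load come from array element I/4, lane I%4.
    for (unsigned I = 0; I < 16; ++I) {
      auto [Elem, Lane] = mapLane(I, /*ScalarsPerArrayElem=*/4);
      std::printf("lane %2u -> array element %u, lane %u\n", I, Elem, Lane);
    }
    return 0;
  }
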
diff --git a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
index aba9cf7962e68..54ead93f2d52d 100644
--- a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
@@ -37,30 +37,6 @@ class SPIRVMergeRegionExitTargets : public FunctionPass {
SPIRVMergeRegionExitTargets() : FunctionPass(ID) {}
- // Gather all the successors of |BB|.
- // This function asserts if the terminator neither a branch, switch or return.
- std::unordered_set<BasicBlock *> gatherSuccessors(BasicBlock *BB) {
- std::unordered_set<BasicBlock *> output;
- auto *T = BB->getTerminator();
-
- if (auto *BI = dyn_cast<BranchInst>(T)) {
- output.insert(BI->getSuccessor(0));
- if (BI->isConditional())
- output.insert(BI->getSuccessor(1));
- return output;
- }
-
- if (auto *SI = dyn_cast<SwitchInst>(T)) {
- output.insert(SI->getDefaultDest());
- for (auto &Case : SI->cases())
- output.insert(Case.getCaseSuccessor());
- return output;
- }
-
- assert(isa<ReturnInst>(T) && "Unhandled terminator type.");
- return output;
- }
-
/// Create a value in BB set to the value associated with the branch the block
/// terminator will take.
llvm::Value *createExitVariable(
@@ -69,18 +45,15 @@ class SPIRVMergeRegionExitTargets : public FunctionPass {
auto *T = BB->getTerminator();
if (isa<ReturnInst>(T))
return nullptr;
+ if (auto *BI = dyn_cast<UncondBrInst>(T))
+ return TargetToValue.lookup(BI->getSuccessor());
IRBuilder<> Builder(BB);
Builder.SetInsertPoint(T);
- if (auto *BI = dyn_cast<BranchInst>(T)) {
-
- BasicBlock *LHSTarget = BI->getSuccessor(0);
- BasicBlock *RHSTarget =
- BI->isConditional() ? BI->getSuccessor(1) : nullptr;
-
- Value *LHS = TargetToValue.lookup(LHSTarget);
- Value *RHS = TargetToValue.lookup(RHSTarget);
+ if (auto *BI = dyn_cast<CondBrInst>(T)) {
+ Value *LHS = TargetToValue.lookup(BI->getSuccessor(0));
+ Value *RHS = TargetToValue.lookup(BI->getSuccessor(1));
if (LHS == nullptr || RHS == nullptr)
return LHS == nullptr ? RHS : LHS;
@@ -91,33 +64,6 @@ class SPIRVMergeRegionExitTargets : public FunctionPass {
llvm_unreachable("Unhandled terminator type.");
}
- /// Replaces |BB|'s branch targets present in |ToReplace| with |NewTarget|.
- void replaceBranchTargets(BasicBlock *BB,
- const SmallPtrSet<BasicBlock *, 4> &ToReplace,
- BasicBlock *NewTarget) {
- auto *T = BB->getTerminator();
- if (isa<ReturnInst>(T))
- return;
-
- if (auto *BI = dyn_cast<BranchInst>(T)) {
- for (size_t i = 0; i < BI->getNumSuccessors(); i++) {
- if (ToReplace.count(BI->getSuccessor(i)) != 0)
- BI->setSuccessor(i, NewTarget);
- }
- return;
- }
-
- if (auto *SI = dyn_cast<SwitchInst>(T)) {
- for (size_t i = 0; i < SI->getNumSuccessors(); i++) {
- if (ToReplace.count(SI->getSuccessor(i)) != 0)
- SI->setSuccessor(i, NewTarget);
- }
- return;
- }
-
- assert(false && "Unhandled terminator type.");
- }
-
AllocaInst *CreateVariable(Function &F, Type *Type,
BasicBlock::iterator Position) {
const DataLayout &DL = F.getDataLayout();
@@ -132,7 +78,7 @@ class SPIRVMergeRegionExitTargets : public FunctionPass {
// Gather all the exit targets for this region.
SmallPtrSet<BasicBlock *, 4> ExitTargets;
for (BasicBlock *Exit : CR->Exits) {
- for (BasicBlock *Target : gatherSuccessors(Exit)) {
+ for (BasicBlock *Target : successors(Exit)) {
if (CR->Blocks.count(Target) == 0)
ExitTargets.insert(Target);
}
@@ -191,8 +137,12 @@ class SPIRVMergeRegionExitTargets : public FunctionPass {
}
// Fix exit branches to redirect to the new exit.
- for (auto Exit : CR->Exits)
- replaceBranchTargets(Exit, ExitTargets, NewExitTarget);
+ for (auto Exit : CR->Exits) {
+ Instruction *T = Exit->getTerminator();
+ for (auto I = succ_begin(T), E = succ_end(T); I != E; ++I)
+ if (ExitTargets.contains(*I))
+ I.getUse()->set(NewExitTarget);
+ }
CR = CR->Parent;
while (CR) {
@@ -223,8 +173,7 @@ class SPIRVMergeRegionExitTargets : public FunctionPass {
std::unordered_set<BasicBlock *> ExitTargets;
for (auto *Exit : CR->Exits) {
- auto Set = gatherSuccessors(Exit);
- for (auto *BB : Set) {
+ for (auto *BB : successors(Exit)) {
if (CR->Blocks.count(BB) == 0)
ExitTargets.insert(BB);
}
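
The removed gatherSuccessors and replaceBranchTargets helpers special-cased
branch, switch and return terminators by hand; successors() from
llvm/IR/CFG.h and the terminator's successor iterators cover all terminator
kinds. A rough sketch of the exit-target collection the pass now performs
(helper name and signature are illustrative only):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/ADT/SmallPtrSet.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/CFG.h"

  using namespace llvm;

  // Illustrative helper: successors of the region's exit blocks that lie
  // outside the region itself. successors() is empty for a ReturnInst, so the
  // old special case for returns falls out naturally.
  static SmallPtrSet<BasicBlock *, 4>
  collectExitTargets(ArrayRef<BasicBlock *> Exits,
                     const SmallPtrSetImpl<BasicBlock *> &RegionBlocks) {
    SmallPtrSet<BasicBlock *, 4> ExitTargets;
    for (BasicBlock *Exit : Exits)
      for (BasicBlock *Target : successors(Exit)) // br, switch, ret alike
        if (!RegionBlocks.contains(Target))
          ExitTargets.insert(Target);
    return ExitTargets;
  }
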
diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
index 38bc72ff154fa..f9aaf3ab44929 100644
--- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp
@@ -215,6 +215,9 @@ void SPIRVModuleAnalysis::setBaseInfo(const Module &M) {
MAI.Reqs.getAndAddRequirements(SPIRV::OperandCategory::AddressingModelOperand,
MAI.Addr, *ST);
+ if (MAI.Mem == SPIRV::MemoryModel::VulkanKHR)
+ MAI.Reqs.addExtension(SPIRV::Extension::SPV_KHR_vulkan_memory_model);
+
if (!ST->isShader()) {
// TODO: check if it's required by default.
MAI.ExtInstSetMap[static_cast<unsigned>(
@@ -940,7 +943,8 @@ void RequirementHandler::initAvailableCapabilitiesForVulkan(
Capability::StorageBufferArrayDynamicIndexing,
Capability::StorageImageArrayDynamicIndexing,
Capability::DerivativeControl, Capability::MinLod,
- Capability::ImageGatherExtended});
+ Capability::ImageGatherExtended, Capability::Addresses,
+ Capability::VulkanMemoryModelKHR});
// Became core in Vulkan 1.2
if (ST.isAtLeastSPIRVVer(VersionTuple(1, 5))) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
index ea634fb616f10..d2befa50789fa 100644
--- a/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVStructurizer.cpp
@@ -213,7 +213,7 @@ static void visit(BasicBlock &Start, std::function<bool(BasicBlock *)> op) {
// associated merge instruction gets updated accordingly.
static void replaceIfBranchTargets(BasicBlock *BB, BasicBlock *OldTarget,
BasicBlock *NewTarget) {
- auto *BI = cast<BranchInst>(BB->getTerminator());
+ auto *BI = cast<CondBrInst>(BB->getTerminator());
// 1. Replace all matching successors.
for (size_t i = 0; i < BI->getNumSuccessors(); i++) {
@@ -221,10 +221,6 @@ static void replaceIfBranchTargets(BasicBlock *BB, BasicBlock *OldTarget,
BI->setSuccessor(i, NewTarget);
}
- // Branch was unconditional, no fixup required.
- if (BI->isUnconditional())
- return;
-
// Branch had 2 successors, maybe now both are the same?
if (BI->getSuccessor(0) != BI->getSuccessor(1))
return;
@@ -263,8 +259,13 @@ static void replaceBranchTargets(BasicBlock *BB, BasicBlock *OldTarget,
auto *T = BB->getTerminator();
if (isa<ReturnInst>(T))
return;
+ if (auto *BI = dyn_cast<UncondBrInst>(T)) {
+ if (BI->getSuccessor() == OldTarget)
+ BI->setSuccessor(NewTarget);
+ return;
+ }
- if (isa<BranchInst>(T))
+ if (isa<CondBrInst>(T))
return replaceIfBranchTargets(BB, OldTarget, NewTarget);
if (auto *SI = dyn_cast<SwitchInst>(T)) {
@@ -519,18 +520,15 @@ class SPIRVStructurizer : public FunctionPass {
auto *T = BB->getTerminator();
if (isa<ReturnInst>(T))
return nullptr;
+ if (auto *BI = dyn_cast<UncondBrInst>(T))
+ return TargetToValue.lookup(BI->getSuccessor());
IRBuilder<> Builder(BB);
Builder.SetInsertPoint(T);
- if (auto *BI = dyn_cast<BranchInst>(T)) {
-
- BasicBlock *LHSTarget = BI->getSuccessor(0);
- BasicBlock *RHSTarget =
- BI->isConditional() ? BI->getSuccessor(1) : nullptr;
-
- Value *LHS = TargetToValue.lookup(LHSTarget);
- Value *RHS = TargetToValue.lookup(RHSTarget);
+ if (auto *BI = dyn_cast<CondBrInst>(T)) {
+ Value *LHS = TargetToValue.lookup(BI->getSuccessor(0));
+ Value *RHS = TargetToValue.lookup(BI->getSuccessor(1));
if (LHS == nullptr || RHS == nullptr)
return LHS == nullptr ? RHS : LHS;
@@ -580,9 +578,7 @@ class SPIRVStructurizer : public FunctionPass {
 // do however is to make it legal from the SPIR-V point of view, hence
// adding an unreachable merge block.
if (Merge == nullptr) {
- BranchInst *Br = cast<BranchInst>(BB.getTerminator());
- assert(Br->isUnconditional());
-
+ UncondBrInst *Br = cast<UncondBrInst>(BB.getTerminator());
Merge = CreateUnreachable(F);
Builder.SetInsertPoint(Br);
Builder.CreateCondBr(Builder.getFalse(), Merge, Br->getSuccessor(0));
@@ -709,11 +705,11 @@ class SPIRVStructurizer : public FunctionPass {
if (getDesignatedContinueBlock(MergeInstructions[0]) == nullptr) {
BasicBlock *Unreachable = CreateUnreachable(F);
- BranchInst *BI = cast<BranchInst>(Header->getTerminator());
+ Instruction *Term = Header->getTerminator();
IRBuilder<> Builder(Header);
- Builder.SetInsertPoint(BI);
+ Builder.SetInsertPoint(Term);
Builder.CreateCondBr(Builder.getTrue(), NewBlock, Unreachable);
- BI->eraseFromParent();
+ Term->eraseFromParent();
}
Header = NewBlock;
diff --git a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
index d65d8ec53c6d0..4aca400e827d7 100644
--- a/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVSubtarget.cpp
@@ -11,12 +11,15 @@
//===----------------------------------------------------------------------===//
#include "SPIRVSubtarget.h"
+
+#include "MCTargetDesc/SPIRVBaseInfo.h"
#include "SPIRV.h"
#include "SPIRVCommandLine.h"
#include "SPIRVGlobalRegistry.h"
#include "SPIRVLegalizerInfo.h"
#include "SPIRVRegisterBankInfo.h"
#include "SPIRVTargetMachine.h"
+
#include "llvm/TargetParser/Host.h"
using namespace llvm;
@@ -207,6 +210,34 @@ void SPIRVSubtarget::resolveEnvFromModule(const Module &M) {
}
}
+ if (!HasShaderAttr) {
+ if (auto *MemModel = M.getNamedMetadata("spirv.MemoryModel")) {
+ if (MemModel->getNumOperands() == 0)
+ report_fatal_error("Invalid spirv.MemoryModel metadata");
+ auto *MemMD = MemModel->getOperand(0);
+ if (MemMD->getNumOperands() < 2)
+ report_fatal_error("Invalid spirv.MemoryModel operand");
+ unsigned MemModelVal =
+ mdconst::extract<ConstantInt>(MemMD->getOperand(1))->getZExtValue();
+ switch (MemModelVal) {
+ case SPIRV::MemoryModel::Simple:
+ case SPIRV::MemoryModel::GLSL450:
+ HasShaderAttr = true;
+ break;
+ case SPIRV::MemoryModel::VulkanKHR:
+ HasShaderAttr = true;
+ AvailableExtensions.insert(
+ SPIRV::Extension::SPV_KHR_vulkan_memory_model);
+ break;
+ case SPIRV::MemoryModel::OpenCL:
+ break;
+ default:
+ report_fatal_error(
+ "Unknown memory model in spirv.MemoryModel metadata");
+ }
+ }
+ }
+
setEnv(HasShaderAttr ? Shader : Kernel);
}
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index 8a152cd58d517..d541ead5ac22c 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -347,7 +347,11 @@ bool matchPeeledArrayPattern(const StructType *Ty, Type *&OriginalElementType,
Type *reconstitutePeeledArrayType(Type *Ty);
inline bool hasInitializer(const GlobalVariable *GV) {
- return GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer());
+ if (!GV->hasInitializer())
+ return false;
+ if (const auto *Init = GV->getInitializer(); isa<UndefValue>(Init))
+ return GV->isConstant() && Init->getType()->isAggregateType();
+ return true;
}
// True if this is an instance of TypedPointerType.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index fae06111f0163..10911345cedea 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -189,7 +189,7 @@ class WebAssemblyFastISel final : public FastISel {
bool selectBitCast(const Instruction *I);
bool selectLoad(const Instruction *I);
bool selectStore(const Instruction *I);
- bool selectBr(const Instruction *I);
+ bool selectCondBr(const Instruction *I);
bool selectRet(const Instruction *I);
bool selectUnreachable(const Instruction *I);
@@ -1498,7 +1498,7 @@ bool WebAssemblyFastISel::selectStore(const Instruction *I) {
return true;
}
-bool WebAssemblyFastISel::selectBr(const Instruction *I) {
+bool WebAssemblyFastISel::selectCondBr(const Instruction *I) {
const auto *Br = cast<CondBrInst>(I);
MachineBasicBlock *TBB = FuncInfo.getMBB(Br->getSuccessor(0));
@@ -1610,7 +1610,7 @@ bool WebAssemblyFastISel::fastSelectInstruction(const Instruction *I) {
case Instruction::Store:
return selectStore(I);
case Instruction::CondBr:
- return selectBr(I);
+ return selectCondBr(I);
case Instruction::Ret:
return selectRet(I);
case Instruction::Unreachable:
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index 2541b0433ab59..4a63be7ce9e34 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -16,6 +16,7 @@
#include "WebAssemblyISelLowering.h"
#include "WebAssemblyTargetMachine.h"
#include "WebAssemblyUtilities.h"
+#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/WasmEHFuncInfo.h"
@@ -69,6 +70,12 @@ class WebAssemblyDAGToDAGISel final : public SelectionDAGISel {
bool SelectAddrOperands32(SDValue Op, SDValue &Offset, SDValue &Addr);
bool SelectAddrOperands64(SDValue Op, SDValue &Offset, SDValue &Addr);
+ bool SelectAtomicAddrOperands(SDNode *Op, SDValue N, SDValue &Offset,
+ SDValue &Addr, SDValue &Order, bool Is64);
+ bool SelectAtomicAddrOperands32(SDNode *Op, SDValue N, SDValue &Offset,
+ SDValue &Addr, SDValue &Order);
+ bool SelectAtomicAddrOperands64(SDNode *Op, SDValue N, SDValue &Offset,
+ SDValue &Addr, SDValue &Order);
// Include the pieces autogenerated from the target description.
#include "WebAssemblyGenDAGISel.inc"
@@ -174,6 +181,25 @@ static APInt encodeFunctionSignature(SelectionDAG *DAG, SDLoc &DL,
return Sig;
}
+static unsigned getWebAssemblyMemoryOrder(AtomicOrdering Ordering) {
+ unsigned OrderVal = wasm::WASM_MEM_ORDER_SEQ_CST;
+ switch (Ordering) {
+ case AtomicOrdering::Unordered:
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::Acquire:
+ case AtomicOrdering::Release:
+ case AtomicOrdering::AcquireRelease:
+ OrderVal = wasm::WASM_MEM_ORDER_ACQ_REL;
+ break;
+ case AtomicOrdering::SequentiallyConsistent:
+ OrderVal = wasm::WASM_MEM_ORDER_SEQ_CST;
+ break;
+ default:
+ llvm_unreachable("Invalid atomic ordering");
+ }
+ return OrderVal;
+}
+
void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
@@ -186,7 +212,6 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
auto GlobalGetIns = PtrVT == MVT::i64 ? WebAssembly::GLOBAL_GET_I64
: WebAssembly::GLOBAL_GET_I32;
- // Few custom selection stuff.
SDLoc DL(Node);
MachineFunction &MF = CurDAG->getMachineFunction();
switch (Node->getOpcode()) {
@@ -207,17 +232,22 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
Node->getOperand(0) // inchain
);
break;
- case SyncScope::System:
- // Currently wasm only supports sequentially consistent atomics, so we
- // always set the order to 0 (sequentially consistent).
+ case SyncScope::System: {
+ unsigned Order = wasm::WASM_MEM_ORDER_SEQ_CST;
+ if (MF.getSubtarget<WebAssemblySubtarget>().hasRelaxedAtomics()) {
+ auto Ordering =
+ static_cast<AtomicOrdering>(Node->getConstantOperandVal(1));
+ Order = getWebAssemblyMemoryOrder(Ordering);
+ }
Fence = CurDAG->getMachineNode(
WebAssembly::ATOMIC_FENCE,
- DL, // debug loc
- MVT::Other, // outchain type
- CurDAG->getTargetConstant(0, DL, MVT::i32), // order
- Node->getOperand(0) // inchain
+ DL, // debug loc
+ MVT::Other, // outchain type
+ CurDAG->getTargetConstant(Order, DL, MVT::i32), // order
+ Node->getOperand(0) // inchain
);
break;
+ }
default:
llvm_unreachable("Unknown scope!");
}
@@ -548,6 +578,65 @@ bool WebAssemblyDAGToDAGISel::SelectAddrOperands64(SDValue Op, SDValue &Offset,
return SelectAddrOperands(MVT::i64, WebAssembly::CONST_I64, Op, Offset, Addr);
}
+static MemSDNode *findMemSDNode(SDNode *N) {
+ while (N) {
+ if (auto *MemNode = dyn_cast<MemSDNode>(N))
+ return MemNode;
+ switch (N->getOpcode()) {
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::AssertZext:
+ case ISD::AssertSext:
+ case ISD::TRUNCATE:
+ case ISD::BITCAST:
+ case ISD::AND:
+ N = N->getOperand(0).getNode();
+ break;
+ default:
+ return nullptr;
+ }
+ }
+ return nullptr;
+}
+
+bool WebAssemblyDAGToDAGISel::SelectAtomicAddrOperands(SDNode *Op, SDValue N,
+ SDValue &Offset,
+ SDValue &Addr,
+ SDValue &Order,
+ bool Is64) {
+ auto *MemNode = findMemSDNode(Op);
+ if (!MemNode)
+ return false;
+
+ bool Match = Is64 ? SelectAddrOperands64(N, Offset, Addr)
+ : SelectAddrOperands32(N, Offset, Addr);
+ if (!Match)
+ return false;
+
+ auto Ordering = MemNode->getMergedOrdering();
+ unsigned OrderVal = wasm::WASM_MEM_ORDER_SEQ_CST;
+ if (Subtarget->hasRelaxedAtomics())
+ OrderVal = getWebAssemblyMemoryOrder(Ordering);
+ Order = CurDAG->getTargetConstant(OrderVal, SDLoc(Op), MVT::i32);
+ return true;
+}
+
+bool WebAssemblyDAGToDAGISel::SelectAtomicAddrOperands32(SDNode *Op, SDValue N,
+ SDValue &Offset,
+ SDValue &Addr,
+ SDValue &Order) {
+ return SelectAtomicAddrOperands(Op, N, Offset, Addr, Order, /*Is64=*/false);
+}
+
+bool WebAssemblyDAGToDAGISel::SelectAtomicAddrOperands64(SDNode *Op, SDValue N,
+ SDValue &Offset,
+ SDValue &Addr,
+ SDValue &Order) {
+ return SelectAtomicAddrOperands(Op, N, Offset, Addr, Order, /*Is64=*/true);
+}
+
/// This pass converts a legalized DAG into a WebAssembly-specific DAG, ready
/// for instruction scheduling.
FunctionPass *llvm::createWebAssemblyISelDag(WebAssemblyTargetMachine &TM,
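
A hedged standalone illustration of the ordering collapse
getWebAssemblyMemoryOrder applies: everything weaker than seq_cst becomes the
acq_rel immediate, and seq_cst stays seq_cst. The numeric encodings below are
assumptions for the sketch only; the patch takes the real constants from
llvm/BinaryFormat/Wasm.h (wasm::WASM_MEM_ORDER_SEQ_CST,
wasm::WASM_MEM_ORDER_ACQ_REL).

  #include <cstdio>

  // Assumed encodings, for this sketch only.
  enum WasmMemOrder : unsigned { SeqCst = 0, AcqRel = 1 };

  enum class Ordering {
    Unordered, Monotonic, Acquire, Release, AcquireRelease,
    SequentiallyConsistent
  };

  // Mirrors the collapse in getWebAssemblyMemoryOrder.
  static unsigned toWasmOrder(Ordering O) {
    return O == Ordering::SequentiallyConsistent ? SeqCst : AcqRel;
  }

  int main() {
    std::printf("acquire -> %u\n", toWasmOrder(Ordering::Acquire));
    std::printf("seq_cst -> %u\n", toWasmOrder(Ordering::SequentiallyConsistent));
    return 0;
  }
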
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index e9de06babc8f1..874ea2be79a33 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -199,6 +199,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Combine wide-vector muls, with extend inputs, to extmul_half.
setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::SHL);
// Combine vector mask reductions into alltrue/anytrue
setTargetDAGCombine(ISD::SETCC);
@@ -2806,7 +2807,6 @@ static SDValue unrollVectorShift(SDValue Op, SelectionDAG &DAG) {
SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
-
// Only manually lower vector shifts
assert(Op.getSimpleValueType().isVector());
@@ -3336,6 +3336,25 @@ static SDValue performBitcastCombine(SDNode *N,
return SDValue();
}
+static SDValue performBitmaskCombine(SDNode *N, SelectionDAG &DAG) {
+ // bitmask (setcc <X>, 0, setlt) => bitmask X
+ assert(N->getOpcode() == ISD::INTRINSIC_WO_CHAIN);
+ using namespace llvm::SDPatternMatch;
+
+ if (N->getConstantOperandVal(0) != Intrinsic::wasm_bitmask)
+ return SDValue();
+
+ SDValue LHS;
+ if (!sd_match(N->getOperand(1), m_c_SetCC(m_Value(LHS), m_Zero(),
+ m_SpecificCondCode(ISD::SETLT))))
+ return SDValue();
+
+ SDLoc DL(N);
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, N->getValueType(0),
+ {DAG.getConstant(Intrinsic::wasm_bitmask, DL, MVT::i32), LHS});
+}
+
static SDValue performAnyAllCombine(SDNode *N, SelectionDAG &DAG) {
// any_true (setcc <X>, 0, eq) => (not (all_true X))
// all_true (setcc <X>, 0, eq) => (not (any_true X))
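
The bitmask combine above relies on the bitmask instructions already
collecting the per-lane sign bits: a lane-wise x < 0 compare produces all-ones
exactly in the lanes whose sign bit is set, so the compare is redundant. A
scalar sketch of the equivalence (plain C++, illustrative only):

  #include <cassert>
  #include <cstdint>

  // bitmask takes the sign bit of each lane; (x < 0) ? -1 : 0 has the same
  // sign bit as x, so bitmask(setlt(x, 0)) == bitmask(x) lane by lane.
  static unsigned signBit(int32_t Lane) {
    return static_cast<uint32_t>(Lane) >> 31;
  }

  int main() {
    const int32_t Lanes[] = {0, 1, -1, INT32_MIN, INT32_MAX, -42};
    for (int32_t L : Lanes) {
      int32_t CmpLane = (L < 0) ? -1 : 0; // what the vector setcc lane holds
      assert(signBit(CmpLane) == signBit(L));
    }
    return 0;
  }
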
@@ -3719,6 +3738,76 @@ SDValue performConvertFPCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+// Wide vector shift operations such as v8i32 with sign-extended
+// operands cause Type Legalizer crashes because the target-specific
+// extension nodes cannot be directly mapped to the 256-bit size.
+//
+// To resolve the crash and optimize performance, we intercept the
+// illegal v8i32 shift in DAGCombine. We convert the shift amounts
+// into multipliers and manually split the vector into two v4i32 halves.
+//
+// Before: t1: v8i32 = shl (sign_extend v8i16), const_vec
+// After : t2: v4i32 = mul (ext_low_s v8i16), (ext_low_s narrow_vec)
+// t3: v4i32 = mul (ext_high_s v8i16), (ext_high_s narrow_vec)
+// t4: v8i32 = concat_vectors t2, t3
+static SDValue performShiftCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
+ assert(N->getOpcode() == ISD::SHL);
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v8i32)
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ unsigned ExtOpc = LHS.getOpcode();
+ if (ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND)
+ return SDValue();
+
+ if (RHS.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue ExtendIn = LHS.getOperand(0);
+ EVT FromVT = ExtendIn.getValueType();
+ if (FromVT != MVT::v8i16)
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned BitWidth = FromVT.getScalarSizeInBits();
+ bool IsSigned = (ExtOpc == ISD::SIGN_EXTEND);
+ unsigned MaxValidShift = IsSigned ? (BitWidth - 1) : BitWidth;
+ SmallVector<SDValue, 16> MulConsts;
+ for (unsigned I = 0; I < NumElts; ++I) {
+ auto *C = dyn_cast<ConstantSDNode>(RHS.getOperand(I));
+ if (!C)
+ return SDValue();
+
+ const APInt &ShiftAmt = C->getAPIntValue();
+ if (ShiftAmt.uge(MaxValidShift))
+ return SDValue();
+
+ APInt MulAmt = APInt(BitWidth, 1).shl(ShiftAmt);
+ MulConsts.push_back(DAG.getConstant(MulAmt, DL, FromVT.getScalarType(),
+ /*isTarget=*/false, /*isOpaque=*/true));
+ }
+
+ SDValue NarrowConst = DAG.getBuildVector(FromVT, DL, MulConsts);
+ unsigned ExtLowOpc =
+ IsSigned ? WebAssemblyISD::EXTEND_LOW_S : WebAssemblyISD::EXTEND_LOW_U;
+ unsigned ExtHighOpc =
+ IsSigned ? WebAssemblyISD::EXTEND_HIGH_S : WebAssemblyISD::EXTEND_HIGH_U;
+
+ EVT HalfVT = MVT::v4i32;
+ SDValue LHSLo = DAG.getNode(ExtLowOpc, DL, HalfVT, ExtendIn);
+ SDValue LHSHi = DAG.getNode(ExtHighOpc, DL, HalfVT, ExtendIn);
+ SDValue RHSLo = DAG.getNode(ExtLowOpc, DL, HalfVT, NarrowConst);
+ SDValue RHSHi = DAG.getNode(ExtHighOpc, DL, HalfVT, NarrowConst);
+ SDValue MulLo = DAG.getNode(ISD::MUL, DL, HalfVT, LHSLo, RHSLo);
+ SDValue MulHi = DAG.getNode(ISD::MUL, DL, HalfVT, LHSHi, RHSHi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, MulLo, MulHi);
+}
+
SDValue
WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -3750,9 +3839,14 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
return performConvertFPCombine(N, DCI.DAG);
case ISD::TRUNCATE:
return performTruncateCombine(N, DCI);
- case ISD::INTRINSIC_WO_CHAIN:
+ case ISD::INTRINSIC_WO_CHAIN: {
+ if (SDValue V = performBitmaskCombine(N, DCI.DAG))
+ return V;
return performAnyAllCombine(N, DCI.DAG);
+ }
case ISD::MUL:
return performMulCombine(N, DCI);
+ case ISD::SHL:
+ return performShiftCombine(N, DCI);
}
}
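
A standalone scalar check of the rewrite performShiftCombine applies: shifting
a sign-extended i16 lane left by a constant c (c < 15 for signed inputs, as
the combine requires) equals multiplying the extended lane by 1 << c, and the
v8i32 result is simply the low four and high four lanes computed separately
and then concatenated. Plain C++, illustrative only:

  #include <cassert>
  #include <cstdint>

  int main() {
    // Lanes of the v8i16 input and per-lane shift amounts (< 15 for the
    // sign-extended case).
    const int16_t Lanes[8] = {-32768, -1, 0, 1, 2, 123, 32767, -7};
    const unsigned Shift[8] = {0, 1, 3, 7, 12, 14, 13, 5};

    int32_t Lo[4], Hi[4]; // the two v4i32 halves of the concat_vectors result
    for (unsigned I = 0; I < 8; ++I) {
      int32_t Ext = Lanes[I];                                  // sign_extend to i32
      uint32_t ByShl = static_cast<uint32_t>(Ext) << Shift[I]; // original shl lane
      int32_t ByMul = Ext * (INT32_C(1) << Shift[I]);          // rewritten mul lane
      assert(static_cast<uint32_t>(ByMul) == ByShl);
      (I < 4 ? Lo[I] : Hi[I - 4]) = ByMul;
    }
    (void)Lo;
    (void)Hi;
    return 0;
  }
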
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
index c10c8805f8c54..90b8506fdba8d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -11,6 +11,13 @@
///
//===----------------------------------------------------------------------===//
+def AtomicAddrOps32 : ComplexPattern<i32, 3, "SelectAtomicAddrOperands32"> {
+ let WantsRoot = 1;
+}
+def AtomicAddrOps64 : ComplexPattern<i64, 3, "SelectAtomicAddrOperands64"> {
+ let WantsRoot = 1;
+}
+
let UseNamedOperandTable = 1 in
multiclass ATOMIC_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
list<dag> pattern_r, string asmstr_r,
@@ -150,11 +157,11 @@ multiclass AtomicLoad<WebAssemblyRegClass rc, string name, int atomic_op> {
}
multiclass AtomicLoadPat<ValueType ty, SDPatternOperator kind, string Name> {
- def : Pat<(ty (kind (AddrOps32 offset32_op:$offset, I32:$addr))),
- (!cast<NI>(Name # "_A32") 0, 0, offset32_op:$offset, I32:$addr)>,
+ def : Pat<(ty (kind (AtomicAddrOps32 offset32_op:$offset, I32:$addr, i32imm:$order))),
+ (!cast<NI>(Name # "_A32") $order, 0, offset32_op:$offset, I32:$addr)>,
Requires<[HasAddr32, HasAtomics]>;
- def : Pat<(ty (kind (AddrOps64 offset64_op:$offset, I64:$addr))),
- (!cast<NI>(Name # "_A64") 0, 0, offset64_op:$offset, I64:$addr)>,
+ def : Pat<(ty (kind (AtomicAddrOps64 offset64_op:$offset, I64:$addr, i32imm:$order))),
+ (!cast<NI>(Name # "_A64") $order, 0, offset64_op:$offset, I64:$addr)>,
Requires<[HasAddr64, HasAtomics]>;
}
@@ -245,18 +252,12 @@ multiclass AtomicStore<WebAssemblyRegClass rc, string name, int atomic_op> {
defm ATOMIC_STORE_I32 : AtomicStore<I32, "i32.atomic.store", 0x17>;
defm ATOMIC_STORE_I64 : AtomicStore<I64, "i64.atomic.store", 0x18>;
-// We used to need an 'atomic' version of store patterns because store and atomic_store
-// nodes have different operand orders.
-//
-// TODO: This is no longer true and atomic_store and store patterns
-// can be unified.
-
multiclass AStorePat<ValueType ty, PatFrag kind, string inst> {
- def : Pat<(kind ty:$val, (AddrOps32 offset32_op:$offset, I32:$addr)),
- (!cast<NI>(inst#_A32) 0, 0, $offset, $addr, $val)>,
+ def : Pat<(kind ty:$val, (AtomicAddrOps32 offset32_op:$offset, I32:$addr, i32imm:$order)),
+ (!cast<NI>(inst#_A32) $order, 0, $offset, $addr, $val)>,
Requires<[HasAddr32, HasAtomics]>;
- def : Pat<(kind ty:$val, (AddrOps64 offset64_op:$offset, I64:$addr)),
- (!cast<NI>(inst#_A64) 0, 0, $offset, $addr, $val)>,
+ def : Pat<(kind ty:$val, (AtomicAddrOps64 offset64_op:$offset, I64:$addr, i32imm:$order)),
+ (!cast<NI>(inst#_A64) $order, 0, $offset, $addr, $val)>,
Requires<[HasAddr64, HasAtomics]>;
}
defm : AStorePat<i32, atomic_store_32, "ATOMIC_STORE_I32">;
@@ -393,11 +394,13 @@ defm ATOMIC_RMW32_U_XCHG_I64 :
WebAssemblyBinRMW<I64, "i64.atomic.rmw32.xchg_u", 0x47>;
multiclass BinRMWPat<ValueType ty, PatFrag kind, string inst> {
- def : Pat<(ty (kind (AddrOps32 offset32_op:$offset, I32:$addr), ty:$val)),
- (!cast<NI>(inst#_A32) 0, 0, $offset, $addr, $val)>,
+ def : Pat<(ty (kind (AtomicAddrOps32 offset32_op:$offset, I32:$addr,
+ i32imm:$order), ty:$val)),
+ (!cast<NI>(inst#_A32) $order, 0, $offset, $addr, $val)>,
Requires<[HasAddr32, HasAtomics]>;
- def : Pat<(ty (kind (AddrOps64 offset64_op:$offset, I64:$addr), ty:$val)),
- (!cast<NI>(inst#_A64) 0, 0, $offset, $addr, $val)>,
+ def : Pat<(ty (kind (AtomicAddrOps64 offset64_op:$offset, I64:$addr,
+ i32imm:$order), ty:$val)),
+ (!cast<NI>(inst#_A64) $order, 0, $offset, $addr, $val)>,
Requires<[HasAddr64, HasAtomics]>;
}
@@ -536,11 +539,11 @@ defm ATOMIC_RMW32_U_CMPXCHG_I64 :
WebAssemblyTerRMW<I64, "i64.atomic.rmw32.cmpxchg_u", 0x4e>;
multiclass TerRMWPat<ValueType ty, PatFrag kind, string inst> {
- def : Pat<(ty (kind (AddrOps32 offset32_op:$offset, I32:$addr), ty:$exp, ty:$new)),
- (!cast<NI>(inst#_A32) 0, 0, $offset, $addr, $exp, $new)>,
+ def : Pat<(ty (kind (AtomicAddrOps32 offset32_op:$offset, I32:$addr, i32imm:$order), ty:$exp, ty:$new)),
+ (!cast<NI>(inst#_A32) $order, 0, $offset, $addr, $exp, $new)>,
Requires<[HasAddr32, HasAtomics]>;
- def : Pat<(ty (kind (AddrOps64 offset64_op:$offset, I64:$addr), ty:$exp, ty:$new)),
- (!cast<NI>(inst#_A64) 0, 0, $offset, $addr, $exp, $new)>,
+ def : Pat<(ty (kind (AtomicAddrOps64 offset64_op:$offset, I64:$addr, i32imm:$order), ty:$exp, ty:$new)),
+ (!cast<NI>(inst#_A64) $order, 0, $offset, $addr, $exp, $new)>,
Requires<[HasAddr64, HasAtomics]>;
}
diff --git a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp
index bbc7f464eda4a..d9830e93f0c1f 100644
--- a/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp
+++ b/llvm/lib/Target/X86/GISel/X86InstructionSelector.cpp
@@ -823,8 +823,9 @@ bool X86InstructionSelector::selectConstant(MachineInstr &I,
NewOpc = X86::MOV32ri;
break;
case 64:
- // TODO: in case isUInt<32>(Val), X86::MOV32ri can be used
- if (isInt<32>(Val))
+ if (isUInt<32>(Val))
+ NewOpc = X86::MOV32ri64;
+ else if (isInt<32>(Val))
NewOpc = X86::MOV64ri32;
else
NewOpc = X86::MOV64ri;
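
The selectConstant change above prefers MOV32ri64, whose 32-bit immediate
zero-extends into the 64-bit destination, before falling back to the
sign-extending MOV64ri32 or the full MOV64ri. A small sketch of which bucket a
few 64-bit immediates land in; isUInt32/isInt32 here are local stand-ins for
llvm::isUInt<32>/llvm::isInt<32> from llvm/Support/MathExtras.h:

  #include <cstdint>
  #include <cstdio>

  // Local stand-ins for llvm::isUInt<32> / llvm::isInt<32>.
  static bool isUInt32(int64_t V) { return V >= 0 && V <= INT64_C(0xFFFFFFFF); }
  static bool isInt32(int64_t V) { return V >= INT32_MIN && V <= INT32_MAX; }

  int main() {
    const int64_t Vals[] = {42, -1, 0x80000000LL, 0xFFFFFFFFLL, 0x100000000LL};
    for (int64_t V : Vals) {
      const char *Opc = isUInt32(V)  ? "MOV32ri64 (zero-extends)"
                        : isInt32(V) ? "MOV64ri32 (sign-extends)"
                                     : "MOV64ri (full 64-bit imm)";
      std::printf("%#llx -> %s\n", static_cast<unsigned long long>(V), Opc);
    }
    return 0;
  }
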
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index eca763735c315..048d5df098e0f 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -371,6 +371,8 @@ def FeatureCF : SubtargetFeature<"cf", "HasCF", "true",
"Support conditional faulting">;
def FeatureZU : SubtargetFeature<"zu", "HasZU", "true",
"Support zero-upper SETcc/IMUL">;
+def FeatureJMPABS : SubtargetFeature<"jmpabs", "HasJMPABS", "true",
+ "Support 64-bit absolute JMP">;
def FeatureUseGPR32InInlineAsm
: SubtargetFeature<"inline-asm-use-gpr32", "UseInlineAsmGPR32", "true",
"Enable use of GPR32 in inline assembly for APX">;
@@ -777,6 +779,14 @@ def TuningUseGLMDivSqrtCosts
: SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true",
"Use Goldmont specific floating point div/sqrt costs">;
+def TuningNDDImm
+ : SubtargetFeature<"prefer-ndd-imm", "HasNDDI",
+ "true", "Prefer NDD immediate variant">;
+
+def TuningNDDMem
+ : SubtargetFeature<"prefer-ndd-mem", "HasNDDM",
+ "true", "Prefer NDD memory addressing">;
+
// Starting with Redwood Cove architecture, the branch has branch taken hint
// (i.e., instruction prefix 3EH).
def TuningBranchHint: SubtargetFeature<"branch-hint", "HasBranchHint", "true",
@@ -1188,6 +1198,7 @@ def ProcessorFeatures {
FeaturePPX,
FeatureNDD,
FeatureNF,
+ FeatureJMPABS,
FeatureMOVRS,
FeatureAMXMOVRS,
FeatureAMXAVX512,
@@ -1361,6 +1372,7 @@ def ProcessorFeatures {
FeatureNF,
FeatureNDD,
FeatureZU,
+ FeatureJMPABS,
FeatureCCMP,
FeaturePREFETCHI];
list<SubtargetFeature> NVLFeatures =
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 9fe7902e39fb9..7290ab2ed20f2 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -899,6 +899,11 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
}
return false;
}
+ } else {
+ // Constraint 'p' requires modifier 'a'.
+ const InlineAsm::Flag Flags(MI->getOperand(OpNo - 1).getImm());
+ if (Flags.getMemoryConstraintID() == InlineAsm::ConstraintCode::p)
+ return true;
}
if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) {
PrintIntelMemReference(MI, OpNo, O);
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 0093a6b29d226..3dbe6d14c610e 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -3200,11 +3200,27 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isCallingConvWin64(CC);
- // If the return type is illegal, don't bother to promote it, just fall back
- // to DAG ISel.
- MVT RetVT;
- if (!isTypeLegal(CLI.RetTy, RetVT) && !CLI.RetTy->isVoidTy())
- return false;
+ // If the return type is illegal, check if the ABI requires a type conversion
+ // that FastISel cannot handle. Fall back to DAG ISel in such cases.
+ // For example, bfloat is returned as f16 in XMM0, but FastISel would
+ // assign it the f32 register type and store that in FuncInfo.ValueMap. The
+ // DAG would then incorrectly convert from f32 to bfloat after fetching the
+ // value from FuncInfo.ValueMap.
+ // However, i1 is promoted to i8 and returned as i8 per the ABI, so FastISel
+ // can lower it without switching to DAG ISel.
+ MVT RetVT = MVT::Other;
+ if (!isTypeLegal(CLI.RetTy, RetVT) && !CLI.RetTy->isVoidTy()) {
+ if (RetVT == MVT::Other)
+ return false; // Unknown type, let DAG ISel handle it.
+
+    // RetVT is not MVT::Other, so it must be a simple type at this point;
+    // this relies on the logic of isTypeLegal().
+ MVT ABIVT = TLI.getRegisterTypeForCallingConv(CLI.RetTy->getContext(),
+ CLI.CallConv, RetVT);
+ MVT RegVT = TLI.getRegisterType(CLI.RetTy->getContext(), RetVT);
+ if (ABIVT != RegVT)
+ return false;
+ }
// Call / invoke instructions with NoCfCheck attribute require special
// handling.
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index cc846e0d1492e..be95168f2de00 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -214,9 +214,11 @@ namespace {
bool matchVectorAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth);
bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
- bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
- SDValue &Scale, SDValue &Index, SDValue &Disp,
- SDValue &Segment);
+ bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp, SDValue &Segment,
+ bool HasNDDM = true);
+ bool selectNDDAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp, SDValue &Segment);
bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
SDValue ScaleOp, SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp, SDValue &Segment);
@@ -273,6 +275,8 @@ namespace {
Scale = getI8Imm(AM.Scale, DL);
#define GET_ND_IF_ENABLED(OPC) (Subtarget->hasNDD() ? OPC##_ND : OPC)
+#define GET_NDM_IF_ENABLED(OPC) \
+ (Subtarget->hasNDD() && Subtarget->hasNDDM() ? OPC##_ND : OPC)
// Negate the index if needed.
if (AM.NegateIndex) {
unsigned NegOpc;
@@ -3018,8 +3022,8 @@ bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
/// is always a load, store, atomic node, or null. It is only null when
/// checking memory operands for inline asm nodes.
bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
- SDValue &Scale, SDValue &Index,
- SDValue &Disp, SDValue &Segment) {
+ SDValue &Scale, SDValue &Index, SDValue &Disp,
+ SDValue &Segment, bool HasNDDM) {
X86ISelAddressMode AM;
if (Parent &&
@@ -3049,10 +3053,20 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
if (matchAddress(N, AM))
return false;
+ if (!HasNDDM && !AM.isRIPRelative())
+ return false;
+
getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
return true;
}
+bool X86DAGToDAGISel::selectNDDAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
+ return selectAddr(Parent, N, Base, Scale, Index, Disp, Segment,
+ Subtarget->hasNDDM());
+}
+
bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
// Cannot use 32 bit constants to reference objects in kernel/large code
// model.
@@ -5050,6 +5064,21 @@ VPTESTM_CASE(v32i16, WZ##SUFFIX)
#undef VPTESTM_CASE
}
+static void orderRegForMul(SDValue &N0, SDValue &N1, const unsigned LoReg,
+ const MachineRegisterInfo &MRI) {
+ auto GetPhysReg = [&](SDValue V) -> Register {
+ if (V.getOpcode() != ISD::CopyFromReg)
+ return Register();
+ Register Reg = cast<RegisterSDNode>(V.getOperand(1))->getReg();
+ if (Reg.isVirtual())
+ return MRI.getLiveInPhysReg(Reg);
+ return Reg;
+ };
+
+ if (GetPhysReg(N1) == LoReg && GetPhysReg(N0) != LoReg)
+ std::swap(N0, N1);
+}
+
// Try to create VPTESTM instruction. If InMask is not null, it will be used
// to form a masked operation.
bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
@@ -5633,23 +5662,23 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::ADD:
ROpc = GET_ND_IF_ENABLED(X86::ADD8rr);
- MOpc = GET_ND_IF_ENABLED(X86::ADD8rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::ADD8rm);
break;
case ISD::SUB:
ROpc = GET_ND_IF_ENABLED(X86::SUB8rr);
- MOpc = GET_ND_IF_ENABLED(X86::SUB8rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::SUB8rm);
break;
case ISD::AND:
ROpc = GET_ND_IF_ENABLED(X86::AND8rr);
- MOpc = GET_ND_IF_ENABLED(X86::AND8rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::AND8rm);
break;
case ISD::OR:
ROpc = GET_ND_IF_ENABLED(X86::OR8rr);
- MOpc = GET_ND_IF_ENABLED(X86::OR8rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::OR8rm);
break;
case ISD::XOR:
ROpc = GET_ND_IF_ENABLED(X86::XOR8rr);
- MOpc = GET_ND_IF_ENABLED(X86::XOR8rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::XOR8rm);
break;
}
break;
@@ -5658,23 +5687,23 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::ADD:
ROpc = GET_ND_IF_ENABLED(X86::ADD16rr);
- MOpc = GET_ND_IF_ENABLED(X86::ADD16rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::ADD16rm);
break;
case ISD::SUB:
ROpc = GET_ND_IF_ENABLED(X86::SUB16rr);
- MOpc = GET_ND_IF_ENABLED(X86::SUB16rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::SUB16rm);
break;
case ISD::AND:
ROpc = GET_ND_IF_ENABLED(X86::AND16rr);
- MOpc = GET_ND_IF_ENABLED(X86::AND16rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::AND16rm);
break;
case ISD::OR:
ROpc = GET_ND_IF_ENABLED(X86::OR16rr);
- MOpc = GET_ND_IF_ENABLED(X86::OR16rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::OR16rm);
break;
case ISD::XOR:
ROpc = GET_ND_IF_ENABLED(X86::XOR16rr);
- MOpc = GET_ND_IF_ENABLED(X86::XOR16rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::XOR16rm);
break;
}
break;
@@ -5683,23 +5712,23 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::ADD:
ROpc = GET_ND_IF_ENABLED(X86::ADD32rr);
- MOpc = GET_ND_IF_ENABLED(X86::ADD32rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::ADD32rm);
break;
case ISD::SUB:
ROpc = GET_ND_IF_ENABLED(X86::SUB32rr);
- MOpc = GET_ND_IF_ENABLED(X86::SUB32rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::SUB32rm);
break;
case ISD::AND:
ROpc = GET_ND_IF_ENABLED(X86::AND32rr);
- MOpc = GET_ND_IF_ENABLED(X86::AND32rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::AND32rm);
break;
case ISD::OR:
ROpc = GET_ND_IF_ENABLED(X86::OR32rr);
- MOpc = GET_ND_IF_ENABLED(X86::OR32rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::OR32rm);
break;
case ISD::XOR:
ROpc = GET_ND_IF_ENABLED(X86::XOR32rr);
- MOpc = GET_ND_IF_ENABLED(X86::XOR32rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::XOR32rm);
break;
}
break;
@@ -5708,23 +5737,23 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
default: llvm_unreachable("Unexpected opcode!");
case ISD::ADD:
ROpc = GET_ND_IF_ENABLED(X86::ADD64rr);
- MOpc = GET_ND_IF_ENABLED(X86::ADD64rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::ADD64rm);
break;
case ISD::SUB:
ROpc = GET_ND_IF_ENABLED(X86::SUB64rr);
- MOpc = GET_ND_IF_ENABLED(X86::SUB64rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::SUB64rm);
break;
case ISD::AND:
ROpc = GET_ND_IF_ENABLED(X86::AND64rr);
- MOpc = GET_ND_IF_ENABLED(X86::AND64rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::AND64rm);
break;
case ISD::OR:
ROpc = GET_ND_IF_ENABLED(X86::OR64rr);
- MOpc = GET_ND_IF_ENABLED(X86::OR64rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::OR64rm);
break;
case ISD::XOR:
ROpc = GET_ND_IF_ENABLED(X86::XOR64rr);
- MOpc = GET_ND_IF_ENABLED(X86::XOR64rm);
+ MOpc = GET_NDM_IF_ENABLED(X86::XOR64rm);
break;
}
break;
@@ -5796,6 +5825,11 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
std::swap(N0, N1);
}
+ // UMUL/SMUL have an implicit source in LoReg (AL/AX/EAX/RAX). Prefer the
+ // operand that's already there to avoid an extra register-to-register move.
+ if (!FoldedLoad)
+ orderRegForMul(N0, N1, LoReg, CurDAG->getMachineFunction().getRegInfo());
+
SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
N0, SDValue()).getValue(1);
@@ -5882,23 +5916,10 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
std::swap(N0, N1);
}
- // For MULX, the implicit source must be in RDX (LoReg). If N1 is
- // already a CopyFromReg of LoReg and N0 is not, flip so that N0
- // (which feeds the CopyToReg below) is the operand already in LoReg,
- // avoiding an unnecessary register-to-register copy before the multiply.
- if (UseMULX && !foldedLoad) {
- MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
- auto GetPhysReg = [&](SDValue V) -> Register {
- if (V.getOpcode() != ISD::CopyFromReg)
- return Register();
- Register Reg = cast<RegisterSDNode>(V.getOperand(1))->getReg();
- if (Reg.isVirtual())
- return MRI.getLiveInPhysReg(Reg);
- return Reg;
- };
- if (GetPhysReg(N1) == LoReg && GetPhysReg(N0) != LoReg)
- std::swap(N0, N1);
- }
+ // UMUL/SMUL_LOHI has an implicit source in LoReg (RDX for MULX, RAX for
+ // MUL/IMUL). Prefer the operand that's already there.
+ if (!foldedLoad)
+ orderRegForMul(N0, N1, LoReg, CurDAG->getMachineFunction().getRegInfo());
SDValue InGlue = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
N0, SDValue()).getValue(1);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 62d8635ffc3eb..637388ced677a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10932,9 +10932,39 @@ static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
+/// Return Mask with the necessary casting or extending
+/// for \p Mask according to \p MaskVT when lowering masking intrinsics
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
- const SDLoc &dl);
+ const SDLoc &dl) {
+ MVT SrcVT = Mask.getSimpleValueType();
+ assert(SrcVT.isScalarInteger() && "Expected scalar integer mask source!");
+ assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
+ assert(MaskVT.getVectorElementType() == MVT::i1 && "Bool vector expected!");
+
+ if (isAllOnesConstant(Mask))
+ return DAG.getConstant(1, dl, MaskVT);
+ if (X86::isZeroNode(Mask))
+ return DAG.getConstant(0, dl, MaskVT);
+
+ if (SrcVT == MVT::i64 && Subtarget.is32Bit()) {
+ assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
+ assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
+ // In case 32bit mode, bitcast i64 is illegal, extend/split it.
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
+ Lo = DAG.getBitcast(MVT::v32i1, Lo);
+ Hi = DAG.getBitcast(MVT::v32i1, Hi);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
+ }
+
+ MVT BitcastVT = MVT::getVectorVT(MVT::i1, SrcVT.getSizeInBits());
+ // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
+ // are extracted by EXTRACT_SUBVECTOR.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
+ DAG.getBitcast(BitcastVT, Mask),
+ DAG.getVectorIdxConstant(0, dl));
+}
// X86 has dedicated shuffle that can be lowered to VEXPAND
static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
@@ -14227,14 +14257,18 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return V;
- if (Subtarget.hasSSE41() && !isSingleSHUFPSMask(Mask)) {
+ if (Subtarget.hasSSE41()) {
+ bool MatchesShufPS = isSingleSHUFPSMask(Mask);
+
// Use INSERTPS if we can complete the shuffle efficiently.
- if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
- return V;
+ if (!MatchesShufPS || Zeroable == 0x3 || Zeroable == 0xC)
+ if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
+ return V;
- if (SDValue BlendPerm =
- lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, V2, Mask, DAG))
- return BlendPerm;
+ if (!MatchesShufPS)
+ if (SDValue BlendPerm =
+ lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, V2, Mask, DAG))
+ return BlendPerm;
}
// Use low/high mov instructions. These are only valid in SSE1 because
@@ -26722,39 +26756,6 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
MachinePointerInfo(SrcSV));
}
-/// Return Mask with the necessary casting or extending
-/// for \p Mask according to \p MaskVT when lowering masking intrinsics
-static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
- const X86Subtarget &Subtarget, SelectionDAG &DAG,
- const SDLoc &dl) {
-
- if (isAllOnesConstant(Mask))
- return DAG.getConstant(1, dl, MaskVT);
- if (X86::isZeroNode(Mask))
- return DAG.getConstant(0, dl, MaskVT);
-
- assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!");
-
- if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
- assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
- assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
- // In case 32bit mode, bitcast i64 is illegal, extend/split it.
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32);
- Lo = DAG.getBitcast(MVT::v32i1, Lo);
- Hi = DAG.getBitcast(MVT::v32i1, Hi);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
- } else {
- MVT BitcastVT = MVT::getVectorVT(MVT::i1,
- Mask.getSimpleValueType().getSizeInBits());
- // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
- // are extracted by EXTRACT_SUBVECTOR.
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
- DAG.getBitcast(BitcastVT, Mask),
- DAG.getVectorIdxConstant(0, dl));
- }
-}
-
/// Return (and \p Op, \p Mask) for compare instructions or
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting or extending for \p Mask when lowering masking intrinsics
@@ -34479,10 +34480,18 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (auto *SrcC = dyn_cast<ConstantSDNode>(Src)) {
// SHL(1,Amt) --> SELECT(1<<(Amt/64), SPLAT(1<<(Amt%64)), 0)
+ // SHL(-1,Amt)
+ // --> SELECT(1<<(Amt/64), SPLAT(-1<<(Amt%64)), (i512)-1<<(Amt&~63))
// SRL(MSB,Amt) --> SELECT(MSB8>>u(Amt/64), SPLAT(MSB64>>u(Amt%64)), 0)
- if ((Opc == ISD::SHL && SrcC->getAPIntValue() == 1) ||
- (Opc == ISD::SRL && SrcC->getAPIntValue().isSignMask())) {
- APInt EltBitVal = APInt::getOneBitSet(64, Opc == ISD::SHL ? 0 : 63);
+ // SRL(-1,Amt)
+ // --> SELECT(MSB8>>u(Amt/64), SPLAT(-1>>u(Amt%64)), (i512)-1>>(Amt&~63))
+ const APInt &SrcVal = SrcC->getAPIntValue();
+ bool IsAllBits = SrcVal.isAllOnes();
+ if ((Opc == ISD::SHL && (IsAllBits || SrcVal == 1)) ||
+ (Opc == ISD::SRL && (IsAllBits || SrcVal.isSignMask()))) {
+ APInt EltBitVal =
+ IsAllBits ? APInt::getAllOnes(64)
+ : APInt::getOneBitSet(64, Opc == ISD::SHL ? 0 : 63);
APInt LaneBitVal =
APInt::getOneBitSet(64, Opc == ISD::SHL ? 0 : (NumElts - 1));
SDValue EltBit = DAG.getConstant(EltBitVal, dl, MVT::i64);
@@ -34490,13 +34499,27 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue AmtMod = DAG.getNode(ISD::AND, dl, MVT::i64,
DAG.getZExtOrTrunc(Amt, dl, MVT::i64),
DAG.getConstant(63, dl, MVT::i64));
+
+ SDValue Res = DAG.getConstant(0, dl, VecVT);
+ if (IsAllBits) {
+ APInt AllBitsVal = Opc == ISD::SHL
+ ? APInt::getAllOnes(64)
+ : APInt::getLowBitsSet(64, NumElts);
+ SDValue AllBitsMask =
+ DAG.getNode(Opc, dl, MVT::i64,
+ DAG.getConstant(AllBitsVal, dl, MVT::i64), AmtLane);
+ AllBitsMask = DAG.getBitcast(
+ BoolVT, DAG.getZExtOrTrunc(AllBitsMask, dl, MVT::i8));
+ Res = DAG.getSelect(dl, VecVT, AllBitsMask,
+ DAG.getAllOnesConstant(dl, VecVT), Res);
+ }
+
SDValue LaneMask = DAG.getNode(Opc, dl, MVT::i64, LaneBit, AmtLane);
LaneMask =
DAG.getBitcast(BoolVT, DAG.getZExtOrTrunc(LaneMask, dl, MVT::i8));
SDValue Elt = DAG.getNode(Opc, dl, MVT::i64, EltBit, AmtMod);
- SDValue Res =
- DAG.getSelect(dl, VecVT, LaneMask, DAG.getSplat(VecVT, dl, Elt),
- DAG.getConstant(0, dl, VecVT));
+ Res = DAG.getSelect(dl, VecVT, LaneMask, DAG.getSplat(VecVT, dl, Elt),
+ Res);
Results.push_back(DAG.getBitcast(VT, Res));
return;
}
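
A reduced two-lane (128-bit) reference check of the new SHL(-1, Amt) handling
above: lanes below Amt/64 become zero, lane Amt/64 holds -1 << (Amt % 64), and
every higher lane stays all-ones, which is what the added all-ones select
folded into Res produces. Sketch only; it uses unsigned __int128 as the
oracle, so it assumes a compiler that provides that type.

  #include <cassert>
  #include <cstdint>

  int main() {
    for (unsigned Amt = 0; Amt < 128; ++Amt) {
      const unsigned LaneIdx = Amt / 64, Rem = Amt % 64;

      // Decomposed form: per-lane select between 0, the splatted shifted
      // element, and all-ones, keyed off the lane index.
      uint64_t Lane[2];
      for (unsigned K = 0; K < 2; ++K)
        Lane[K] = K < LaneIdx    ? 0
                  : K == LaneIdx ? (~UINT64_C(0) << Rem)
                                 : ~UINT64_C(0);

      // Oracle: the same shift done on a genuine 128-bit all-ones value.
      unsigned __int128 Ref = ~static_cast<unsigned __int128>(0);
      Ref <<= Amt;
      assert(Lane[0] == static_cast<uint64_t>(Ref));
      assert(Lane[1] == static_cast<uint64_t>(Ref >> 64));
    }
    return 0;
  }
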
@@ -43407,6 +43430,15 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
return SDValue();
}
+ case X86ISD::COMPRESS: {
+ SDValue CmpVec = N.getOperand(0);
+ SDValue PassThru = N.getOperand(1);
+    // If CmpVec is a splat value and identical to PassThru, then all elements
+    // are already in place.
+ if (CmpVec == PassThru && DAG.isSplatValue(CmpVec, /*AllowUndefs=*/false))
+ return CmpVec;
+ return SDValue();
+ }
case X86ISD::EXPAND: {
SDValue ExpVec = N.getOperand(0);
SDValue PassThru = N.getOperand(1);
@@ -45823,6 +45855,14 @@ bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op,
case X86ISD::VBROADCAST_LOAD:
UndefElts = APInt::getZero(NumElts);
return true;
+ case X86ISD::VSHL:
+ case X86ISD::VSRA:
+ case X86ISD::VSRL:
+ case X86ISD::VSHLI:
+ case X86ISD::VSRAI:
+ case X86ISD::VSRLI:
+ return DAG.isSplatValue(Op.getOperand(0), DemandedElts, UndefElts,
+ Depth + 1);
}
return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts,
@@ -60053,6 +60093,33 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
}
break;
+ case X86ISD::VTRUNCS:
+ case X86ISD::VTRUNCUS:
+ if (!IsSplat && NumOps == 2 && VT.is512BitVector() &&
+ Subtarget.useBWIRegs()) {
+ MVT SrcVT = Ops[0].getOperand(0).getSimpleValueType();
+ if (SrcVT.is512BitVector() &&
+ SrcVT == Ops[1].getOperand(0).getSimpleValueType() &&
+ SrcVT.getScalarSizeInBits() <= 32 &&
+ (VT.getScalarSizeInBits() * 2 == SrcVT.getScalarSizeInBits())) {
+ using namespace SDPatternMatch;
+ SDValue N0 = Ops[0].getOperand(0), N1 = Ops[1].getOperand(0);
+ if (Opcode == X86ISD::VTRUNCS ||
+ (sd_match(N0, m_SMaxLike(m_Value(N0), m_Zero())) &&
+ sd_match(N1, m_SMaxLike(m_Value(N1), m_Zero())))) {
+ N0 = DAG.getBitcast(MVT::v8i64, N0);
+ N1 = DAG.getBitcast(MVT::v8i64, N1);
+ SDValue LHS = DAG.getVectorShuffle(MVT::v8i64, DL, N0, N1,
+ {0, 1, 4, 5, 8, 9, 12, 13});
+ SDValue RHS = DAG.getVectorShuffle(MVT::v8i64, DL, N0, N1,
+ {2, 3, 6, 7, 10, 11, 14, 15});
+ return DAG.getNode(
+ Opcode == X86ISD::VTRUNCS ? X86ISD::PACKSS : X86ISD::PACKUS, DL,
+ VT, DAG.getBitcast(SrcVT, LHS), DAG.getBitcast(SrcVT, RHS));
+ }
+ }
+ }
+ break;
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index aafc5cb3c7ee5..400cdcb9251de 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -6145,11 +6145,17 @@ let Predicates = [HasAVX512, NoVLX] in {
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
timm:$src2)), sub_ymm)>;
+ def : Pat<(v4i64 (X86vsrai (v4i64 (X86VBroadcastld64 addr:$src1)), (i8 timm:$src2))),
+ (EXTRACT_SUBREG (v8i64 (VPSRAQZmbi addr:$src1, timm:$src2)), sub_ymm)>;
+
def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 timm:$src2))),
(EXTRACT_SUBREG (v8i64
(VPSRAQZri
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
timm:$src2)), sub_xmm)>;
+
+ def : Pat<(v2i64 (X86vsrai (v2i64 (X86VBroadcastld64 addr:$src1)), (i8 timm:$src2))),
+ (EXTRACT_SUBREG (v8i64 (VPSRAQZmbi addr:$src1, timm:$src2)), sub_xmm)>;
}
//===-------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td
index 031fdc1e7162c..5fe2f04c30b14 100644
--- a/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -429,20 +429,24 @@ let isConvertibleToThreeAddress = 1 in {
def INC8r : IncOpR_RF<Xi8>;
def INC16r : IncOpR_RF<Xi16>, OpSize16;
def INC32r : IncOpR_RF<Xi32>, OpSize32;
- def INC64r : IncOpR_RF<Xi64>;
def DEC8r : DecOpR_RF<Xi8>;
def DEC16r : DecOpR_RF<Xi16>, OpSize16;
def DEC32r : DecOpR_RF<Xi32>, OpSize32;
+ }
+ let Predicates = [NoNDDI] in {
+ def INC64r : IncOpR_RF<Xi64>;
def DEC64r : DecOpR_RF<Xi64>;
}
let Predicates = [HasNDD, In64BitMode] in {
def INC8r_ND : IncOpR_RF<Xi8, 1>;
def INC16r_ND : IncOpR_RF<Xi16, 1>, PD;
def INC32r_ND : IncOpR_RF<Xi32, 1>;
- def INC64r_ND : IncOpR_RF<Xi64, 1>;
def DEC8r_ND : DecOpR_RF<Xi8, 1>;
def DEC16r_ND : DecOpR_RF<Xi16, 1>, PD;
def DEC32r_ND : DecOpR_RF<Xi32, 1>;
+ }
+ let Predicates = [HasNDDI, In64BitMode] in {
+ def INC64r_ND : IncOpR_RF<Xi64, 1>;
def DEC64r_ND : DecOpR_RF<Xi64, 1>;
}
let Predicates = [In64BitMode], Pattern = [(null_frag)] in {
@@ -635,7 +639,8 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
string mnemonic, Format RegMRM, Format MemMRM,
SDNode opnodeflag, SDNode opnode,
bit CommutableRR, bit ConvertibleToThreeAddress,
- bit ConvertibleToThreeAddressRR> {
+ bit ConvertibleToThreeAddressRR,
+ Predicate prd = HasNDD> {
let isCommutable = CommutableRR,
isConvertibleToThreeAddress = ConvertibleToThreeAddressRR in {
let Predicates = [NoNDD] in {
@@ -722,26 +727,30 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
// first so that they are slightly preferred to the ri forms.
def 16ri8 : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM>, OpSize16;
def 32ri8 : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM>, OpSize32;
- def 64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>;
def 8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
def 16ri : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM>, OpSize16;
def 32ri : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM>, OpSize32;
+ }
+ let Predicates = [NoNDDI] in {
+ def 64ri8 : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM>;
def 64ri32: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM>;
}
let Predicates = [HasNDD, In64BitMode] in {
def 16ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi16, RegMRM, 1>, PD;
def 32ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi32, RegMRM, 1>;
- def 64ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM, 1>;
def 8ri_ND : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM, 1>;
def 16ri_ND : BinOpRI_RF<0x81, mnemonic, Xi16, opnodeflag, RegMRM, 1>, PD;
def 32ri_ND : BinOpRI_RF<0x81, mnemonic, Xi32, opnodeflag, RegMRM, 1>;
- def 64ri32_ND: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM, 1>;
def 16ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD;
def 32ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi32, RegMRM, 1>, EVEX_NF;
- def 64ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM, 1>, EVEX_NF;
def 8ri_NF_ND : BinOpRI_R<0x80, mnemonic, Xi8, RegMRM, 1>, EVEX_NF;
def 16ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi16, RegMRM, 1>, EVEX_NF, PD;
def 32ri_NF_ND : BinOpRI_R<0x81, mnemonic, Xi32, RegMRM, 1>, EVEX_NF;
+ }
+ let Predicates = [prd, In64BitMode] in {
+ def 64ri8_ND : BinOpRI8_RF<0x83, mnemonic, Xi64, RegMRM, 1>;
+ def 64ri32_ND: BinOpRI_RF<0x81, mnemonic, Xi64, opnodeflag, RegMRM, 1>;
+ def 64ri8_NF_ND : BinOpRI8_R<0x83, mnemonic, Xi64, RegMRM, 1>, EVEX_NF;
def 64ri32_NF_ND : BinOpRI_R<0x81, mnemonic, Xi64, RegMRM, 1>, EVEX_NF;
}
let Predicates = [In64BitMode] in {
@@ -1094,10 +1103,10 @@ defm OR : ArithBinOp_RF<0x09, 0x0B, 0x0D, "or", MRM1r, MRM1m,
defm XOR : ArithBinOp_RF<0x31, 0x33, 0x35, "xor", MRM6r, MRM6m,
X86xor_flag, xor, 1, 0, 0>;
defm ADD : ArithBinOp_RF<0x01, 0x03, 0x05, "add", MRM0r, MRM0m,
- X86add_flag, add, 1, 1, 1>;
+ X86add_flag, add, 1, 1, 1, HasNDDI>;
let isCompare = 1 in {
defm SUB : ArithBinOp_RF<0x29, 0x2B, 0x2D, "sub", MRM5r, MRM5m,
- X86sub_flag, sub, 0, 1, 0>;
+ X86sub_flag, sub, 0, 1, 0, HasNDDI>;
}
// Version of XOR8rr_NOREX that use GR8_NOREX. This is used by the handling of
@@ -1132,14 +1141,14 @@ let Predicates = [NoNDD] in {
(ADC64rm GR64:$src1, addr:$src2)>;
}
let Predicates = [HasNDD] in {
- def : Pat<(X86adc_flag (loadi8 addr:$src2), GR8:$src1, EFLAGS),
- (ADC8rm_ND GR8:$src1, addr:$src2)>;
- def : Pat<(X86adc_flag (loadi16 addr:$src2), GR16:$src1, EFLAGS),
- (ADC16rm_ND GR16:$src1, addr:$src2)>;
- def : Pat<(X86adc_flag (loadi32 addr:$src2), GR32:$src1, EFLAGS),
- (ADC32rm_ND GR32:$src1, addr:$src2)>;
- def : Pat<(X86adc_flag (loadi64 addr:$src2), GR64:$src1, EFLAGS),
- (ADC64rm_ND GR64:$src1, addr:$src2)>;
+ def : Pat<(X86adc_flag (loadi8 ndd_addr:$src2), GR8:$src1, EFLAGS),
+ (ADC8rm_ND GR8:$src1, ndd_addr:$src2)>;
+ def : Pat<(X86adc_flag (loadi16 ndd_addr:$src2), GR16:$src1, EFLAGS),
+ (ADC16rm_ND GR16:$src1, ndd_addr:$src2)>;
+ def : Pat<(X86adc_flag (loadi32 ndd_addr:$src2), GR32:$src1, EFLAGS),
+ (ADC32rm_ND GR32:$src1, ndd_addr:$src2)>;
+ def : Pat<(X86adc_flag (loadi64 ndd_addr:$src2), GR64:$src1, EFLAGS),
+ (ADC64rm_ND GR64:$src1, ndd_addr:$src2)>;
}
// Patterns to recognize RMW ADC with loads in operand 1.
@@ -1157,7 +1166,8 @@ def : Pat<(store (X86adc_flag GR64:$src, (loadi64 addr:$dst), EFLAGS),
(ADC64mr addr:$dst, GR64:$src)>;
// Patterns for basic arithmetic ops with relocImm for the immediate field.
-multiclass ArithBinOp_RF_relocImm_Pats<SDNode OpNodeFlag, SDNode OpNode> {
+multiclass ArithBinOp_RF_relocImm_Pats<SDNode OpNodeFlag, SDNode OpNode,
+ Predicate prd=HasNDD> {
let Predicates = [NoNDD] in {
def : Pat<(OpNodeFlag GR8:$src1, relocImm8_su:$src2),
(!cast<Instruction>(NAME#"8ri") GR8:$src1, relocImm8_su:$src2)>;
@@ -1165,8 +1175,6 @@ multiclass ArithBinOp_RF_relocImm_Pats<SDNode OpNodeFlag, SDNode OpNode> {
(!cast<Instruction>(NAME#"16ri") GR16:$src1, relocImm16_su:$src2)>;
def : Pat<(OpNodeFlag GR32:$src1, relocImm32_su:$src2),
(!cast<Instruction>(NAME#"32ri") GR32:$src1, relocImm32_su:$src2)>;
- def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2),
- (!cast<Instruction>(NAME#"64ri32") GR64:$src1, i64relocImmSExt32_su:$src2)>;
def : Pat<(store (OpNode (load addr:$dst), relocImm8_su:$src), addr:$dst),
(!cast<Instruction>(NAME#"8mi") addr:$dst, relocImm8_su:$src)>;
@@ -1184,18 +1192,24 @@ multiclass ArithBinOp_RF_relocImm_Pats<SDNode OpNodeFlag, SDNode OpNode> {
(!cast<Instruction>(NAME#"16ri_ND") GR16:$src1, relocImm16_su:$src2)>;
def : Pat<(OpNodeFlag GR32:$src1, relocImm32_su:$src2),
(!cast<Instruction>(NAME#"32ri_ND") GR32:$src1, relocImm32_su:$src2)>;
- def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2),
- (!cast<Instruction>(NAME#"64ri32_ND") GR64:$src1, i64relocImmSExt32_su:$src2)>;
- def : Pat<(OpNode (load addr:$dst), relocImm8_su:$src),
- (!cast<Instruction>(NAME#"8mi_ND") addr:$dst, relocImm8_su:$src)>;
- def : Pat<(OpNode (load addr:$dst), relocImm16_su:$src),
- (!cast<Instruction>(NAME#"16mi_ND") addr:$dst, relocImm16_su:$src)>;
- def : Pat<(OpNode (load addr:$dst), relocImm32_su:$src),
- (!cast<Instruction>(NAME#"32mi_ND") addr:$dst, relocImm32_su:$src)>;
- def : Pat<(OpNode (load addr:$dst), i64relocImmSExt32_su:$src),
- (!cast<Instruction>(NAME#"64mi32_ND") addr:$dst, i64relocImmSExt32_su:$src)>;
+ def : Pat<(OpNode (load ndd_addr:$dst), relocImm8_su:$src),
+ (!cast<Instruction>(NAME#"8mi_ND") ndd_addr:$dst, relocImm8_su:$src)>;
+ def : Pat<(OpNode (load ndd_addr:$dst), relocImm16_su:$src),
+ (!cast<Instruction>(NAME#"16mi_ND") ndd_addr:$dst, relocImm16_su:$src)>;
+ def : Pat<(OpNode (load ndd_addr:$dst), relocImm32_su:$src),
+ (!cast<Instruction>(NAME#"32mi_ND") ndd_addr:$dst, relocImm32_su:$src)>;
+ def : Pat<(OpNode (load ndd_addr:$dst), i64relocImmSExt32_su:$src),
+ (!cast<Instruction>(NAME#"64mi32_ND") ndd_addr:$dst, i64relocImmSExt32_su:$src)>;
}
+
+ let Predicates = [NoNDDI] in
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2),
+ (!cast<Instruction>(NAME#"64ri32") GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
+ let Predicates = [prd] in
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2),
+ (!cast<Instruction>(NAME#"64ri32_ND") GR64:$src1, i64relocImmSExt32_su:$src2)>;
}
multiclass ArithBinOp_RFF_relocImm_Pats<SDNode OpNodeFlag> {
@@ -1228,14 +1242,14 @@ multiclass ArithBinOp_RFF_relocImm_Pats<SDNode OpNodeFlag> {
def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2, EFLAGS),
(!cast<Instruction>(NAME#"64ri32_ND") GR64:$src1, i64relocImmSExt32_su:$src2)>;
- def : Pat<(OpNodeFlag (load addr:$dst), relocImm8_su:$src, EFLAGS),
- (!cast<Instruction>(NAME#"8mi_ND") addr:$dst, relocImm8_su:$src)>;
- def : Pat<(OpNodeFlag (load addr:$dst), relocImm16_su:$src, EFLAGS),
- (!cast<Instruction>(NAME#"16mi_ND") addr:$dst, relocImm16_su:$src)>;
- def : Pat<(OpNodeFlag (load addr:$dst), relocImm32_su:$src, EFLAGS),
- (!cast<Instruction>(NAME#"32mi_ND") addr:$dst, relocImm32_su:$src)>;
- def : Pat<(OpNodeFlag (load addr:$dst), i64relocImmSExt32_su:$src, EFLAGS),
- (!cast<Instruction>(NAME#"64mi32_ND") addr:$dst, i64relocImmSExt32_su:$src)>;
+ def : Pat<(OpNodeFlag (load ndd_addr:$dst), relocImm8_su:$src, EFLAGS),
+ (!cast<Instruction>(NAME#"8mi_ND") ndd_addr:$dst, relocImm8_su:$src)>;
+ def : Pat<(OpNodeFlag (load ndd_addr:$dst), relocImm16_su:$src, EFLAGS),
+ (!cast<Instruction>(NAME#"16mi_ND") ndd_addr:$dst, relocImm16_su:$src)>;
+ def : Pat<(OpNodeFlag (load ndd_addr:$dst), relocImm32_su:$src, EFLAGS),
+ (!cast<Instruction>(NAME#"32mi_ND") ndd_addr:$dst, relocImm32_su:$src)>;
+ def : Pat<(OpNodeFlag (load ndd_addr:$dst), i64relocImmSExt32_su:$src, EFLAGS),
+ (!cast<Instruction>(NAME#"64mi32_ND") ndd_addr:$dst, i64relocImmSExt32_su:$src)>;
}
}
@@ -1262,8 +1276,8 @@ multiclass ArithBinOp_F_relocImm_Pats<SDNode OpNodeFlag> {
defm AND : ArithBinOp_RF_relocImm_Pats<X86and_flag, and>;
defm OR : ArithBinOp_RF_relocImm_Pats<X86or_flag, or>;
defm XOR : ArithBinOp_RF_relocImm_Pats<X86xor_flag, xor>;
-defm ADD : ArithBinOp_RF_relocImm_Pats<X86add_flag, add>;
-defm SUB : ArithBinOp_RF_relocImm_Pats<X86sub_flag, sub>;
+defm ADD : ArithBinOp_RF_relocImm_Pats<X86add_flag, add, HasNDDI>;
+defm SUB : ArithBinOp_RF_relocImm_Pats<X86sub_flag, sub, HasNDDI>;
defm ADC : ArithBinOp_RFF_relocImm_Pats<X86adc_flag>;
defm SBB : ArithBinOp_RFF_relocImm_Pats<X86sbb_flag>;
diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index 77a9c7a1f585f..39663049b92bf 100644
--- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -25,7 +25,7 @@ let SchedRW = [WriteCMOV.Folded, WriteCMOV.ReadAfterFold] in
(ins t.RegClass:$src1, t.MemOperand:$src2, ccode:$cond),
"cmov${cond}", args,
[(set t.RegClass:$dst, (X86cmov t.RegClass:$src1,
- (t.LoadNode addr:$src2), timm:$cond, EFLAGS))]>, UseEFLAGS, NDD<ndd>;
+ (t.LoadNode !if(ndd, ndd_addr, addr):$src2), timm:$cond, EFLAGS))]>, UseEFLAGS, NDD<ndd>;
}
multiclass Cfcmov<X86TypeInfo t> {
@@ -99,12 +99,12 @@ let Predicates = [HasCMOV, NoNDD] in {
}
let Predicates = [HasCMOV, HasNDD] in {
- def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, timm:$cond, EFLAGS),
- (CMOV16rm_ND GR16:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
- def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, timm:$cond, EFLAGS),
- (CMOV32rm_ND GR32:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
- def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, timm:$cond, EFLAGS),
- (CMOV64rm_ND GR64:$src2, addr:$src1, (inv_cond_XFORM timm:$cond))>;
+ def : Pat<(X86cmov (loadi16 ndd_addr:$src1), GR16:$src2, timm:$cond, EFLAGS),
+ (CMOV16rm_ND GR16:$src2, ndd_addr:$src1, (inv_cond_XFORM timm:$cond))>;
+ def : Pat<(X86cmov (loadi32 ndd_addr:$src1), GR32:$src2, timm:$cond, EFLAGS),
+ (CMOV32rm_ND GR32:$src2, ndd_addr:$src1, (inv_cond_XFORM timm:$cond))>;
+ def : Pat<(X86cmov (loadi64 ndd_addr:$src1), GR64:$src2, timm:$cond, EFLAGS),
+ (CMOV64rm_ND GR64:$src2, ndd_addr:$src1, (inv_cond_XFORM timm:$cond))>;
}
let Predicates = [HasCMOV, HasCF] in {
def : Pat<(X86cmov GR16:$src1, 0, timm:$cond, EFLAGS),
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 65188c0295f8c..bc05dae7351bb 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -1426,6 +1426,14 @@ def : Pat<(X86tcret (i64 texternalsym:$dst), timm:$off),
(TCRETURNdi64 texternalsym:$dst, timm:$off)>,
Requires<[IsLP64]>;
+def : Pat<(X86tcret (i64 (X86Wrapper tglobaladdr:$dst)), timm:$off),
+ (TCRETURNdi64 tglobaladdr:$dst, timm:$off)>,
+ Requires<[IsNotPIC, HasJMPABS]>;
+
+def : Pat<(X86tcret (i64 (X86Wrapper texternalsym:$dst)), timm:$off),
+ (TCRETURNdi64 texternalsym:$dst, timm:$off)>,
+ Requires<[IsNotPIC, HasJMPABS]>;
+
// Normal calls, with various flavors of addresses.
def : Pat<(X86call (i32 tglobaladdr:$dst)),
(CALLpcrel32 tglobaladdr:$dst)>;
@@ -1587,13 +1595,15 @@ let Predicates = [NoNDD] in {
(SUB16ri GR16:$src1, -128)>;
def : Pat<(add GR32:$src1, 128),
(SUB32ri GR32:$src1, -128)>;
- def : Pat<(add GR64:$src1, 128),
- (SUB64ri32 GR64:$src1, -128)>;
def : Pat<(X86add_flag_nocf GR16:$src1, 128),
(SUB16ri GR16:$src1, -128)>;
def : Pat<(X86add_flag_nocf GR32:$src1, 128),
(SUB32ri GR32:$src1, -128)>;
+}
+let Predicates = [NoNDDI] in {
+ def : Pat<(add GR64:$src1, 128),
+ (SUB64ri32 GR64:$src1, -128)>;
def : Pat<(X86add_flag_nocf GR64:$src1, 128),
(SUB64ri32 GR64:$src1, -128)>;
}
@@ -1602,13 +1612,15 @@ let Predicates = [HasNDD] in {
(SUB16ri_ND GR16:$src1, -128)>;
def : Pat<(add GR32:$src1, 128),
(SUB32ri_ND GR32:$src1, -128)>;
- def : Pat<(add GR64:$src1, 128),
- (SUB64ri32_ND GR64:$src1, -128)>;
def : Pat<(X86add_flag_nocf GR16:$src1, 128),
(SUB16ri_ND GR16:$src1, -128)>;
def : Pat<(X86add_flag_nocf GR32:$src1, 128),
(SUB32ri_ND GR32:$src1, -128)>;
+}
+let Predicates = [HasNDDI] in {
+ def : Pat<(add GR64:$src1, 128),
+ (SUB64ri32_ND GR64:$src1, -128)>;
def : Pat<(X86add_flag_nocf GR64:$src1, 128),
(SUB64ri32_ND GR64:$src1, -128)>;
}
@@ -1619,12 +1631,12 @@ def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
(SUB64mi32 addr:$dst, -128)>;
let Predicates = [HasNDD] in {
- def : Pat<(add (loadi16 addr:$src), 128),
- (SUB16mi_ND addr:$src, -128)>;
- def : Pat<(add (loadi32 addr:$src), 128),
- (SUB32mi_ND addr:$src, -128)>;
- def : Pat<(add (loadi64 addr:$src), 128),
- (SUB64mi32_ND addr:$src, -128)>;
+ def : Pat<(add (loadi16 ndd_addr:$src), 128),
+ (SUB16mi_ND ndd_addr:$src, -128)>;
+ def : Pat<(add (loadi32 ndd_addr:$src), 128),
+ (SUB32mi_ND ndd_addr:$src, -128)>;
+ def : Pat<(add (loadi64 ndd_addr:$src), 128),
+ (SUB64mi32_ND ndd_addr:$src, -128)>;
}
// The same trick applies for 32-bit immediate fields in 64-bit
@@ -1635,7 +1647,7 @@ let Predicates = [NoNDD] in {
def : Pat<(X86add_flag_nocf GR64:$src1, 0x0000000080000000),
(SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
}
-let Predicates = [HasNDD] in {
+let Predicates = [HasNDDI] in {
def : Pat<(add GR64:$src1, 0x0000000080000000),
(SUB64ri32_ND GR64:$src1, 0xffffffff80000000)>;
def : Pat<(X86add_flag_nocf GR64:$src1, 0x0000000080000000),
@@ -1644,8 +1656,8 @@ let Predicates = [HasNDD] in {
def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
(SUB64mi32 addr:$dst, 0xffffffff80000000)>;
let Predicates = [HasNDD] in {
- def : Pat<(add(loadi64 addr:$src), 0x0000000080000000),
- (SUB64mi32_ND addr:$src, 0xffffffff80000000)>;
+ def : Pat<(add(loadi64 ndd_addr:$src), 0x0000000080000000),
+ (SUB64mi32_ND ndd_addr:$src, 0xffffffff80000000)>;
}
// Depositing value to 8/16 bit subreg:
@@ -1937,14 +1949,14 @@ multiclass MaskedShiftAmountPats<SDNode frag> {
(!cast<Instruction>(NAME # "64mCL") addr:$dst)>;
let Predicates = [HasNDD] in {
- def : Pat<(frag (loadi8 addr:$src), (shiftMask32 CL)),
- (!cast<Instruction>(NAME # "8mCL_ND") addr:$src)>;
- def : Pat<(frag (loadi16 addr:$src), (shiftMask32 CL)),
- (!cast<Instruction>(NAME # "16mCL_ND") addr:$src)>;
- def : Pat<(frag (loadi32 addr:$src), (shiftMask32 CL)),
- (!cast<Instruction>(NAME # "32mCL_ND") addr:$src)>;
- def : Pat<(frag (loadi64 addr:$src), (shiftMask64 CL)),
- (!cast<Instruction>(NAME # "64mCL_ND") addr:$src)>;
+ def : Pat<(frag (loadi8 ndd_addr:$src), (shiftMask32 CL)),
+ (!cast<Instruction>(NAME # "8mCL_ND") ndd_addr:$src)>;
+ def : Pat<(frag (loadi16 ndd_addr:$src), (shiftMask32 CL)),
+ (!cast<Instruction>(NAME # "16mCL_ND") ndd_addr:$src)>;
+ def : Pat<(frag (loadi32 ndd_addr:$src), (shiftMask32 CL)),
+ (!cast<Instruction>(NAME # "32mCL_ND") ndd_addr:$src)>;
+ def : Pat<(frag (loadi64 ndd_addr:$src), (shiftMask64 CL)),
+ (!cast<Instruction>(NAME # "64mCL_ND") ndd_addr:$src)>;
}
}
@@ -1991,14 +2003,14 @@ multiclass MaskedRotateAmountPats<SDNode frag> {
(!cast<Instruction>(NAME # "64mCL") addr:$dst)>;
let Predicates = [HasNDD] in {
- def : Pat<(frag (loadi8 addr:$src), (shiftMask8 CL)),
- (!cast<Instruction>(NAME # "8mCL_ND") addr:$src)>;
- def : Pat<(frag (loadi16 addr:$src), (shiftMask16 CL)),
- (!cast<Instruction>(NAME # "16mCL_ND") addr:$src)>;
- def : Pat<(frag (loadi32 addr:$src), (shiftMask32 CL)),
- (!cast<Instruction>(NAME # "32mCL_ND") addr:$src)>;
- def : Pat<(frag (loadi64 addr:$src), (shiftMask64 CL)),
- (!cast<Instruction>(NAME # "64mCL_ND") addr:$src)>;
+ def : Pat<(frag (loadi8 ndd_addr:$src), (shiftMask8 CL)),
+ (!cast<Instruction>(NAME # "8mCL_ND") ndd_addr:$src)>;
+ def : Pat<(frag (loadi16 ndd_addr:$src), (shiftMask16 CL)),
+ (!cast<Instruction>(NAME # "16mCL_ND") ndd_addr:$src)>;
+ def : Pat<(frag (loadi32 ndd_addr:$src), (shiftMask32 CL)),
+ (!cast<Instruction>(NAME # "32mCL_ND") ndd_addr:$src)>;
+ def : Pat<(frag (loadi64 ndd_addr:$src), (shiftMask64 CL)),
+ (!cast<Instruction>(NAME # "64mCL_ND") ndd_addr:$src)>;
}
}
@@ -2064,8 +2076,8 @@ defm : OneBitPats<GR64, i64, BTR64rr, BTS64rr, BTC64rr, shiftMask64>;
// EFLAGS-defining Patterns
//===----------------------------------------------------------------------===//
-multiclass EFLAGSDefiningPats<string suffix, Predicate p> {
- let Predicates = [p] in {
+multiclass EFLAGSDefiningPats<string suffix, Predicate PredNDD, Predicate PredNDDI, bit ndd> {
+ let Predicates = [PredNDD] in {
// add reg, reg
def : Pat<(add GR8 :$src1, GR8 :$src2), (!cast<Instruction>(ADD8rr#suffix) GR8 :$src1, GR8 :$src2)>;
def : Pat<(add GR16:$src1, GR16:$src2), (!cast<Instruction>(ADD16rr#suffix) GR16:$src1, GR16:$src2)>;
@@ -2073,20 +2085,19 @@ multiclass EFLAGSDefiningPats<string suffix, Predicate p> {
def : Pat<(add GR64:$src1, GR64:$src2), (!cast<Instruction>(ADD64rr#suffix) GR64:$src1, GR64:$src2)>;
// add reg, mem
- def : Pat<(add GR8:$src1, (loadi8 addr:$src2)),
- (!cast<Instruction>(ADD8rm#suffix) GR8:$src1, addr:$src2)>;
- def : Pat<(add GR16:$src1, (loadi16 addr:$src2)),
- (!cast<Instruction>(ADD16rm#suffix) GR16:$src1, addr:$src2)>;
- def : Pat<(add GR32:$src1, (loadi32 addr:$src2)),
- (!cast<Instruction>(ADD32rm#suffix) GR32:$src1, addr:$src2)>;
- def : Pat<(add GR64:$src1, (loadi64 addr:$src2)),
- (!cast<Instruction>(ADD64rm#suffix) GR64:$src1, addr:$src2)>;
+ def : Pat<(add GR8:$src1, (loadi8 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(ADD8rm#suffix) GR8:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(add GR16:$src1, (loadi16 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(ADD16rm#suffix) GR16:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(add GR32:$src1, (loadi32 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(ADD32rm#suffix) GR32:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(add GR64:$src1, (loadi64 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(ADD64rm#suffix) GR64:$src1, !if(ndd, ndd_addr, addr):$src2)>;
// add reg, imm
def : Pat<(add GR8 :$src1, imm:$src2), (!cast<Instruction>(ADD8ri#suffix) GR8:$src1 , imm:$src2)>;
def : Pat<(add GR16:$src1, imm:$src2), (!cast<Instruction>(ADD16ri#suffix) GR16:$src1, imm:$src2)>;
def : Pat<(add GR32:$src1, imm:$src2), (!cast<Instruction>(ADD32ri#suffix) GR32:$src1, imm:$src2)>;
- def : Pat<(add GR64:$src1, i64immSExt32:$src2), (!cast<Instruction>(ADD64ri32#suffix) GR64:$src1, i64immSExt32:$src2)>;
// sub reg, reg
def : Pat<(sub GR8 :$src1, GR8 :$src2), (!cast<Instruction>(SUB8rr#suffix) GR8 :$src1, GR8 :$src2)>;
@@ -2095,14 +2106,14 @@ multiclass EFLAGSDefiningPats<string suffix, Predicate p> {
def : Pat<(sub GR64:$src1, GR64:$src2), (!cast<Instruction>(SUB64rr#suffix) GR64:$src1, GR64:$src2)>;
// sub reg, mem
- def : Pat<(sub GR8:$src1, (loadi8 addr:$src2)),
- (!cast<Instruction>(SUB8rm#suffix) GR8:$src1, addr:$src2)>;
- def : Pat<(sub GR16:$src1, (loadi16 addr:$src2)),
- (!cast<Instruction>(SUB16rm#suffix) GR16:$src1, addr:$src2)>;
- def : Pat<(sub GR32:$src1, (loadi32 addr:$src2)),
- (!cast<Instruction>(SUB32rm#suffix) GR32:$src1, addr:$src2)>;
- def : Pat<(sub GR64:$src1, (loadi64 addr:$src2)),
- (!cast<Instruction>(SUB64rm#suffix) GR64:$src1, addr:$src2)>;
+ def : Pat<(sub GR8:$src1, (loadi8 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(SUB8rm#suffix) GR8:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(sub GR16:$src1, (loadi16 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(SUB16rm#suffix) GR16:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(sub GR32:$src1, (loadi32 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(SUB32rm#suffix) GR32:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(sub GR64:$src1, (loadi64 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(SUB64rm#suffix) GR64:$src1, !if(ndd, ndd_addr, addr):$src2)>;
// sub reg, imm
def : Pat<(sub GR8:$src1, imm:$src2),
@@ -2111,8 +2122,6 @@ multiclass EFLAGSDefiningPats<string suffix, Predicate p> {
(!cast<Instruction>(SUB16ri#suffix) GR16:$src1, imm:$src2)>;
def : Pat<(sub GR32:$src1, imm:$src2),
(!cast<Instruction>(SUB32ri#suffix) GR32:$src1, imm:$src2)>;
- def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
- (!cast<Instruction>(SUB64ri32#suffix) GR64:$src1, i64immSExt32:$src2)>;
// sub 0, reg
def : Pat<(X86sub_flag 0, GR8 :$src), (!cast<Instruction>(NEG8r#suffix) GR8 :$src)>;
@@ -2129,12 +2138,12 @@ multiclass EFLAGSDefiningPats<string suffix, Predicate p> {
(!cast<Instruction>(IMUL64rr#suffix) GR64:$src1, GR64:$src2)>;
// mul reg, mem
- def : Pat<(mul GR16:$src1, (loadi16 addr:$src2)),
- (!cast<Instruction>(IMUL16rm#suffix) GR16:$src1, addr:$src2)>;
- def : Pat<(mul GR32:$src1, (loadi32 addr:$src2)),
- (!cast<Instruction>(IMUL32rm#suffix) GR32:$src1, addr:$src2)>;
- def : Pat<(mul GR64:$src1, (loadi64 addr:$src2)),
- (!cast<Instruction>(IMUL64rm#suffix) GR64:$src1, addr:$src2)>;
+ def : Pat<(mul GR16:$src1, (loadi16 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(IMUL16rm#suffix) GR16:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(mul GR32:$src1, (loadi32 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(IMUL32rm#suffix) GR32:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(mul GR64:$src1, (loadi64 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(IMUL64rm#suffix) GR64:$src1, !if(ndd, ndd_addr, addr):$src2)>;
// or reg/reg.
def : Pat<(or GR8 :$src1, GR8 :$src2), (!cast<Instruction>(OR8rr#suffix) GR8 :$src1, GR8 :$src2)>;
@@ -2143,14 +2152,14 @@ multiclass EFLAGSDefiningPats<string suffix, Predicate p> {
def : Pat<(or GR64:$src1, GR64:$src2), (!cast<Instruction>(OR64rr#suffix) GR64:$src1, GR64:$src2)>;
// or reg/mem
- def : Pat<(or GR8:$src1, (loadi8 addr:$src2)),
- (!cast<Instruction>(OR8rm#suffix) GR8:$src1, addr:$src2)>;
- def : Pat<(or GR16:$src1, (loadi16 addr:$src2)),
- (!cast<Instruction>(OR16rm#suffix) GR16:$src1, addr:$src2)>;
- def : Pat<(or GR32:$src1, (loadi32 addr:$src2)),
- (!cast<Instruction>(OR32rm#suffix) GR32:$src1, addr:$src2)>;
- def : Pat<(or GR64:$src1, (loadi64 addr:$src2)),
- (!cast<Instruction>(OR64rm#suffix) GR64:$src1, addr:$src2)>;
+ def : Pat<(or GR8:$src1, (loadi8 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(OR8rm#suffix) GR8:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(or GR16:$src1, (loadi16 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(OR16rm#suffix) GR16:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(or GR32:$src1, (loadi32 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(OR32rm#suffix) GR32:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(or GR64:$src1, (loadi64 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(OR64rm#suffix) GR64:$src1, !if(ndd, ndd_addr, addr):$src2)>;
// or reg/imm
def : Pat<(or GR8:$src1 , imm:$src2), (!cast<Instruction>(OR8ri#suffix) GR8 :$src1, imm:$src2)>;
@@ -2166,14 +2175,14 @@ multiclass EFLAGSDefiningPats<string suffix, Predicate p> {
def : Pat<(xor GR64:$src1, GR64:$src2), (!cast<Instruction>(XOR64rr#suffix) GR64:$src1, GR64:$src2)>;
// xor reg/mem
- def : Pat<(xor GR8:$src1, (loadi8 addr:$src2)),
- (!cast<Instruction>(XOR8rm#suffix) GR8:$src1, addr:$src2)>;
- def : Pat<(xor GR16:$src1, (loadi16 addr:$src2)),
- (!cast<Instruction>(XOR16rm#suffix) GR16:$src1, addr:$src2)>;
- def : Pat<(xor GR32:$src1, (loadi32 addr:$src2)),
- (!cast<Instruction>(XOR32rm#suffix) GR32:$src1, addr:$src2)>;
- def : Pat<(xor GR64:$src1, (loadi64 addr:$src2)),
- (!cast<Instruction>(XOR64rm#suffix) GR64:$src1, addr:$src2)>;
+ def : Pat<(xor GR8:$src1, (loadi8 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(XOR8rm#suffix) GR8:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(xor GR16:$src1, (loadi16 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(XOR16rm#suffix) GR16:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(xor GR32:$src1, (loadi32 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(XOR32rm#suffix) GR32:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(xor GR64:$src1, (loadi64 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(XOR64rm#suffix) GR64:$src1, !if(ndd, ndd_addr, addr):$src2)>;
// xor reg/imm
def : Pat<(xor GR8:$src1, imm:$src2),
@@ -2192,14 +2201,14 @@ multiclass EFLAGSDefiningPats<string suffix, Predicate p> {
def : Pat<(and GR64:$src1, GR64:$src2), (!cast<Instruction>(AND64rr#suffix) GR64:$src1, GR64:$src2)>;
// and reg/mem
- def : Pat<(and GR8:$src1, (loadi8 addr:$src2)),
- (!cast<Instruction>(AND8rm#suffix) GR8:$src1, addr:$src2)>;
- def : Pat<(and GR16:$src1, (loadi16 addr:$src2)),
- (!cast<Instruction>(AND16rm#suffix) GR16:$src1, addr:$src2)>;
- def : Pat<(and GR32:$src1, (loadi32 addr:$src2)),
- (!cast<Instruction>(AND32rm#suffix) GR32:$src1, addr:$src2)>;
- def : Pat<(and GR64:$src1, (loadi64 addr:$src2)),
- (!cast<Instruction>(AND64rm#suffix) GR64:$src1, addr:$src2)>;
+ def : Pat<(and GR8:$src1, (loadi8 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(AND8rm#suffix) GR8:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(and GR16:$src1, (loadi16 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(AND16rm#suffix) GR16:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(and GR32:$src1, (loadi32 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(AND32rm#suffix) GR32:$src1, !if(ndd, ndd_addr, addr):$src2)>;
+ def : Pat<(and GR64:$src1, (loadi64 !if(ndd, ndd_addr, addr):$src2)),
+ (!cast<Instruction>(AND64rm#suffix) GR64:$src1, !if(ndd, ndd_addr, addr):$src2)>;
// and reg/imm
def : Pat<(and GR8:$src1, imm:$src2),
@@ -2212,36 +2221,45 @@ multiclass EFLAGSDefiningPats<string suffix, Predicate p> {
(!cast<Instruction>(AND64ri32#suffix) GR64:$src1, i64immSExt32:$src2)>;
}
+ let Predicates = [PredNDDI] in {
+ def : Pat<(add GR64:$src1, i64immSExt32:$src2), (!cast<Instruction>(ADD64ri32#suffix) GR64:$src1, i64immSExt32:$src2)>;
+ def : Pat<(sub GR64:$src1, i64immSExt32:$src2),
+ (!cast<Instruction>(SUB64ri32#suffix) GR64:$src1, i64immSExt32:$src2)>;
+ }
+
// Increment/Decrement reg.
// Do not make INC/DEC if it is slow
- let Predicates = [UseIncDec, p] in {
+ let Predicates = [UseIncDec, PredNDD] in {
def : Pat<(add GR8:$src, 1), (!cast<Instruction>(INC8r#suffix) GR8:$src)>;
def : Pat<(add GR16:$src, 1), (!cast<Instruction>(INC16r#suffix) GR16:$src)>;
def : Pat<(add GR32:$src, 1), (!cast<Instruction>(INC32r#suffix) GR32:$src)>;
- def : Pat<(add GR64:$src, 1), (!cast<Instruction>(INC64r#suffix) GR64:$src)>;
def : Pat<(add GR8:$src, -1), (!cast<Instruction>(DEC8r#suffix) GR8:$src)>;
def : Pat<(add GR16:$src, -1), (!cast<Instruction>(DEC16r#suffix) GR16:$src)>;
def : Pat<(add GR32:$src, -1), (!cast<Instruction>(DEC32r#suffix) GR32:$src)>;
- def : Pat<(add GR64:$src, -1), (!cast<Instruction>(DEC64r#suffix) GR64:$src)>;
def : Pat<(X86add_flag_nocf GR8:$src, -1), (!cast<Instruction>(DEC8r#suffix) GR8:$src)>;
def : Pat<(X86add_flag_nocf GR16:$src, -1), (!cast<Instruction>(DEC16r#suffix) GR16:$src)>;
def : Pat<(X86add_flag_nocf GR32:$src, -1), (!cast<Instruction>(DEC32r#suffix) GR32:$src)>;
- def : Pat<(X86add_flag_nocf GR64:$src, -1), (!cast<Instruction>(DEC64r#suffix) GR64:$src)>;
def : Pat<(X86sub_flag_nocf GR8:$src, -1), (!cast<Instruction>(INC8r#suffix) GR8:$src)>;
def : Pat<(X86sub_flag_nocf GR16:$src, -1), (!cast<Instruction>(INC16r#suffix) GR16:$src)>;
def : Pat<(X86sub_flag_nocf GR32:$src, -1), (!cast<Instruction>(INC32r#suffix) GR32:$src)>;
- def : Pat<(X86sub_flag_nocf GR64:$src, -1), (!cast<Instruction>(INC64r#suffix) GR64:$src)>;
def : Pat<(or_is_add GR8:$src, 1), (!cast<Instruction>(INC8r#suffix) GR8:$src)>;
def : Pat<(or_is_add GR16:$src, 1), (!cast<Instruction>(INC16r#suffix) GR16:$src)>;
def : Pat<(or_is_add GR32:$src, 1), (!cast<Instruction>(INC32r#suffix) GR32:$src)>;
+ }
+
+ let Predicates = [UseIncDec, PredNDDI] in {
+ def : Pat<(add GR64:$src, 1), (!cast<Instruction>(INC64r#suffix) GR64:$src)>;
+ def : Pat<(add GR64:$src, -1), (!cast<Instruction>(DEC64r#suffix) GR64:$src)>;
+ def : Pat<(X86add_flag_nocf GR64:$src, -1), (!cast<Instruction>(DEC64r#suffix) GR64:$src)>;
+ def : Pat<(X86sub_flag_nocf GR64:$src, -1), (!cast<Instruction>(INC64r#suffix) GR64:$src)>;
def : Pat<(or_is_add GR64:$src, 1), (!cast<Instruction>(INC64r#suffix) GR64:$src)>;
}
}
-defm : EFLAGSDefiningPats<"", NoNDD>;
-defm : EFLAGSDefiningPats<"_ND", HasNDD>;
+defm : EFLAGSDefiningPats<"", NoNDD, NoNDDI, 0>;
+defm : EFLAGSDefiningPats<"_ND", HasNDD, HasNDDI, 1>;
let Predicates = [HasZU] in {
// zext (mul reg/mem, imm) -> imulzu
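The add-128 rewrites above (SUB*ri with -128) trade a 16/32-bit immediate for an 8-bit one: +128 is not representable as a sign-extended 8-bit immediate, but -128 is, and x + 128 == x - (-128). A minimal C++ self-check of that identity (illustration only, not part of the patch):

  #include <cassert>
  #include <cstdint>

  int main() {
    // +128 does not fit in a sign-extended 8-bit immediate, but -128 does,
    // so "add r, 128" can be re-encoded as the shorter "sub r, -128".
    for (int64_t X : {int64_t(0), int64_t(1), int64_t(-1), int64_t(1234567890123)})
      assert(X + 128 == X - (-128));
    // -128 survives the round trip through an 8-bit immediate; +128 would not.
    assert(int64_t(int8_t(-128)) == -128);
    assert(int64_t(int8_t(128)) != 128);
    return 0;
  }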
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index 560b8c378ead7..9b22f6bb767c5 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -143,6 +143,14 @@ const X86FoldTableEntry *llvm::lookupFoldTable(unsigned RegOp, unsigned OpNum) {
return lookupFoldTableImpl(FoldTable, RegOp);
}
+bool llvm::isNonFoldableWithSameMask(unsigned RegOp) {
+ // NonFoldableWithSameMask table stores instruction opcodes that are unsafe
+ // for masked-load folding when the same mask is used.
+ ArrayRef<unsigned> Table(NonFoldableWithSameMaskTable);
+ auto I = llvm::lower_bound(Table, RegOp);
+ return I != Table.end() && *I == RegOp;
+}
+
const X86FoldTableEntry *llvm::lookupBroadcastFoldTable(unsigned RegOp,
unsigned OpNum) {
ArrayRef<X86FoldTableEntry> FoldTable;
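isNonFoldableWithSameMask assumes the generated NonFoldableWithSameMaskTable is kept sorted, so a binary search is enough. A small standalone sketch of the same membership test over a plain sorted array (the opcode values here are placeholders, not real opcodes):

  #include <algorithm>
  #include <cassert>
  #include <iterator>

  // Hypothetical stand-in for the generated, sorted opcode table.
  static const unsigned SortedOpcodes[] = {10, 42, 97, 305};

  static bool containsOpcode(unsigned Op) {
    const unsigned *End = std::end(SortedOpcodes);
    const unsigned *I = std::lower_bound(std::begin(SortedOpcodes), End, Op);
    return I != End && *I == Op; // exact match required, mirroring the lookup
  }

  int main() {
    assert(containsOpcode(42));
    assert(!containsOpcode(43));
    return 0;
  }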
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.h b/llvm/lib/Target/X86/X86InstrFoldTables.h
index 9c5dea48d2273..35a3e993e3f96 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.h
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.h
@@ -44,6 +44,10 @@ const X86FoldTableEntry *lookupTwoAddrFoldTable(unsigned RegOp);
// operand OpNum.
const X86FoldTableEntry *lookupFoldTable(unsigned RegOp, unsigned OpNum);
+// Check if an instruction is unsafe for masked-load folding when the load
+// and instruction have the same mask.
+bool isNonFoldableWithSameMask(unsigned RegOp);
+
// Look up the broadcast folding table entry for folding a broadcast with
// operand OpNum.
const X86FoldTableEntry *lookupBroadcastFoldTable(unsigned RegOp,
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index ea3f833b31e03..3cd05ab0351bd 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -494,8 +494,10 @@ def X86cmpccxadd : SDNode<"X86ISD::CMPCCXADD", SDTX86Cmpccxadd,
SDNPMemOperand]>;
// Define X86-specific addressing mode.
-let WantsParent = true in
-def addr : ComplexPattern<iPTR, 5, "selectAddr">;
+let WantsParent = true in {
+ def addr : ComplexPattern<iPTR, 5, "selectAddr">;
+ def ndd_addr : ComplexPattern<iPTR, 5, "selectNDDAddr">;
+}
def gi_addr : GIComplexOperandMatcher<s32, "selectAddr">,
GIComplexPatternEquiv<addr>;
def lea32addr : ComplexPattern<i32, 5, "selectLEAAddr",
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 2479a8dccfb00..c862d902b218a 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -5698,7 +5698,7 @@ static bool canConvert2Copy(unsigned Opc) {
/// Convert an ALUrr opcode to corresponding ALUri opcode. Such as
/// ADD32rr ==> ADD32ri
-static unsigned convertALUrr2ALUri(unsigned Opc) {
+static unsigned convertALUrr2ALUri(unsigned Opc, bool HasNDDI) {
switch (Opc) {
default:
return 0;
@@ -5707,9 +5707,7 @@ static unsigned convertALUrr2ALUri(unsigned Opc) {
return X86::TO; \
case X86::FROM##_ND: \
return X86::TO##_ND;
- FROM_TO(ADD64rr, ADD64ri32)
FROM_TO(ADC64rr, ADC64ri32)
- FROM_TO(SUB64rr, SUB64ri32)
FROM_TO(SBB64rr, SBB64ri32)
FROM_TO(AND64rr, AND64ri32)
FROM_TO(OR64rr, OR64ri32)
@@ -5739,6 +5737,8 @@ static unsigned convertALUrr2ALUri(unsigned Opc) {
#define FROM_TO(FROM, TO) \
case X86::FROM: \
return X86::TO;
+ FROM_TO(ADD64rr, ADD64ri32)
+ FROM_TO(SUB64rr, SUB64ri32)
FROM_TO(TEST64rr, TEST64ri32)
FROM_TO(CTEST64rr, CTEST64ri32)
FROM_TO(CMP64rr, CMP64ri32)
@@ -5748,6 +5748,10 @@ static unsigned convertALUrr2ALUri(unsigned Opc) {
FROM_TO(CMP32rr, CMP32ri)
FROM_TO(CCMP32rr, CCMP32ri)
#undef FROM_TO
+ case X86::ADD64rr_ND:
+ return HasNDDI ? X86::ADD64ri32_ND : 0;
+ case X86::SUB64rr_ND:
+ return HasNDDI ? X86::SUB64ri32_ND : 0;
}
}
@@ -5834,7 +5838,7 @@ bool X86InstrInfo::foldImmediateImpl(MachineInstr &UseMI, MachineInstr *DefMI,
else
return false;
} else
- NewOpc = convertALUrr2ALUri(Opc);
+ NewOpc = convertALUrr2ALUri(Opc, Subtarget.hasNDDI());
if (!NewOpc)
return false;
@@ -7534,6 +7538,10 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
//
// Utilize the mapping NonNDD -> RMW for the NDD variant.
unsigned NonNDOpc = Subtarget.hasNDD() ? X86::getNonNDVariant(Opc) : 0U;
+ // Disable memory folding for NDD instructions when the subtarget does not have NDDM.
+ if (NonNDOpc && !Subtarget.hasNDDM())
+ return nullptr;
+
const X86FoldTableEntry *I =
IsTwoAddr ? lookupTwoAddrFoldTable(NonNDOpc ? NonNDOpc : Opc)
: lookupFoldTable(Opc, OpNum);
@@ -8147,6 +8155,11 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MaskReg = Op2.getReg();
if (MaskReg) {
+ // Some instructions are invalid to fold into even with the same mask.
+ // Folding is unsafe if an active destination element may read from a
+ // source element that is masked off.
+ if (isNonFoldableWithSameMask(MI.getOpcode()))
+ return nullptr;
bool HasSameMask = false;
for (unsigned I = 1, E = MI.getDesc().getNumOperands(); I < E; ++I) {
const MachineOperand &Op = MI.getOperand(I);
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 21e6bacbacee2..2baf90eca15ea 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -50,7 +50,10 @@ def PreferLegacySetCC : Predicate<"!Subtarget->hasZU() || "
"Subtarget->preferLegacySetCC()">;
def PreferNoLegacySetCC : Predicate<"Subtarget->hasZU() && "
"!Subtarget->preferLegacySetCC()">;
+def HasNDDI : Predicate<"Subtarget->hasNDD() && Subtarget->hasNDDI()">;
+def NoNDDI : Predicate<"!Subtarget->hasNDD() || !Subtarget->hasNDDI()">;
def HasCF : Predicate<"Subtarget->hasCF()">;
+def HasJMPABS : Predicate<"Subtarget->hasJMPABS()">;
def HasCMOV : Predicate<"Subtarget->canUseCMOV()">;
def NoCMOV : Predicate<"!Subtarget->canUseCMOV()">;
def HasNOPL : Predicate<"Subtarget->hasNOPL()">;
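NoNDDI is the De Morgan complement of HasNDDI, so every subtarget satisfies exactly one of the two predicates and the 64-bit immediate pattern sets split above are neither duplicated nor lost. A tiny self-check of that invariant (illustrative only, outside the patch):

  #include <cassert>

  // Mirrors the new predicate expressions: HasNDDI = hasNDD() && hasNDDI(),
  // NoNDDI = !hasNDD() || !hasNDDI(). By De Morgan's law they are exact
  // complements.
  static bool hasNDDIPred(bool NDD, bool NDDI) { return NDD && NDDI; }
  static bool noNDDIPred(bool NDD, bool NDDI) { return !NDD || !NDDI; }

  int main() {
    for (bool NDD : {false, true})
      for (bool NDDI : {false, true})
        assert(hasNDDIPred(NDD, NDDI) != noNDDIPred(NDD, NDDI));
    return 0;
  }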
diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td
index 2a5488847e648..7e7c2f97c5793 100644
--- a/llvm/lib/Target/X86/X86InstrShiftRotate.td
+++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -280,23 +280,23 @@ def : Pat<(store (rotr (loadi64 addr:$dst), (i8 63)), addr:$dst),
(ROL64m1 addr:$dst)>, Requires<[In64BitMode]>;
let Predicates = [HasNDD] in {
-def : Pat<(rotl (loadi8 addr:$src), (i8 7)),
- (ROR8m1_ND addr:$src)>;
-def : Pat<(rotl (loadi16 addr:$src), (i8 15)),
- (ROR16m1_ND addr:$src)>;
-def : Pat<(rotl (loadi32 addr:$src), (i8 31)),
- (ROR32m1_ND addr:$src)>;
-def : Pat<(rotl (loadi64 addr:$src), (i8 63)),
- (ROR64m1_ND addr:$src)>;
-
-def : Pat<(rotr (loadi8 addr:$src), (i8 7)),
- (ROL8m1_ND addr:$src)>;
-def : Pat<(rotr (loadi16 addr:$src), (i8 15)),
- (ROL16m1_ND addr:$src)>;
-def : Pat<(rotr (loadi32 addr:$src), (i8 31)),
- (ROL32m1_ND addr:$src)>;
-def : Pat<(rotr (loadi64 addr:$src), (i8 63)),
- (ROL64m1_ND addr:$src)>;
+def : Pat<(rotl (loadi8 ndd_addr:$src), (i8 7)),
+ (ROR8m1_ND ndd_addr:$src)>;
+def : Pat<(rotl (loadi16 ndd_addr:$src), (i8 15)),
+ (ROR16m1_ND ndd_addr:$src)>;
+def : Pat<(rotl (loadi32 ndd_addr:$src), (i8 31)),
+ (ROR32m1_ND ndd_addr:$src)>;
+def : Pat<(rotl (loadi64 ndd_addr:$src), (i8 63)),
+ (ROR64m1_ND ndd_addr:$src)>;
+
+def : Pat<(rotr (loadi8 ndd_addr:$src), (i8 7)),
+ (ROL8m1_ND ndd_addr:$src)>;
+def : Pat<(rotr (loadi16 ndd_addr:$src), (i8 15)),
+ (ROL16m1_ND ndd_addr:$src)>;
+def : Pat<(rotr (loadi32 ndd_addr:$src), (i8 31)),
+ (ROL32m1_ND ndd_addr:$src)>;
+def : Pat<(rotr (loadi64 ndd_addr:$src), (i8 63)),
+ (ROL64m1_ND ndd_addr:$src)>;
}
// Patterns for rotate with relocImm for the immediate field.
@@ -393,8 +393,8 @@ class ShlrdOpMRI8U_R<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node
let SchedRW = [WriteSHDmri];
let mayLoad = 1;
let Pattern = !if(!eq(m, "shld"),
- [(set t.RegClass:$dst, (node (t.LoadNode addr:$src1), t.RegClass:$src2, (i8 imm:$src3)))],
- [(set t.RegClass:$dst, (node t.RegClass:$src2, (t.LoadNode addr:$src1), (i8 imm:$src3)))]);
+ [(set t.RegClass:$dst, (node (t.LoadNode ndd_addr:$src1), t.RegClass:$src2, (i8 imm:$src3)))],
+ [(set t.RegClass:$dst, (node t.RegClass:$src2, (t.LoadNode ndd_addr:$src1), (i8 imm:$src3)))]);
}
class ShlrdOpMRC_R<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node = null_frag>
@@ -402,8 +402,8 @@ class ShlrdOpMRC_R<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node =
let Uses = [CL];
let SchedRW = [WriteSHDmrcl];
let Pattern = !if(!eq(m, "shld"),
- [(set t.RegClass:$dst, (node (t.LoadNode addr:$src1), t.RegClass:$src2, CL))],
- [(set t.RegClass:$dst, (node t.RegClass:$src2, (t.LoadNode addr:$src1), CL))]);
+ [(set t.RegClass:$dst, (node (t.LoadNode ndd_addr:$src1), t.RegClass:$src2, CL))],
+ [(set t.RegClass:$dst, (node t.RegClass:$src2, (t.LoadNode ndd_addr:$src1), CL))]);
}
multiclass Shlrd<bits<8> o1, bits<8> o2, bits<8> o3, string m, SDPatternOperator node, SDPatternOperator t_node> {
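The rewritten ROL/ROR patterns rely on the identity rotl(x, w - 1) == rotr(x, 1) for a w-bit value. A short C++20 check of that identity (illustration, not part of the patch):

  // Requires C++20 for <bit>.
  #include <bit>
  #include <cassert>
  #include <cstdint>

  int main() {
    // rotl(x, w - 1) == rotr(x, 1) for a w-bit value x, and symmetrically
    // rotr(x, w - 1) == rotl(x, 1).
    for (uint32_t X : {0u, 1u, 0x80000000u, 0xdeadbeefu})
      assert(std::rotl(X, 31) == std::rotr(X, 1));
    for (uint8_t X : {uint8_t(0), uint8_t(1), uint8_t(0xa5)})
      assert(std::rotl(X, 7) == std::rotr(X, 1));
    return 0;
  }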
diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td
index 6aae90b77fbad..6fcbfe76bdfbc 100644
--- a/llvm/lib/Target/X86/X86InstrUtils.td
+++ b/llvm/lib/Target/X86/X86InstrUtils.td
@@ -1039,13 +1039,13 @@ class BinOpRM_R<bits<8> o, string m, X86TypeInfo t, bit ndd = 0>
class BinOpRM_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, bit ndd = 0>
: BinOpRM<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, (outs t.RegClass:$dst),
[(set t.RegClass:$dst, EFLAGS, (node t.RegClass:$src1,
- (t.LoadNode addr:$src2)))]>, DefEFLAGS, NDD<ndd>;
+ (t.LoadNode !if(ndd, ndd_addr, addr):$src2)))]>, DefEFLAGS, NDD<ndd>;
// BinOpRMF_RF - Instructions that read "reg, [mem]", write "reg" and read/write
// EFLAGS.
class BinOpRMF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node = null_frag, bit ndd = 0>
: BinOpRM<o, m, !if(!eq(ndd, 0), binop_args, binop_ndd_args), t, (outs t.RegClass:$dst),
[(set t.RegClass:$dst, EFLAGS,
- (node t.RegClass:$src1, (t.LoadNode addr:$src2), EFLAGS))]>,
+ (node t.RegClass:$src1, (t.LoadNode !if(ndd, ndd_addr, addr):$src2), EFLAGS))]>,
DefEFLAGS, UseEFLAGS, NDD<ndd> {
let SchedRW = [WriteADC.Folded, WriteADC.ReadAfterFold,
// base, scale, index, offset, segment.
@@ -1126,7 +1126,7 @@ class BinOpMR_R<bits<8> o, string m, X86TypeInfo t>
// BinOpMR_RF - Instructions that read "[mem], reg", and write "reg", EFLAGS.
class BinOpMR_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node>
: BinOpMR<o, m, binop_ndd_args, t, (outs t.RegClass:$dst),
- [(set t.RegClass:$dst, EFLAGS, (node (t.LoadNode addr:$src1),
+ [(set t.RegClass:$dst, EFLAGS, (node (t.LoadNode ndd_addr:$src1),
t.RegClass:$src2))]>, DefEFLAGS, NDD<1>;
// BinOpMR_F - Instructions that read "[mem], imm8" and write EFLAGS only.
class BinOpMR_F<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node>
@@ -1157,7 +1157,7 @@ class BinOpMR_MF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node>
// read/write EFLAGS.
class BinOpMRF_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node>
: BinOpMR<o, m, binop_ndd_args, t, (outs t.RegClass:$dst),
- [(set t.RegClass:$dst, EFLAGS, (node (load addr:$src1),
+ [(set t.RegClass:$dst, EFLAGS, (node (load ndd_addr:$src1),
t.RegClass:$src2, EFLAGS))]>, DefEFLAGS, UseEFLAGS, NDD<1>,
Sched<[WriteADC.Folded, WriteADC.ReadAfterFold]>;
// BinOpMRF_MF - Instructions that read "[mem], reg", write "[mem]" and
@@ -1197,7 +1197,7 @@ class BinOpMI_R<bits<8> o, string m, X86TypeInfo t, Format f>
class BinOpMI_RF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node,
Format f>
: BinOpMI<o, m, binop_ndd_args, t, f, (outs t.RegClass:$dst),
- [(set t.RegClass:$dst, EFLAGS, (node (t.LoadNode addr:$src1), t.ImmOperator:$src2))]>,
+ [(set t.RegClass:$dst, EFLAGS, (node (t.LoadNode ndd_addr:$src1), t.ImmOperator:$src2))]>,
Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>, DefEFLAGS, NDD<1>;
// BinOpMI_M - Instructions that read "[mem], imm" and write "[mem]".
class BinOpMI_M<bits<8> o, string m, X86TypeInfo t, Format f>
@@ -1216,7 +1216,7 @@ class BinOpMI_MF<bits<8> o, string m, X86TypeInfo t, SDPatternOperator node, For
// read/write EFLAGS.
class BinOpMIF_RF<bits<8> o, string m, X86TypeInfo t, SDNode node, Format f>
: BinOpMI<o, m, binop_ndd_args, t, f, (outs t.RegClass:$dst),
- [(set t.RegClass:$dst, EFLAGS, (node (t.VT (load addr:$src1)),
+ [(set t.RegClass:$dst, EFLAGS, (node (t.VT (load ndd_addr:$src1)),
t.ImmOperator:$src2, EFLAGS))]>,
Sched<[WriteADC.Folded, WriteADC.ReadAfterFold]>, DefEFLAGS, UseEFLAGS, NDD<1>;
// BinOpMIF_MF - Instructions that read "[mem], imm", write "[mem]" and
@@ -1251,7 +1251,7 @@ class BinOpMI8_R<string m, X86TypeInfo t, Format f>
// BinOpMI8U_R - Instructions that read "[mem], u8imm" and write "reg".
class BinOpMI8U_R<string m, Format f, X86TypeInfo t, SDPatternOperator node = null_frag>
: BinOpMI8U<m, binop_ndd_args, t, f, (outs t.RegClass:$dst),
- [(set t.RegClass:$dst, (node (t.LoadNode addr:$src1), (i8 imm:$src2)))]>, NDD<1>;
+ [(set t.RegClass:$dst, (node (t.LoadNode ndd_addr:$src1), (i8 imm:$src2)))]>, NDD<1>;
// BinOpMI8_RF - Instructions that read "[mem], imm8" and write "reg"/EFLAGS.
class BinOpMI8_RF<string m, X86TypeInfo t, Format f>
: BinOpMI8<m, binop_ndd_args, t, f, (outs t.RegClass:$dst)>, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>, DefEFLAGS, NDD<1>;
@@ -1325,7 +1325,7 @@ class BinOpMC_M<string m, Format f, X86TypeInfo t, SDPatternOperator node = null
// BinOpMC_R - Instructions that read "[mem], cl" and write reg.
class BinOpMC_R<string m, Format f, X86TypeInfo t, SDPatternOperator node = null_frag>
: ITy<0xD3, f, t, (outs t.RegClass:$dst), (ins t.MemOperand:$src1), m, binop_cl_ndd_args,
- [(set t.RegClass:$dst, (node (t.LoadNode addr:$src1), CL))]>, NDD<1> {
+ [(set t.RegClass:$dst, (node (t.LoadNode ndd_addr:$src1), CL))]>, NDD<1> {
let Uses = [CL];
let mayLoad = 1;
}
@@ -1358,13 +1358,13 @@ class UnaryOpM<bits<8> o, Format f, string m, string args, X86TypeInfo t,
class UnaryOpM_R<bits<8> o, Format f, string m, X86TypeInfo t,
SDPatternOperator node = null_frag>
: UnaryOpM<o, f, m, unaryop_ndd_args, t, (outs t.RegClass:$dst),
- [(set t.RegClass:$dst, (node (t.LoadNode addr:$src1)))]>,
+ [(set t.RegClass:$dst, (node (t.LoadNode ndd_addr:$src1)))]>,
Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>, NDD<1>;
// UnaryOpM_RF - Instructions that read "[mem]" and writes "reg"/EFLAGS.
class UnaryOpM_RF<bits<8> o, Format f, string m, X86TypeInfo t,
SDPatternOperator node = null_frag>
: UnaryOpM<o, f, m, unaryop_ndd_args, t, (outs t.RegClass:$dst),
- [(set t.RegClass:$dst, EFLAGS, (node (t.LoadNode addr:$src1)))]>,
+ [(set t.RegClass:$dst, EFLAGS, (node (t.LoadNode ndd_addr:$src1)))]>,
Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>, DefEFLAGS, NDD<1>;
// UnaryOpM_M - Instructions that read "[mem]" and writes "[mem]".
class UnaryOpM_M<bits<8> o, Format f, string m, X86TypeInfo t,
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 0d4131632ff56..48fff529c6565 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -370,7 +370,7 @@ MCOperand X86MCInstLower::LowerMachineOperand(const MachineInstr *MI,
// Replace TAILJMP opcodes with their equivalent opcodes that have encoding
// information.
-static unsigned convertTailJumpOpcode(unsigned Opcode) {
+static unsigned convertTailJumpOpcode(unsigned Opcode, bool IsLarge = false) {
switch (Opcode) {
case X86::TAILJMPr:
Opcode = X86::JMP32r;
@@ -392,7 +392,7 @@ static unsigned convertTailJumpOpcode(unsigned Opcode) {
break;
case X86::TAILJMPd:
case X86::TAILJMPd64:
- Opcode = X86::JMP_1;
+ Opcode = IsLarge ? X86::JMPABS64i : X86::JMP_1;
break;
case X86::TAILJMPd_CC:
case X86::TAILJMPd64_CC:
@@ -485,10 +485,17 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
case X86::TAILJMPr64:
case X86::TAILJMPr64_REX:
case X86::TAILJMPd:
- case X86::TAILJMPd64:
assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands!");
OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode()));
break;
+ case X86::TAILJMPd64: {
+ assert(OutMI.getNumOperands() == 1 && "Unexpected number of operands!");
+ bool IsLarge = TM.getCodeModel() == CodeModel::Large;
+ assert((!IsLarge || AsmPrinter.getSubtarget().hasJMPABS()) &&
+ "Unexpected TAILJMPd64 in large code model without JMPABS");
+ OutMI.setOpcode(convertTailJumpOpcode(OutMI.getOpcode(), IsLarge));
+ break;
+ }
case X86::TAILJMPd_CC:
case X86::TAILJMPd64_CC:
assert(OutMI.getNumOperands() == 2 && "Unexpected number of operands!");
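The TAILJMPd64 special case exists because JMP_1 carries a rel32 displacement, which only reaches about +/-2 GiB; in the large code model the callee may lie outside that window, which is what the 64-bit absolute JMPABS covers. A rough standalone illustration of the range limit (the addresses are made up; the small adjustment for the jump instruction's own length is ignored):

  #include <cassert>
  #include <cstdint>
  #include <limits>

  // Roughly: can a jmp with a rel32 displacement reach the target?
  static bool fitsInRel32(uint64_t From, uint64_t To) {
    int64_t Delta = static_cast<int64_t>(To - From);
    return Delta >= std::numeric_limits<int32_t>::min() &&
           Delta <= std::numeric_limits<int32_t>::max();
  }

  int main() {
    const uint64_t Caller = 0x100001000ULL;              // made-up address
    assert(fitsInRel32(Caller, Caller + 0x1000));        // near: rel32 is fine
    assert(!fitsInRel32(Caller, Caller + (3ULL << 30))); // ~3 GiB: needs JMPABS
    return 0;
  }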
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 4e2e98410f325..a36fd214e33f5 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -264,8 +264,9 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef TuneCPU,
FullFS = (Twine(FullFS) + "," + FS).str();
// Disable 64-bit only features in non-64-bit mode.
- StringRef FeaturesIn64BitOnly[] = {
- "egpr", "push2pop2", "ppx", "ndd", "ccmp", "nf", "cf", "zu", "uintr"};
+ StringRef FeaturesIn64BitOnly[] = {"egpr", "push2pop2", "ppx", "ndd",
+ "ccmp", "nf", "cf", "zu",
+ "jmpabs", "uintr"};
if (FullFS.find("-64bit-mode") != std::string::npos)
for (StringRef F : FeaturesIn64BitOnly)
FullFS += ",-" + F.str();
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 5305b39cffefd..1e153c4a6105c 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -625,7 +625,7 @@ void X86PassConfig::addPreEmitPass2() {
// KCFI indirect call checks are lowered to a bundle, and on Darwin platforms,
// also CALL_RVMARKER.
- addPass(createUnpackMachineBundles([&TT](const MachineFunction &MF) {
+ addPass(createUnpackMachineBundlesLegacy([&TT](const MachineFunction &MF) {
// Only run bundle expansion if the module uses kcfi, or there are relevant
// ObjC runtime functions present in the module.
const Function &F = MF.getFunction();
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index e1bdfbe42d07f..dfe97f178bd46 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -2184,6 +2184,7 @@ StringMap<bool> sys::getHostCPUFeatures() {
Features["nf"] = HasAPXF;
Features["cf"] = HasAPXF;
Features["zu"] = HasAPXF;
+ Features["jmpabs"] = HasAPXF;
bool HasLeafD = MaxLevel >= 0xd &&
!getX86CpuIDAndInfoEx(0xd, 0x1, &EAX, &EBX, &ECX, &EDX);
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index f848b1ac08607..b3859eb4ff2fd 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -142,8 +142,8 @@ constexpr FeatureBitset FeaturesDiamondRapids =
FeatureCMPCCXADD | FeatureAVXIFMA | FeatureAVXNECONVERT |
FeatureAVXVNNIINT8 | FeatureAVXVNNIINT16 | FeatureSHA512 | FeatureSM3 |
FeatureSM4 | FeatureEGPR | FeatureZU | FeatureCCMP | FeaturePush2Pop2 |
- FeaturePPX | FeatureNDD | FeatureNF | FeatureMOVRS | FeatureAMX_MOVRS |
- FeatureAMX_AVX512 | FeatureAMX_FP8 | FeatureAMX_TF32;
+ FeaturePPX | FeatureNDD | FeatureNF | FeatureJMPABS | FeatureMOVRS |
+ FeatureAMX_MOVRS | FeatureAMX_AVX512 | FeatureAMX_FP8 | FeatureAMX_TF32;
// Intel Atom processors.
// Bonnell has feature parity with Core2 and adds MOVBE.
@@ -178,7 +178,7 @@ constexpr FeatureBitset FeaturesPantherlake =
constexpr FeatureBitset FeaturesNovalake =
FeaturesPantherlake | FeaturePREFETCHI | FeatureAVX10_2 | FeatureMOVRS |
FeatureEGPR | FeatureZU | FeatureCCMP | FeaturePush2Pop2 | FeaturePPX |
- FeatureNDD | FeatureNF;
+ FeatureNDD | FeatureNF | FeatureJMPABS;
constexpr FeatureBitset FeaturesClearwaterforest =
(FeaturesSierraforest ^ FeatureWIDEKL) | FeatureAVXVNNIINT16 |
FeatureSHA512 | FeatureSM3 | FeatureSM4 | FeaturePREFETCHI | FeatureUSERMSR;
@@ -659,11 +659,12 @@ constexpr FeatureBitset ImpliedFeaturesCCMP = {};
constexpr FeatureBitset ImpliedFeaturesNF = {};
constexpr FeatureBitset ImpliedFeaturesCF = {};
constexpr FeatureBitset ImpliedFeaturesZU = {};
+constexpr FeatureBitset ImpliedFeaturesJMPABS = {};
constexpr FeatureBitset ImpliedFeaturesAPXF =
ImpliedFeaturesEGPR | ImpliedFeaturesPush2Pop2 | ImpliedFeaturesPPX |
ImpliedFeaturesNDD | ImpliedFeaturesCCMP | ImpliedFeaturesNF |
- ImpliedFeaturesCF | ImpliedFeaturesZU;
+ ImpliedFeaturesCF | ImpliedFeaturesZU | ImpliedFeaturesJMPABS;
constexpr FeatureBitset ImpliedFeaturesMOVRS = {};
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 369310deb6e39..c53435db356bd 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -640,6 +640,189 @@ static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL) {
return true;
}
+// Check if this array of constants represents a log2 table.
+// Iterate over the elements of \p Table, trying to find/match each of the
+// numbers from 0 to \p InputBits - 1 that should represent log2 results.
+static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift,
+ Type *AccessTy, unsigned InputBits,
+ const APInt &GEPIdxFactor, const DataLayout &DL) {
+ for (unsigned Idx = 0; Idx < InputBits; Idx++) {
+ APInt Index = (APInt::getLowBitsSet(InputBits, Idx + 1) * Mul).lshr(Shift);
+ ConstantInt *C = dyn_cast_or_null<ConstantInt>(
+ ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
+ if (!C || C->getValue() != Idx)
+ return false;
+ }
+
+ // Verify that an input of zero will select table index 0.
+ APInt ZeroIndex = Mul.lshr(Shift);
+ if (!ZeroIndex.isZero())
+ return false;
+
+ return true;
+}
+
+// Try to recognize a table-based log2 implementation.
+// E.g., an example in C (for more cases, please see the llvm tests):
+// int f(unsigned v) {
+// static const char table[32] =
+// {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
+// 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
+//
+// v |= v >> 1; // first round down to one less than a power of 2
+// v |= v >> 2;
+// v |= v >> 4;
+// v |= v >> 8;
+// v |= v >> 16;
+//
+// return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
+// }
+// this can be lowered to a `ctlz` instruction.
+// There is also a special case when the input value is 0.
+//
+// The >> and |= sequence sets all bits below the most significant set bit. The
+// multiply is a de Bruijn sequence that contains each pattern of bits in it.
+// The shift extracts the top bits after the multiply, and that index into the
+// table should represent the floor log base 2 of the original number.
+//
+// Here are some examples of LLVM IR for a 64-bit target.
+//
+// CASE 1:
+// %shr = lshr i32 %v, 1
+// %or = or i32 %shr, %v
+// %shr1 = lshr i32 %or, 2
+// %or2 = or i32 %shr1, %or
+// %shr3 = lshr i32 %or2, 4
+// %or4 = or i32 %shr3, %or2
+// %shr5 = lshr i32 %or4, 8
+// %or6 = or i32 %shr5, %or4
+// %shr7 = lshr i32 %or6, 16
+// %or8 = or i32 %shr7, %or6
+// %mul = mul i32 %or8, 130329821
+// %shr9 = lshr i32 %mul, 27
+// %idxprom = zext nneg i32 %shr9 to i64
+// %arrayidx = getelementptr inbounds i8, ptr @table, i64 %idxprom
+// %0 = load i8, ptr %arrayidx, align 1
+//
+// CASE 2:
+// %shr = lshr i64 %v, 1
+// %or = or i64 %shr, %v
+// %shr1 = lshr i64 %or, 2
+// %or2 = or i64 %shr1, %or
+// %shr3 = lshr i64 %or2, 4
+// %or4 = or i64 %shr3, %or2
+// %shr5 = lshr i64 %or4, 8
+// %or6 = or i64 %shr5, %or4
+// %shr7 = lshr i64 %or6, 16
+// %or8 = or i64 %shr7, %or6
+// %shr9 = lshr i64 %or8, 32
+// %or10 = or i64 %shr9, %or8
+// %mul = mul i64 %or10, 285870213051386505
+// %shr11 = lshr i64 %mul, 58
+// %arrayidx = getelementptr inbounds i8, ptr @table, i64 %shr11
+// %0 = load i8, ptr %arrayidx, align 1
+//
+// All these can be lowered to @llvm.ctlz.i32/64 intrinsics and a subtract.
+static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL,
+ TargetTransformInfo &TTI) {
+ LoadInst *LI = dyn_cast<LoadInst>(&I);
+ if (!LI)
+ return false;
+
+ Type *AccessType = LI->getType();
+ if (!AccessType->isIntegerTy())
+ return false;
+
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
+ if (!GEP || !GEP->hasNoUnsignedSignedWrap())
+ return false;
+
+ GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
+ if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
+ return false;
+
+ unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
+ APInt ModOffset(BW, 0);
+ SmallMapVector<Value *, APInt, 4> VarOffsets;
+ if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset) ||
+ VarOffsets.size() != 1 || ModOffset != 0)
+ return false;
+ auto [GepIdx, GEPScale] = VarOffsets.front();
+
+ Value *X;
+ const APInt *MulConst, *ShiftConst;
+ // Check that the gep variable index is (x * MulConst) >> ShiftConst.
+ auto MatchInner =
+ m_LShr(m_Mul(m_Value(X), m_APInt(MulConst)), m_APInt(ShiftConst));
+ if (!match(GepIdx, m_CastOrSelf(MatchInner)))
+ return false;
+
+ unsigned InputBits = X->getType()->getScalarSizeInBits();
+ if (InputBits != 16 && InputBits != 32 && InputBits != 64 && InputBits != 128)
+ return false;
+
+ // Verify shift amount.
+ // TODO: Allow other shift amounts when we have proper test coverage.
+ if (*ShiftConst != InputBits - Log2_32(InputBits))
+ return false;
+
+ // Match the sequence of OR operations with right shifts by powers of 2.
+ for (unsigned ShiftAmt = InputBits / 2; ShiftAmt != 0; ShiftAmt /= 2) {
+ Value *Y;
+ if (!match(X, m_c_Or(m_LShr(m_Value(Y), m_SpecificInt(ShiftAmt)),
+ m_Deferred(Y))))
+ return false;
+ X = Y;
+ }
+
+ if (!GEPScale.isIntN(InputBits) ||
+ !isLog2Table(GVTable->getInitializer(), *MulConst, *ShiftConst,
+ AccessType, InputBits, GEPScale.zextOrTrunc(InputBits), DL))
+ return false;
+
+ ConstantInt *ZeroTableElem = cast<ConstantInt>(
+ ConstantFoldLoadFromConst(GVTable->getInitializer(), AccessType, DL));
+
+ // Use InputBits - 1 - ctlz(X) to compute log2(X).
+ IRBuilder<> B(LI);
+ ConstantInt *BoolConst = B.getTrue();
+ Type *XType = X->getType();
+
+ // Check that the backend has an efficient ctlz instruction.
+ // FIXME: Teach the backend to emit the original code when ctlz isn't
+ // supported like we do for cttz.
+ IntrinsicCostAttributes Attrs(
+ Intrinsic::ctlz, XType,
+ {PoisonValue::get(XType), /*is_zero_poison=*/BoolConst});
+ InstructionCost Cost =
+ TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
+ if (Cost > TargetTransformInfo::TCC_Basic)
+ return false;
+
+ Value *Ctlz = B.CreateIntrinsic(Intrinsic::ctlz, {XType}, {X, BoolConst});
+
+ Constant *InputBitsM1 = ConstantInt::get(XType, InputBits - 1);
+ Value *Sub = B.CreateSub(InputBitsM1, Ctlz);
+
+ // The table won't produce a sensible result for 0.
+ Value *Cmp = B.CreateICmpEQ(X, ConstantInt::get(XType, 0));
+ Value *Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Sub);
+
+ // The true branch of select handles the log2(0) case, which is rare.
+ if (!ProfcheckDisableMetadataFixes) {
+ if (Instruction *SelectI = dyn_cast<Instruction>(Select))
+ SelectI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(SelectI->getContext()).createUnlikelyBranchWeights());
+ }
+
+ Value *ZExtOrTrunc = B.CreateZExtOrTrunc(Select, AccessType);
+
+ LI->replaceAllUsesWith(ZExtOrTrunc);
+
+ return true;
+}
+
/// This is used by foldLoadsRecursive() to capture a Root Load node which is
/// of type or(load, load) and recursively build the wide load. Also capture the
/// shift amount, zero extend type and loadSize.
@@ -1320,7 +1503,7 @@ void StrNCmpInliner::inlineCompare(Value *LHS, StringRef RHS, uint64_t N,
BasicBlock::Create(Ctx, "sub_" + Twine(I), BBCI->getParent(), BBTail));
BasicBlock *BBNE = BasicBlock::Create(Ctx, "ne", BBCI->getParent(), BBTail);
- cast<BranchInst>(BBCI->getTerminator())->setSuccessor(0, BBSubs[0]);
+ cast<UncondBrInst>(BBCI->getTerminator())->setSuccessor(BBSubs[0]);
B.SetInsertPoint(BBNE);
PHINode *Phi = B.CreatePHI(CI->getType(), N);
@@ -1337,7 +1520,7 @@ void StrNCmpInliner::inlineCompare(Value *LHS, StringRef RHS, uint64_t N,
ConstantInt::get(CI->getType(), static_cast<unsigned char>(RHS[i]));
Value *Sub = Swapped ? B.CreateSub(VR, VL) : B.CreateSub(VL, VR);
if (i < N - 1) {
- BranchInst *CondBrInst = B.CreateCondBr(
+ CondBrInst *CondBrInst = B.CreateCondBr(
B.CreateICmpNE(Sub, ConstantInt::get(CI->getType(), 0)), BBNE,
BBSubs[i + 1]);
@@ -1842,6 +2025,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT,
MadeChange |= tryToRecognizePopCount(I);
MadeChange |= tryToFPToSat(I, TTI);
MadeChange |= tryToRecognizeTableBasedCttz(I, DL);
+ MadeChange |= tryToRecognizeTableBasedLog2(I, DL, TTI);
MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
MadeChange |= foldPatternedLoads(I, DL);
MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
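The de Bruijn table recognized by tryToRecognizeTableBasedLog2 computes floor(log2(v)) for nonzero v, and the combine replaces it with InputBits - 1 - ctlz(v), handling v == 0 through table element 0. A standalone cross-check of the 32-bit example from the function comment (table and multiplier copied from it; not part of the patch):

  // Requires C++20 for std::countl_zero.
  #include <bit>
  #include <cassert>
  #include <cstdint>

  // Table and multiplier copied from the C example in the comment above.
  static const char Table[32] = {0,  9,  1,  10, 13, 21, 2,  29, 11, 14, 16,
                                 18, 22, 25, 3,  30, 8,  12, 20, 28, 15, 17,
                                 24, 7,  19, 27, 23, 6,  26, 5,  4,  31};

  static int tableLog2(uint32_t V) {
    V |= V >> 1; V |= V >> 2; V |= V >> 4; V |= V >> 8; V |= V >> 16;
    return Table[(uint32_t)(V * 0x07C4ACDDu) >> 27];
  }

  int main() {
    // For nonzero inputs the table produces floor(log2(V)), which equals the
    // InputBits - 1 - ctlz(V) form the combine emits.
    for (uint32_t V : {1u, 2u, 3u, 255u, 4096u, 0x80000000u, 0xffffffffu})
      assert(tableLog2(V) == 31 - std::countl_zero(V));
    return 0;
  }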
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index c592d26fa7710..f780a7bbc8f66 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -687,8 +687,7 @@ void coro::BaseCloner::replaceEntryBlock() {
// exactly one predecessor, which we created when splitting out
// AllocaSpillBlock to begin with.
assert(Entry->hasOneUse());
- auto BranchToEntry = cast<BranchInst>(Entry->user_back());
- assert(BranchToEntry->isUnconditional());
+ auto BranchToEntry = cast<UncondBrInst>(Entry->user_back());
Builder.SetInsertPoint(BranchToEntry);
Builder.CreateUnreachable();
BranchToEntry->eraseFromParent();
@@ -717,8 +716,7 @@ void coro::BaseCloner::replaceEntryBlock() {
Shape.ABI == coro::ABI::RetconOnce) &&
isa<CoroSuspendRetconInst>(ActiveSuspend)));
auto *MappedCS = cast<AnyCoroSuspendInst>(VMap[ActiveSuspend]);
- auto Branch = cast<BranchInst>(MappedCS->getNextNode());
- assert(Branch->isUnconditional());
+ auto Branch = cast<UncondBrInst>(MappedCS->getNextNode());
Builder.CreateBr(Branch->getSuccessor(0));
break;
}
@@ -820,10 +818,10 @@ static void updateScopeLine(Instruction *ActiveSuspend,
// instructions are not in the same BB.
// FIXME: remove this hardcoded number of tries.
for (unsigned Repeat = 0; Repeat < 2; Repeat++) {
- auto *Branch = dyn_cast_or_null<BranchInst>(Successor);
- if (!Branch || !Branch->isUnconditional())
+ auto *Branch = dyn_cast_or_null<UncondBrInst>(Successor);
+ if (!Branch)
break;
- Successor = Branch->getSuccessor(0)->getFirstNonPHIOrDbg();
+ Successor = Branch->getSuccessor()->getFirstNonPHIOrDbg();
}
// Find the first successor of ActiveSuspend with a non-zero line location.
@@ -1307,7 +1305,7 @@ static bool simplifySuspendPoint(CoroSuspendInst *Suspend,
// No longer need a call to coro.resume or coro.destroy.
if (auto *Invoke = dyn_cast<InvokeInst>(CB)) {
- BranchInst::Create(Invoke->getNormalDest(), Invoke->getIterator());
+ UncondBrInst::Create(Invoke->getNormalDest(), Invoke->getIterator());
}
// Grab the CalledValue from CB before erasing the CallInstr.
@@ -1563,7 +1561,7 @@ struct SwitchCoroutineSplitter {
S->getNextNode(), ResumeBB->getName() + Twine(".landing"));
Switch->addCase(IndexVal, ResumeBB);
- cast<BranchInst>(SuspendBB->getTerminator())->setSuccessor(0, LandingBB);
+ cast<UncondBrInst>(SuspendBB->getTerminator())->setSuccessor(LandingBB);
auto *PN = PHINode::Create(Builder.getInt8Ty(), 2, "");
PN->insertBefore(LandingBB->begin());
S->replaceAllUsesWith(PN);
@@ -1782,7 +1780,7 @@ void coro::AsyncABI::splitCoroutine(Function &F, coro::Shape &Shape,
// point.
auto *SuspendBB = Suspend->getParent();
auto *NewSuspendBB = SuspendBB->splitBasicBlock(Suspend);
- auto *Branch = cast<BranchInst>(SuspendBB->getTerminator());
+ auto *Branch = cast<UncondBrInst>(SuspendBB->getTerminator());
// Place it before the first suspend.
auto *ReturnBB =
@@ -1880,7 +1878,7 @@ void coro::AnyRetconABI::splitCoroutine(Function &F, coro::Shape &Shape,
// the suspend point.
auto SuspendBB = Suspend->getParent();
auto NewSuspendBB = SuspendBB->splitBasicBlock(Suspend);
- auto Branch = cast<BranchInst>(SuspendBB->getTerminator());
+ auto Branch = cast<UncondBrInst>(SuspendBB->getTerminator());
// Create the unified return block.
if (!ReturnBB) {
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index bdc3c51f3162d..8079fd89c9c37 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -2472,7 +2472,7 @@ ChangeStatus Attributor::cleanupIR() {
Callee->removeParamAttr(Idx, Attribute::NoUndef);
}
}
- if (isa<Constant>(NewV) && isa<BranchInst>(U->getUser())) {
+ if (isa<Constant>(NewV) && isa<CondBrInst>(U->getUser())) {
Instruction *UserI = cast<Instruction>(U->getUser());
if (isa<UndefValue>(NewV)) {
ToBeChangedToUnreachableInsts.insert(UserI);
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index b5703f7b87e74..95c0531c2183b 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -664,11 +664,10 @@ static void followUsesInMBEC(AAType &AA, Attributor &A, StateType &S,
if (S.isAtFixpoint())
return;
- SmallVector<const BranchInst *, 4> BrInsts;
+ SmallVector<const CondBrInst *, 4> BrInsts;
auto Pred = [&](const Instruction *I) {
- if (const BranchInst *Br = dyn_cast<BranchInst>(I))
- if (Br->isConditional())
- BrInsts.push_back(Br);
+ if (const CondBrInst *Br = dyn_cast<CondBrInst>(I))
+ BrInsts.push_back(Br);
return true;
};
@@ -705,7 +704,7 @@ static void followUsesInMBEC(AAType &AA, Attributor &A, StateType &S,
// }
Explorer->checkForAllContext(&CtxI, Pred);
- for (const BranchInst *Br : BrInsts) {
+ for (const CondBrInst *Br : BrInsts) {
StateType ParentState;
// The known state of the parent state is a conjunction of children's
@@ -4698,26 +4697,30 @@ identifyAliveSuccessors(Attributor &A, const InvokeInst &II,
}
static bool
-identifyAliveSuccessors(Attributor &A, const BranchInst &BI,
+identifyAliveSuccessors(Attributor &, const UncondBrInst &BI,
+ AbstractAttribute &,
+ SmallVectorImpl<const Instruction *> &AliveSuccessors) {
+ AliveSuccessors.push_back(&BI.getSuccessor()->front());
+ return false;
+}
+
+static bool
+identifyAliveSuccessors(Attributor &A, const CondBrInst &BI,
AbstractAttribute &AA,
SmallVectorImpl<const Instruction *> &AliveSuccessors) {
bool UsedAssumedInformation = false;
- if (BI.getNumSuccessors() == 1) {
- AliveSuccessors.push_back(&BI.getSuccessor(0)->front());
+ std::optional<Constant *> C =
+ A.getAssumedConstant(*BI.getCondition(), AA, UsedAssumedInformation);
+ if (!C || isa_and_nonnull<UndefValue>(*C)) {
+ // No value yet, assume both edges are dead.
+ } else if (isa_and_nonnull<ConstantInt>(*C)) {
+ const BasicBlock *SuccBB =
+ BI.getSuccessor(1 - cast<ConstantInt>(*C)->getValue().getZExtValue());
+ AliveSuccessors.push_back(&SuccBB->front());
} else {
- std::optional<Constant *> C =
- A.getAssumedConstant(*BI.getCondition(), AA, UsedAssumedInformation);
- if (!C || isa_and_nonnull<UndefValue>(*C)) {
- // No value yet, assume both edges are dead.
- } else if (isa_and_nonnull<ConstantInt>(*C)) {
- const BasicBlock *SuccBB =
- BI.getSuccessor(1 - cast<ConstantInt>(*C)->getValue().getZExtValue());
- AliveSuccessors.push_back(&SuccBB->front());
- } else {
- AliveSuccessors.push_back(&BI.getSuccessor(0)->front());
- AliveSuccessors.push_back(&BI.getSuccessor(1)->front());
- UsedAssumedInformation = false;
- }
+ AliveSuccessors.push_back(&BI.getSuccessor(0)->front());
+ AliveSuccessors.push_back(&BI.getSuccessor(1)->front());
+ UsedAssumedInformation = false;
}
return UsedAssumedInformation;
}
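As a small sanity check of the "1 - getZExtValue()" indexing above: successor 0 is the edge taken for a true condition and successor 1 for a false one, so the alive index for a known constant condition is simply (hypothetical helper):

  unsigned aliveSuccessorIndex(bool CondIsTrue) {
    return 1u - static_cast<unsigned>(CondIsTrue); // true -> succ 0, false -> succ 1
  }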
@@ -4831,8 +4834,11 @@ ChangeStatus AAIsDeadFunction::updateImpl(Attributor &A) {
*this, AliveSuccessors);
break;
case Instruction::UncondBr:
+ UsedAssumedInformation = identifyAliveSuccessors(
+ A, cast<UncondBrInst>(*I), *this, AliveSuccessors);
+ break;
case Instruction::CondBr:
- UsedAssumedInformation = identifyAliveSuccessors(A, cast<BranchInst>(*I),
+ UsedAssumedInformation = identifyAliveSuccessors(A, cast<CondBrInst>(*I),
*this, AliveSuccessors);
break;
case Instruction::Switch:
@@ -6912,7 +6918,7 @@ struct AAHeapToStackFunction final : public AAHeapToStack {
if (auto *II = dyn_cast<InvokeInst>(AI.CB)) {
auto *NBB = II->getNormalDest();
- BranchInst::Create(NBB, AI.CB->getParent());
+ UncondBrInst::Create(NBB, AI.CB->getParent());
A.deleteAfterManifest(*AI.CB);
} else {
A.deleteAfterManifest(*AI.CB);
@@ -12520,13 +12526,13 @@ struct AAIndirectCallInfoCallSite : public AAIndirectCallInfo {
BasicBlock *CBBB = CB->getParent();
A.registerManifestAddedBasicBlock(*ThenTI->getParent());
A.registerManifestAddedBasicBlock(*IP->getParent());
- auto *SplitTI = cast<BranchInst>(LastCmp->getNextNode());
+ auto *SplitTI = cast<CondBrInst>(LastCmp->getNextNode());
BasicBlock *ElseBB;
if (&*IP == CB) {
ElseBB = BasicBlock::Create(ThenTI->getContext(), "",
ThenTI->getFunction(), CBBB);
A.registerManifestAddedBasicBlock(*ElseBB);
- IP = BranchInst::Create(CBBB, ElseBB)->getIterator();
+ IP = UncondBrInst::Create(CBBB, ElseBB)->getIterator();
SplitTI->replaceUsesOfWith(CBBB, ElseBB);
} else {
ElseBB = IP->getParent();
diff --git a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
index a848eac6f3e4c..4c8ffeb198161 100644
--- a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
+++ b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -129,7 +129,7 @@ void CrossDSOCFI::buildCFICheck(Module &M) {
Intrinsic::type_test,
{&Addr,
MetadataAsValue::get(Ctx, ConstantAsMetadata::get(CaseTypeId))});
- BranchInst *BI = IRBTest.CreateCondBr(Test, ExitBB, TrapBB);
+ CondBrInst *BI = IRBTest.CreateCondBr(Test, ExitBB, TrapBB);
BI->setMetadata(LLVMContext::MD_prof, VeryLikelyWeights);
SI->addCase(CaseTypeId, TestBB);
diff --git a/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp
index 7a9eee56d8bb4..3e7e467fc042a 100644
--- a/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/ForceFunctionAttrs.cpp
@@ -45,10 +45,46 @@ static cl::opt<std::string> CSVFilePath(
"add to them in the form of `f1,attr1` or `f2,attr2=str`."));
static bool hasConflictingFnAttr(Attribute::AttrKind Kind, Function &F) {
- if (Kind == Attribute::AlwaysInline)
- return F.hasFnAttribute(Attribute::NoInline);
- if (Kind == Attribute::NoInline)
+ switch (Kind) {
+ case Attribute::AlwaysInline:
+ return F.hasFnAttribute(Attribute::NoInline) ||
+ F.hasFnAttribute(Attribute::OptimizeNone);
+
+ case Attribute::NoInline:
return F.hasFnAttribute(Attribute::AlwaysInline);
+
+ case Attribute::OptimizeNone:
+ return F.hasFnAttribute(Attribute::AlwaysInline) ||
+ F.hasFnAttribute(Attribute::MinSize) ||
+ F.hasFnAttribute(Attribute::OptimizeForSize) ||
+ F.hasFnAttribute(Attribute::OptimizeForDebugging);
+
+ case Attribute::MinSize:
+ return F.hasFnAttribute(Attribute::OptimizeNone) ||
+ F.hasFnAttribute(Attribute::OptimizeForDebugging);
+
+ case Attribute::OptimizeForSize:
+ return F.hasFnAttribute(Attribute::OptimizeNone) ||
+ F.hasFnAttribute(Attribute::OptimizeForDebugging);
+
+ case Attribute::OptimizeForDebugging:
+ return F.hasFnAttribute(Attribute::OptimizeNone) ||
+ F.hasFnAttribute(Attribute::MinSize) ||
+ F.hasFnAttribute(Attribute::OptimizeForSize);
+
+ default:
+ return false;
+ }
+}
+
+static void addRequiredFnAttrs(Attribute::AttrKind Kind, Function &F) {
+ if (Kind == Attribute::OptimizeNone && !F.hasFnAttribute(Attribute::NoInline))
+ F.addFnAttr(Attribute::NoInline);
+}
+
+static bool wouldRemoveRequiredFnAttr(Attribute::AttrKind Kind, Function &F) {
+ if (Kind == Attribute::NoInline && F.hasFnAttribute(Attribute::OptimizeNone))
+ return true;
return false;
}
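A minimal sketch of how the new helpers are meant to compose when forcing optnone, assuming the usual LangRef rule that optnone must be paired with noinline (the wrapper below is illustrative only and not part of the patch):

  static void forceOptNone(Function &F) {
    if (F.hasFnAttribute(Attribute::OptimizeNone) ||
        hasConflictingFnAttr(Attribute::OptimizeNone, F))
      return;
    addRequiredFnAttrs(Attribute::OptimizeNone, F); // adds noinline if missing
    F.addFnAttr(Attribute::OptimizeNone);
  }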
@@ -80,12 +116,14 @@ static void forceAttributes(Function &F) {
if (Kind == Attribute::None || F.hasFnAttribute(Kind) ||
hasConflictingFnAttr(Kind, F))
continue;
+ addRequiredFnAttrs(Kind, F);
F.addFnAttr(Kind);
}
for (const auto &S : ForceRemoveAttributes) {
auto Kind = ParseFunctionAndAttr(S);
- if (Kind == Attribute::None || !F.hasFnAttribute(Kind))
+ if (Kind == Attribute::None || !F.hasFnAttribute(Kind) ||
+ wouldRemoveRequiredFnAttr(Kind, F))
continue;
F.removeFnAttr(Kind);
}
@@ -128,6 +166,7 @@ PreservedAnalyses ForceFunctionAttrsPass::run(Module &M,
!hasConflictingFnAttr(AttrKind, *Func)) {
// TODO: There could be string attributes without a value, we should
// support those, too.
+ addRequiredFnAttrs(AttrKind, *Func);
Func->addFnAttr(AttrKind);
Changed = true;
} else
diff --git a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
index 5a53017f478d6..cd914db8434ef 100644
--- a/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionSpecialization.cpp
@@ -229,8 +229,8 @@ Cost InstCostVisitor::getCodeSizeSavingsForUser(Instruction *User, Value *Use,
Cost CodeSize = 0;
if (auto *I = dyn_cast<SwitchInst>(User)) {
CodeSize = estimateSwitchInst(*I);
- } else if (auto *I = dyn_cast<BranchInst>(User)) {
- CodeSize = estimateBranchInst(*I);
+ } else if (auto *I = dyn_cast<CondBrInst>(User)) {
+ CodeSize = estimateCondBrInst(*I);
} else {
C = visit(*User);
if (!C)
@@ -280,7 +280,7 @@ Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) {
return estimateBasicBlocks(WorkList);
}
-Cost InstCostVisitor::estimateBranchInst(BranchInst &I) {
+Cost InstCostVisitor::estimateCondBrInst(CondBrInst &I) {
assert(LastVisited != KnownConstants.end() && "Invalid iterator!");
if (I.getCondition() != LastVisited->first)
diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index ef4d8eda08f13..fa4a7181e976b 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -221,28 +221,12 @@ OutlinableRegion::findCorrespondingBlockIn(const OutlinableRegion &Other,
static void replaceTargetsFromPHINode(BasicBlock *PHIBlock, BasicBlock *Find,
BasicBlock *Replace,
DenseSet<BasicBlock *> &Included) {
- for (PHINode &PN : PHIBlock->phis()) {
- for (unsigned Idx = 0, PNEnd = PN.getNumIncomingValues(); Idx != PNEnd;
- ++Idx) {
+ for (PHINode &PN : PHIBlock->phis())
+ for (BasicBlock *Incoming : PN.blocks())
// Check if the incoming block is included in the set of blocks being
// outlined.
- BasicBlock *Incoming = PN.getIncomingBlock(Idx);
- if (!Included.contains(Incoming))
- continue;
-
- BranchInst *BI = dyn_cast<BranchInst>(Incoming->getTerminator());
- assert(BI && "Not a branch instruction?");
- // Look over the branching instructions into this block to see if we
- // used to branch to Find in this outlined block.
- for (unsigned Succ = 0, End = BI->getNumSuccessors(); Succ != End;
- Succ++) {
- // If we have found the block to replace, we do so here.
- if (BI->getSuccessor(Succ) != Find)
- continue;
- BI->setSuccessor(Succ, Replace);
- }
- }
- }
+ if (Included.contains(Incoming))
+ Incoming->getTerminator()->replaceSuccessorWith(Find, Replace);
}
@@ -1540,23 +1524,11 @@ static BasicBlock *findOrCreatePHIBlock(OutlinableGroup &Group, Value *RetVal) {
ReturnBB->getParent());
PhiBlockForRetVal->second = PHIBlock;
- // We find the predecessors of the return block in the newly created outlined
- // function in order to point them to the new PHIBlock rather than the already
- // existing return block.
- SmallVector<BranchInst *, 2> BranchesToChange;
- for (BasicBlock *Pred : predecessors(ReturnBB))
- BranchesToChange.push_back(cast<BranchInst>(Pred->getTerminator()));
-
- // Now we mark the branch instructions found, and change the references of the
- // return block to the newly created PHIBlock.
- for (BranchInst *BI : BranchesToChange)
- for (unsigned Succ = 0, End = BI->getNumSuccessors(); Succ < End; Succ++) {
- if (BI->getSuccessor(Succ) != ReturnBB)
- continue;
- BI->setSuccessor(Succ, PHIBlock);
- }
+  // Redirect every use of the return block in the newly created outlined
+  // function to point to the new PHIBlock instead.
+ ReturnBB->replaceAllUsesWith(PHIBlock);
- BranchInst::Create(ReturnBB, PHIBlock);
+ UncondBrInst::Create(ReturnBB, PHIBlock);
return PhiBlockForRetVal->second;
}
@@ -1969,7 +1941,7 @@ std::optional<unsigned> findDuplicateOutputBlock(
BasicBlock::iterator NIt = OutputBB->begin();
for (Instruction &I : *CompBB) {
- if (isa<BranchInst>(&I))
+ if (isa<UncondBrInst, CondBrInst>(&I))
continue;
if (!I.isIdenticalTo(&(*NIt))) {
@@ -2084,7 +2056,7 @@ static void alignOutputBlockWithAggFunc(
LLVM_DEBUG(dbgs() << "Create output block for region in"
<< Region.ExtractedFunction << " to "
<< *NewBB);
- BranchInst::Create(VBBIt->second, NewBB);
+ UncondBrInst::Create(VBBIt->second, NewBB);
OutputStoreBBs.back().insert(std::make_pair(RetValueForBB, NewBB));
}
}
@@ -2246,7 +2218,7 @@ void IROutliner::fillOverallFunction(
DenseMap<Value *, BasicBlock *>::iterator VBBIt =
CurrentGroup.EndBBs.find(VToBB.first);
BasicBlock *EndBB = VBBIt->second;
- BranchInst::Create(EndBB, VToBB.second);
+ UncondBrInst::Create(EndBB, VToBB.second);
OutputStoreBBs.back().insert(VToBB);
}
}
@@ -2379,7 +2351,7 @@ void IROutliner::pruneIncompatibleRegions(
// outlining a call instruction, we ignore it as a space saving.
if (FirstCandidate.getLength() == 2) {
if (isa<CallInst>(FirstCandidate.front()->Inst) &&
- isa<BranchInst>(FirstCandidate.back()->Inst))
+ isa<UncondBrInst, CondBrInst>(FirstCandidate.back()->Inst))
return;
}
@@ -2536,7 +2508,7 @@ static InstructionCost findCostForOutputBlocks(Module &M,
// of the region.
DenseSet<BasicBlock *> FoundBlocks;
for (IRInstructionData &ID : Candidate) {
- if (!isa<BranchInst>(ID.Inst))
+ if (!isa<UncondBrInst, CondBrInst>(ID.Inst))
continue;
for (Value *V : ID.OperVals) {
diff --git a/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/llvm/lib/Transforms/IPO/LoopExtractor.cpp
index 557f1f5a73a13..52efc75d8cca0 100644
--- a/llvm/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/llvm/lib/Transforms/IPO/LoopExtractor.cpp
@@ -187,10 +187,8 @@ bool LoopExtractor::runOnFunction(Function &F) {
bool ShouldExtractLoop = false;
// Extract the loop if the entry block doesn't branch to the loop header.
- Instruction *EntryTI = F.getEntryBlock().getTerminator();
- if (!isa<BranchInst>(EntryTI) ||
- !cast<BranchInst>(EntryTI)->isUnconditional() ||
- EntryTI->getSuccessor(0) != TLL->getHeader()) {
+ auto *EntryTI = dyn_cast<UncondBrInst>(F.getEntryBlock().getTerminator());
+  if (!EntryTI || EntryTI->getSuccessor() != TLL->getHeader()) {
ShouldExtractLoop = true;
} else {
// Check to see if any exits from the loop are more than just return
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index 84dcce749fef7..00991cf73dafa 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -784,11 +784,11 @@ Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
// where nothing happens between the type test and the br.
// If so, create slightly simpler IR.
if (CI->hasOneUse())
- if (auto *Br = dyn_cast<BranchInst>(*CI->user_begin()))
+ if (auto *Br = dyn_cast<CondBrInst>(*CI->user_begin()))
if (CI->getNextNode() == Br) {
BasicBlock *Then = InitialBB->splitBasicBlock(CI->getIterator());
BasicBlock *Else = Br->getSuccessor(1);
- BranchInst *NewBr = BranchInst::Create(Then, Else, OffsetInRange);
+ CondBrInst *NewBr = CondBrInst::Create(OffsetInRange, Then, Else);
NewBr->setMetadata(LLVMContext::MD_prof,
Br->getMetadata(LLVMContext::MD_prof));
ReplaceInstWithInst(InitialBB->getTerminator(), NewBr);
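This hunk is the first place the two new constructors appear side by side, so a hedged sketch of the creation signatures as used throughout the patch (the helper name is hypothetical; the argument order is taken from the calls in these hunks):

  static Instruction *emitBranch(Value *Cond, BasicBlock *IfTrue,
                                 BasicBlock *IfFalse, BasicBlock *AtEnd) {
    if (Cond)
      return CondBrInst::Create(Cond, IfTrue, IfFalse, AtEnd);  // condition first
    return UncondBrInst::Create(IfTrue, AtEnd);                 // single target
  }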
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
index 9c168c9d31b25..5266103d1ac24 100644
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1186,7 +1186,7 @@ struct OpenMPOpt {
cantFail(
OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel));
- BranchInst::Create(SeqAfterBB, SeqAfterIP.getBlock());
+ UncondBrInst::Create(SeqAfterBB, SeqAfterIP.getBlock());
LLVM_DEBUG(dbgs() << TAG << "After sequential inlining " << *OuterFn
<< "\n");
@@ -1260,7 +1260,7 @@ struct OpenMPOpt {
cantFail(OMPInfoCache.OMPBuilder.createParallel(
Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
OMP_PROC_BIND_default, /* IsCancellable */ false));
- BranchInst::Create(AfterBB, AfterIP.getBlock());
+ UncondBrInst::Create(AfterBB, AfterIP.getBlock());
// Perform the actual outlining.
OMPInfoCache.OMPBuilder.finalize(OriginalFn);
@@ -2919,9 +2919,9 @@ struct AAExecutionDomainFunction : public AAExecutionDomain {
// Check if the edge into the successor block contains a condition that only
// lets the main thread execute it.
- static bool isInitialThreadOnlyEdge(Attributor &A, BranchInst *Edge,
+ static bool isInitialThreadOnlyEdge(Attributor &A, CondBrInst *Edge,
BasicBlock &SuccessorBB) {
- if (!Edge || !Edge->isConditional())
+ if (!Edge)
return false;
if (Edge->getSuccessor(0) != &SuccessorBB)
return false;
@@ -3128,7 +3128,7 @@ ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {
if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB))
continue;
bool InitialEdgeOnly = isInitialThreadOnlyEdge(
- A, dyn_cast<BranchInst>(PredBB->getTerminator()), BB);
+ A, dyn_cast<CondBrInst>(PredBB->getTerminator()), BB);
mergeInPredecessor(A, ED, BEDMap[PredBB], InitialEdgeOnly);
}
}
@@ -4060,7 +4060,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
Value *Ident =
OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
- BranchInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL);
+ UncondBrInst::Create(RegionCheckTidBB, ParentBB)->setDebugLoc(DL);
// Add check for Tid in RegionCheckTidBB
RegionCheckTidBB->getTerminator()->eraseFromParent();
@@ -4220,7 +4220,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
ConstantInt::get(ThreadIdInBlock->getType(), 0),
"thread.is_main", InitBB);
IsMainThread->setDebugLoc(DLoc);
- BranchInst::Create(ReturnBB, UserCodeBB, IsMainThread, InitBB);
+ CondBrInst::Create(IsMainThread, ReturnBB, UserCodeBB, InitBB);
}
bool changeToSPMDMode(Attributor &A, ChangeStatus &Changed) {
@@ -4462,7 +4462,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
ConstantInt::getAllOnesValue(KernelInitCB->getType()),
"thread.is_worker", InitBB);
IsWorker->setDebugLoc(DLoc);
- BranchInst::Create(IsWorkerCheckBB, UserCodeEntryBB, IsWorker, InitBB);
+ CondBrInst::Create(IsWorker, IsWorkerCheckBB, UserCodeEntryBB, InitBB);
Module &M = *Kernel->getParent();
FunctionCallee BlockHwSizeFn =
@@ -4486,8 +4486,8 @@ struct AAKernelInfoFunction : AAKernelInfo {
ICmpInst::ICmp, llvm::CmpInst::ICMP_SLT, KernelInitCB, BlockSize,
"thread.is_main_or_worker", IsWorkerCheckBB);
IsMainOrWorker->setDebugLoc(DLoc);
- BranchInst::Create(StateMachineBeginBB, StateMachineFinishedBB,
- IsMainOrWorker, IsWorkerCheckBB);
+ CondBrInst::Create(IsMainOrWorker, StateMachineBeginBB,
+ StateMachineFinishedBB, IsWorkerCheckBB);
// Create local storage for the work function pointer.
const DataLayout &DL = M.getDataLayout();
@@ -4542,13 +4542,12 @@ struct AAKernelInfoFunction : AAKernelInfo {
Constant::getNullValue(VoidPtrTy), "worker.is_done",
StateMachineBeginBB);
IsDone->setDebugLoc(DLoc);
- BranchInst::Create(StateMachineFinishedBB, StateMachineIsActiveCheckBB,
- IsDone, StateMachineBeginBB)
+ CondBrInst::Create(IsDone, StateMachineFinishedBB,
+ StateMachineIsActiveCheckBB, StateMachineBeginBB)
->setDebugLoc(DLoc);
- BranchInst::Create(StateMachineIfCascadeCurrentBB,
- StateMachineDoneBarrierBB, IsActiveWorker,
- StateMachineIsActiveCheckBB)
+ CondBrInst::Create(IsActiveWorker, StateMachineIfCascadeCurrentBB,
+ StateMachineDoneBarrierBB, StateMachineIsActiveCheckBB)
->setDebugLoc(DLoc);
Value *ZeroArg =
@@ -4568,7 +4567,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
StateMachineEndParallelBB);
CallInst::Create(ParallelRegion, {ZeroArg, GTid}, "", PRExecuteBB)
->setDebugLoc(DLoc);
- BranchInst::Create(StateMachineEndParallelBB, PRExecuteBB)
+ UncondBrInst::Create(StateMachineEndParallelBB, PRExecuteBB)
->setDebugLoc(DLoc);
BasicBlock *PRNextBB =
@@ -4590,7 +4589,7 @@ struct AAKernelInfoFunction : AAKernelInfo {
IsPR = ConstantInt::getTrue(Ctx);
}
- BranchInst::Create(PRExecuteBB, PRNextBB, IsPR,
+ CondBrInst::Create(IsPR, PRExecuteBB, PRNextBB,
StateMachineIfCascadeCurrentBB)
->setDebugLoc(DLoc);
StateMachineIfCascadeCurrentBB = PRNextBB;
@@ -4606,8 +4605,8 @@ struct AAKernelInfoFunction : AAKernelInfo {
StateMachineIfCascadeCurrentBB)
->setDebugLoc(DLoc);
}
- BranchInst::Create(StateMachineEndParallelBB,
- StateMachineIfCascadeCurrentBB)
+ UncondBrInst::Create(StateMachineEndParallelBB,
+ StateMachineIfCascadeCurrentBB)
->setDebugLoc(DLoc);
FunctionCallee EndParallelFn =
@@ -4617,12 +4616,12 @@ struct AAKernelInfoFunction : AAKernelInfo {
CallInst::Create(EndParallelFn, {}, "", StateMachineEndParallelBB);
OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);
EndParallel->setDebugLoc(DLoc);
- BranchInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB)
+ UncondBrInst::Create(StateMachineDoneBarrierBB, StateMachineEndParallelBB)
->setDebugLoc(DLoc);
CallInst::Create(BarrierFn, {Ident, GTid}, "", StateMachineDoneBarrierBB)
->setDebugLoc(DLoc);
- BranchInst::Create(StateMachineBeginBB, StateMachineDoneBarrierBB)
+ UncondBrInst::Create(StateMachineBeginBB, StateMachineDoneBarrierBB)
->setDebugLoc(DLoc);
return true;
diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp
index 1a00d173d3ae0..9eff391a3c6b8 100644
--- a/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -513,8 +513,8 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(
std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function &F) const {
BasicBlock *EntryBlock = &F.front();
- BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
- if (!BR || BR->isUnconditional())
+ CondBrInst *BR = dyn_cast<CondBrInst>(EntryBlock->getTerminator());
+ if (!BR)
return std::unique_ptr<FunctionOutliningInfo>();
// Returns true if Succ is BB's successor
@@ -661,10 +661,8 @@ static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
return true;
// Now check if any of the entry block has MD_prof data:
for (auto *E : OI.Entries) {
- BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
- if (!BR || BR->isUnconditional())
- continue;
- if (hasBranchWeightMD(*BR))
+ CondBrInst *BR = dyn_cast<CondBrInst>(E->getTerminator());
+ if (BR && hasBranchWeightMD(*BR))
return true;
}
return false;
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index b2abc735046d2..e7d53a862755e 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -663,7 +663,7 @@ ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
// Ignore all intrinsics, phinodes and branch instructions.
// Branch and phinodes instruction usually contains debug info from sources
// outside of the residing basic block, thus we ignore them during annotation.
- if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
+ if (isa<UncondBrInst, CondBrInst, IntrinsicInst, PHINode>(Inst))
return std::error_code();
// For non-CS profile, if a direct call/invoke instruction is inlined in
@@ -1687,7 +1687,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
Instruction *TI = BB->getTerminator();
if (TI->getNumSuccessors() == 1)
continue;
- if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI) &&
+ if (!isa<CondBrInst>(TI) && !isa<SwitchInst>(TI) &&
!isa<IndirectBrInst>(TI))
continue;
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index e057761256775..019b5ee6ac2f9 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -489,7 +489,7 @@ struct VirtualCallSite {
emitRemark(OptName, TargetName, OREGetter);
CB.replaceAllUsesWith(New);
if (auto *II = dyn_cast<InvokeInst>(&CB)) {
- BranchInst::Create(II->getNormalDest(), CB.getIterator());
+ UncondBrInst::Create(II->getNormalDest(), CB.getIterator());
II->getUnwindDest()->removePredecessor(II->getParent());
}
CB.eraseFromParent();
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 8eeeccbc86523..c781c6978b275 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -2323,17 +2323,24 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
Constant *C2;
// C-(X+C2) --> (C-C2)-X
- if (match(Op1, m_Add(m_Value(X), m_ImmConstant(C2)))) {
+ if (match(Op1, m_AddLike(m_Value(X), m_ImmConstant(C2)))) {
// C-C2 never overflow, and C-(X+C2), (X+C2) has NSW/NUW
// => (C-C2)-X can have NSW/NUW
bool WillNotSOV = willNotOverflowSignedSub(C, C2, I);
BinaryOperator *Res =
BinaryOperator::CreateSub(ConstantExpr::getSub(C, C2), X);
- auto *OBO1 = cast<OverflowingBinaryOperator>(Op1);
- Res->setHasNoSignedWrap(I.hasNoSignedWrap() && OBO1->hasNoSignedWrap() &&
- WillNotSOV);
- Res->setHasNoUnsignedWrap(I.hasNoUnsignedWrap() &&
- OBO1->hasNoUnsignedWrap());
+
+      // An 'or disjoint' is equivalent to 'add nuw nsw'.
+ bool Op1NSW = true;
+ bool Op1NUW = true;
+
+ if (auto *OBO1 = dyn_cast<OverflowingBinaryOperator>(Op1)) {
+ Op1NSW = OBO1->hasNoSignedWrap();
+ Op1NUW = OBO1->hasNoUnsignedWrap();
+ }
+
+ Res->setHasNoSignedWrap(I.hasNoSignedWrap() && Op1NSW && WillNotSOV);
+ Res->setHasNoUnsignedWrap(I.hasNoUnsignedWrap() && Op1NUW);
return Res;
}
}
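A quick worked instance of the fold above, plus the 'or disjoint' case that the m_AddLike matcher now admits (constants chosen for illustration):

  // 10 - (x + 3)  ==>  (10 - 3) - x  ==  7 - x
  int subOfAdd(int x) { return 7 - x; }
  // (x | 4) with bit 2 of x known zero is an 'or disjoint', i.e. it behaves as
  // x + 4 with nuw/nsw, so 10 - (x | 4) folds the same way to 6 - x.
  int subOfDisjointOr(int x /* bit 2 == 0 */) { return 6 - x; }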
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 10caf4ab78a5a..dc860700db91b 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -75,7 +75,7 @@ static bool subWithOverflow(APInt &Result, const APInt &In1, const APInt &In2,
/// branch on sign bit comparison.
static bool hasBranchUse(ICmpInst &I) {
for (auto *U : I.users())
- if (isa<BranchInst>(U))
+ if (isa<CondBrInst>(U))
return true;
return false;
}
@@ -1419,7 +1419,7 @@ Instruction *InstCombinerImpl::foldICmpWithDominatingICmp(ICmpInst &Cmp) {
return nullptr;
};
- for (BranchInst *BI : DC.conditionsFor(X)) {
+ for (CondBrInst *BI : DC.conditionsFor(X)) {
CmpPredicate DomPred;
const APInt *DomC;
if (!match(BI->getCondition(),
@@ -6836,8 +6836,8 @@ static bool isChainSelectCmpBranch(const SelectInst *SI) {
const BasicBlock *BB = SI->getParent();
if (!BB)
return false;
- auto *BI = dyn_cast_or_null<BranchInst>(BB->getTerminator());
- if (!BI || BI->getNumSuccessors() != 2)
+ auto *BI = dyn_cast_or_null<CondBrInst>(BB->getTerminator());
+ if (!BI)
return false;
auto *IC = dyn_cast<ICmpInst>(BI->getCondition());
if (!IC || (IC->getOperand(0) != SI && IC->getOperand(1) != SI))
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 277f81245ade2..769ad433650b4 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1571,10 +1571,9 @@ bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) {
if (StoreBB == DestBB || OtherBB == DestBB)
return false;
- // Verify that the other block ends in a branch and is not otherwise empty.
+ // Verify that the other block is not empty apart from the terminator.
BasicBlock::iterator BBI(OtherBB->getTerminator());
- BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
- if (!OtherBr || BBI == OtherBB->begin())
+ if (BBI == OtherBB->begin())
return false;
auto OtherStoreIsMergeable = [&](StoreInst *OtherStore) -> bool {
@@ -1591,7 +1590,7 @@ bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) {
// If the other block ends in an unconditional branch, check for the 'if then
// else' case. There is an instruction before the branch.
StoreInst *OtherStore = nullptr;
- if (OtherBr->isUnconditional()) {
+ if (isa<UncondBrInst>(BBI)) {
--BBI;
// Skip over debugging info and pseudo probes.
while (BBI->isDebugOrPseudoInst()) {
@@ -1604,7 +1603,7 @@ bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) {
OtherStore = dyn_cast<StoreInst>(BBI);
if (!OtherStoreIsMergeable(OtherStore))
return false;
- } else {
+ } else if (auto *OtherBr = dyn_cast<CondBrInst>(BBI)) {
// Otherwise, the other block ended with a conditional branch. If one of the
// destinations is StoreBB, then we have the if/then case.
if (OtherBr->getSuccessor(0) != StoreBB &&
@@ -1634,7 +1633,8 @@ bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) {
if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory())
return false;
}
- }
+ } else
+ return false;
// Insert a PHI node now if we need it.
Value *MergedVal = OtherStore->getValueOperand();
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 026f05164bf5e..44bd4d0e0f6b7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -1312,10 +1312,7 @@ static Value *simplifyUsingControlFlow(InstCombiner &Self, PHINode &PN,
SuccForValue[C] = Succ;
++SuccCount[Succ];
};
- if (auto *BI = dyn_cast<BranchInst>(IDom->getTerminator())) {
- if (BI->isUnconditional())
- return nullptr;
-
+ if (auto *BI = dyn_cast<CondBrInst>(IDom->getTerminator())) {
Cond = BI->getCondition();
AddSucc(ConstantInt::getTrue(Context), BI->getSuccessor(0));
AddSucc(ConstantInt::getFalse(Context), BI->getSuccessor(1));
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 3b034f6c37f66..6b450cc7d9b33 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1168,8 +1168,8 @@ Instruction *InstCombinerImpl::foldAggregateConstructionIntoAggregateReuse(
} else {
// If UseBB is the single successor of Pred, we can add InsertValue to
// Pred.
- auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
- if (!BI || !BI->isUnconditional())
+ auto *BI = dyn_cast<UncondBrInst>(Pred->getTerminator());
+ if (!BI)
return nullptr;
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 915cb207ee053..e1b5819a64dfe 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1446,7 +1446,7 @@ void InstCombinerImpl::freelyInvertAllUsersOf(Value *I, Value *IgnoredUser) {
break;
}
case Instruction::CondBr: {
- BranchInst *BI = cast<BranchInst>(U);
+ CondBrInst *BI = cast<CondBrInst>(U);
BI->swapSuccessors(); // swaps prof metadata too
if (BPI)
BPI->swapSuccEdgesProbabilities(BI->getParent());
@@ -1855,9 +1855,9 @@ static Value *simplifyInstructionWithPHI(Instruction &I, PHINode *PN,
// Check if incoming PHI value can be replaced with constant
// based on implied condition.
- BranchInst *TerminatorBI = dyn_cast<BranchInst>(InBB->getTerminator());
+ CondBrInst *TerminatorBI = dyn_cast<CondBrInst>(InBB->getTerminator());
const ICmpInst *ICmp = dyn_cast<ICmpInst>(&I);
- if (TerminatorBI && TerminatorBI->isConditional() &&
+ if (TerminatorBI &&
TerminatorBI->getSuccessor(0) != TerminatorBI->getSuccessor(1) && ICmp) {
bool LHSIsTrue = TerminatorBI->getSuccessor(0) == PN->getParent();
std::optional<bool> ImpliedCond = isImpliedCondition(
@@ -2015,8 +2015,8 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN,
// be inserting the computation on some other paths (e.g. inside a loop).
// Only do this if the pred block is unconditionally branching into the phi
// block. Also, make sure that the pred block is not dead code.
- BranchInst *BI = dyn_cast<BranchInst>(InBB->getTerminator());
- if (!BI || !BI->isUnconditional() || !DT.isReachableFromEntry(InBB))
+ UncondBrInst *BI = dyn_cast<UncondBrInst>(InBB->getTerminator());
+ if (!BI || !DT.isReachableFromEntry(InBB))
return nullptr;
NewPhiValues.push_back(nullptr);
@@ -2270,9 +2270,8 @@ Instruction *InstCombinerImpl::foldBinopWithPhiOperands(BinaryOperator &BO) {
// The block that we are hoisting to must reach here unconditionally.
// Otherwise, we could be speculatively executing an expensive or
// non-speculative op.
- auto *PredBlockBranch = dyn_cast<BranchInst>(OtherBB->getTerminator());
- if (!PredBlockBranch || PredBlockBranch->isConditional() ||
- !DT.isReachableFromEntry(OtherBB))
+ auto *PredBlockBranch = dyn_cast<UncondBrInst>(OtherBB->getTerminator());
+ if (!PredBlockBranch || !DT.isReachableFromEntry(OtherBB))
return nullptr;
// TODO: This check could be tightened to only apply to binops (div/rem) that
@@ -6080,7 +6079,7 @@ bool InstCombinerImpl::prepareWorklist(Function &F) {
// If this is a branch or switch on a constant, mark only the single
// live successor. Otherwise assume all successors are live.
Instruction *TI = BB->getTerminator();
- if (BranchInst *BI = dyn_cast<BranchInst>(TI); BI && BI->isConditional()) {
+ if (CondBrInst *BI = dyn_cast<CondBrInst>(TI)) {
if (isa<UndefValue>(BI->getCondition())) {
// Branch on undef is UB.
HandleOnlyLiveSuccessor(BB, nullptr);
@@ -6146,11 +6145,11 @@ bool InstCombinerImpl::prepareWorklist(Function &F) {
void InstCombiner::computeBackEdges() {
// Collect backedges.
- SmallPtrSet<BasicBlock *, 16> Visited;
+ SmallVector<bool> Visited(F.getMaxBlockNumber());
for (BasicBlock *BB : RPOT) {
- Visited.insert(BB);
+ Visited[BB->getNumber()] = true;
for (BasicBlock *Succ : successors(BB))
- if (Visited.contains(Succ))
+ if (Visited[Succ->getNumber()])
BackEdges.insert({BB, Succ});
}
ComputedBackEdges = true;
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 1fd9e37c69975..36d9b34e4383e 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1997,8 +1997,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
// path is rarely taken. This seems to be the case for SPEC benchmarks.
Instruction *CheckTerm = SplitBlockAndInsertIfThen(
Cmp, InsertBefore, false, MDBuilder(*C).createUnlikelyBranchWeights());
- assert(cast<BranchInst>(CheckTerm)->isUnconditional());
- BasicBlock *NextBB = CheckTerm->getSuccessor(0);
+ BasicBlock *NextBB = cast<UncondBrInst>(CheckTerm)->getSuccessor();
IRB.SetInsertPoint(CheckTerm);
Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeStoreSize);
if (Recover) {
@@ -2007,7 +2006,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
BasicBlock *CrashBlock =
BasicBlock::Create(*C, "", NextBB->getParent(), NextBB);
CrashTerm = new UnreachableInst(*C, CrashBlock);
- BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2);
+ CondBrInst *NewTerm = CondBrInst::Create(Cmp2, CrashBlock, NextBB);
ReplaceInstWithInst(CheckTerm, NewTerm);
}
} else {
diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
index 9c1d0daeff99f..fb7e58f4632ef 100644
--- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -167,12 +167,12 @@ static void insertBoundsCheck(Value *Or, BuilderTy &IRB, GetTrapBBT GetTrapBB) {
// If we have a constant zero, unconditionally branch.
// FIXME: We should really handle this differently to bypass the splitting
// the block.
- BranchInst::Create(TrapBB, OldBB);
+ UncondBrInst::Create(TrapBB, OldBB);
return;
}
// Create the conditional branch.
- BranchInst::Create(TrapBB, Cont, Or, OldBB);
+ CondBrInst::Create(Or, TrapBB, Cont, OldBB);
}
static std::string
diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index 11e028f1feb3b..faf0e7debb3c1 100644
--- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -342,12 +342,12 @@ class CHR {
BasicBlock *ExitBlock,
Region *LastRegion,
ValueToValueMapTy &VMap);
- BranchInst *createMergedBranch(BasicBlock *PreEntryBlock,
+ CondBrInst *createMergedBranch(BasicBlock *PreEntryBlock,
BasicBlock *EntryBlock,
BasicBlock *NewEntryBlock,
ValueToValueMapTy &VMap);
void fixupBranchesAndSelects(CHRScope *Scope, BasicBlock *PreEntryBlock,
- BranchInst *MergedBR, uint64_t ProfileCount);
+ CondBrInst *MergedBR, uint64_t ProfileCount);
void fixupBranch(Region *R, CHRScope *Scope, IRBuilder<> &IRB,
Value *&MergedCondition, BranchProbability &CHRBranchBias);
void fixupSelect(SelectInst *SI, CHRScope *Scope, IRBuilder<> &IRB,
@@ -624,12 +624,11 @@ static bool checkBias(K *Key, BranchProbability TrueProb,
// Returns true and insert a region into the right biased set and the map if the
// branch of the region is biased.
-static bool checkBiasedBranch(BranchInst *BI, Region *R,
- DenseSet<Region *> &TrueBiasedRegionsGlobal,
- DenseSet<Region *> &FalseBiasedRegionsGlobal,
- DenseMap<Region *, BranchProbability> &BranchBiasMap) {
- if (!BI->isConditional())
- return false;
+static bool
+checkBiasedBranch(CondBrInst *BI, Region *R,
+ DenseSet<Region *> &TrueBiasedRegionsGlobal,
+ DenseSet<Region *> &FalseBiasedRegionsGlobal,
+ DenseMap<Region *, BranchProbability> &BranchBiasMap) {
BranchProbability ThenProb, ElseProb;
if (!extractBranchProbabilities(BI, ThenProb, ElseProb))
return false;
@@ -774,12 +773,8 @@ CHRScope * CHR::findScope(Region *R) {
// if (cond) {
// ...
// }
- auto *BI = dyn_cast<BranchInst>(Entry->getTerminator());
- if (BI)
- CHR_DEBUG(dbgs() << "BI.isConditional " << BI->isConditional() << "\n");
- else
- CHR_DEBUG(dbgs() << "BI null\n");
- if (BI && BI->isConditional()) {
+ if (auto *BI = dyn_cast<CondBrInst>(Entry->getTerminator())) {
+ CHR_DEBUG(dbgs() << "BI conditional\n");
BasicBlock *S0 = BI->getSuccessor(0);
BasicBlock *S1 = BI->getSuccessor(1);
CHR_DEBUG(dbgs() << "S0 " << S0->getName() << "\n");
@@ -890,8 +885,8 @@ void CHR::checkScopeHoistable(CHRScope *Scope) {
RegInfo &RI = Scope->RegInfos[0];
Region *R = RI.R;
BasicBlock *EntryBB = R->getEntry();
- auto *Branch = RI.HasBranch ?
- cast<BranchInst>(EntryBB->getTerminator()) : nullptr;
+ auto *Branch =
+ RI.HasBranch ? cast<CondBrInst>(EntryBB->getTerminator()) : nullptr;
SmallVector<SelectInst *, 8> &Selects = RI.Selects;
if (RI.HasBranch || !Selects.empty()) {
Instruction *InsertPoint = getBranchInsertPoint(RI);
@@ -1038,7 +1033,7 @@ CHRScope * CHR::findScopes(Region *R, Region *NextRegion, Region *ParentRegion,
static DenseSet<Value *> getCHRConditionValuesForRegion(RegInfo &RI) {
DenseSet<Value *> ConditionValues;
if (RI.HasBranch) {
- auto *BI = cast<BranchInst>(RI.R->getEntry()->getTerminator());
+ auto *BI = cast<CondBrInst>(RI.R->getEntry()->getTerminator());
ConditionValues.insert(BI->getCondition());
}
for (SelectInst *SI : RI.Selects) {
@@ -1395,7 +1390,7 @@ void CHR::setCHRRegions(CHRScope *Scope, CHRScope *OutermostScope) {
assert((OutermostScope->TrueBiasedRegions.contains(R) ||
OutermostScope->FalseBiasedRegions.contains(R)) &&
"Must be truthy or falsy");
- auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
+ auto *BI = cast<CondBrInst>(R->getEntry()->getTerminator());
// Note checkHoistValue fills in HoistStops.
DenseMap<Instruction *, bool> Visited;
bool IsHoistable = checkHoistValue(BI->getCondition(), InsertPoint, DT,
@@ -1495,7 +1490,7 @@ static void hoistScopeConditions(CHRScope *Scope, Instruction *HoistPoint,
bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
bool IsFalseBiased = Scope->FalseBiasedRegions.count(R);
if (RI.HasBranch && (IsTrueBiased || IsFalseBiased)) {
- auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
+ auto *BI = cast<CondBrInst>(R->getEntry()->getTerminator());
hoistValue(BI->getCondition(), HoistPoint, R, Scope->HoistStopMap,
HoistedSet, TrivialPHIs, DT);
}
@@ -1518,7 +1513,7 @@ static bool negateICmpIfUsedByBranchOrSelectOnly(ICmpInst *ICmp,
for (User *U : ICmp->users()) {
if (U == ExcludedUser)
continue;
- if (isa<BranchInst>(U) && cast<BranchInst>(U)->isConditional())
+ if (isa<CondBrInst>(U))
continue;
if (isa<SelectInst>(U) && cast<SelectInst>(U)->getCondition() == ICmp)
continue;
@@ -1527,8 +1522,7 @@ static bool negateICmpIfUsedByBranchOrSelectOnly(ICmpInst *ICmp,
for (User *U : ICmp->users()) {
if (U == ExcludedUser)
continue;
- if (auto *BI = dyn_cast<BranchInst>(U)) {
- assert(BI->isConditional() && "Must be conditional");
+ if (auto *BI = dyn_cast<CondBrInst>(U)) {
BI->swapSuccessors();
// Don't need to swap this in terms of
// TrueBiasedRegions/FalseBiasedRegions because true-based/false-based
@@ -1671,7 +1665,7 @@ assertBranchOrSelectConditionHoisted(CHRScope *Scope,
bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
bool IsFalseBiased = Scope->FalseBiasedRegions.count(R);
if (RI.HasBranch && (IsTrueBiased || IsFalseBiased)) {
- auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
+ auto *BI = cast<CondBrInst>(R->getEntry()->getTerminator());
Value *V = BI->getCondition();
CHR_DEBUG(dbgs() << *V << "\n");
if (auto *I = dyn_cast<Instruction>(V)) {
@@ -1775,8 +1769,8 @@ void CHR::transformScopes(CHRScope *Scope, DenseSet<PHINode *> &TrivialPHIs) {
// Replace the old (placeholder) branch with the new (merged) conditional
// branch.
- BranchInst *MergedBr = createMergedBranch(PreEntryBlock, EntryBlock,
- NewEntryBlock, VMap);
+ CondBrInst *MergedBr =
+ createMergedBranch(PreEntryBlock, EntryBlock, NewEntryBlock, VMap);
#ifndef NDEBUG
assertCHRRegionsHaveBiasedBranchOrSelect(Scope);
@@ -1857,12 +1851,12 @@ void CHR::cloneScopeBlocks(CHRScope *Scope,
// A helper for transformScope. Replace the old (placeholder) branch with the
// new (merged) conditional branch.
-BranchInst *CHR::createMergedBranch(BasicBlock *PreEntryBlock,
+CondBrInst *CHR::createMergedBranch(BasicBlock *PreEntryBlock,
BasicBlock *EntryBlock,
BasicBlock *NewEntryBlock,
ValueToValueMapTy &VMap) {
- BranchInst *OldBR = cast<BranchInst>(PreEntryBlock->getTerminator());
- assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == NewEntryBlock &&
+ UncondBrInst *OldBR = cast<UncondBrInst>(PreEntryBlock->getTerminator());
+ assert(OldBR->getSuccessor() == NewEntryBlock &&
"SplitBlock did not work correctly!");
assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
"NewEntryBlock's only pred must be EntryBlock");
@@ -1872,9 +1866,9 @@ BranchInst *CHR::createMergedBranch(BasicBlock *PreEntryBlock,
OldBR->eraseFromParent();
// The true predicate is a placeholder. It will be replaced later in
// fixupBranchesAndSelects().
- BranchInst *NewBR = BranchInst::Create(NewEntryBlock,
- cast<BasicBlock>(VMap[NewEntryBlock]),
- ConstantInt::getTrue(F.getContext()));
+ CondBrInst *NewBR =
+ CondBrInst::Create(ConstantInt::getTrue(F.getContext()), NewEntryBlock,
+ cast<BasicBlock>(VMap[NewEntryBlock]));
NewBR->insertInto(PreEntryBlock, PreEntryBlock->end());
assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
"NewEntryBlock's only pred must be EntryBlock");
@@ -1883,10 +1877,8 @@ BranchInst *CHR::createMergedBranch(BasicBlock *PreEntryBlock,
// A helper for transformScopes. Create the combined branch condition and
// constant-fold the branches/selects in the hot path.
-void CHR::fixupBranchesAndSelects(CHRScope *Scope,
- BasicBlock *PreEntryBlock,
- BranchInst *MergedBR,
- uint64_t ProfileCount) {
+void CHR::fixupBranchesAndSelects(CHRScope *Scope, BasicBlock *PreEntryBlock,
+ CondBrInst *MergedBR, uint64_t ProfileCount) {
Value *MergedCondition = ConstantInt::getTrue(F.getContext());
BranchProbability CHRBranchBias(1, 1);
uint64_t NumCHRedBranches = 0;
@@ -1932,7 +1924,7 @@ void CHR::fixupBranch(Region *R, CHRScope *Scope,
bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
assert((IsTrueBiased || Scope->FalseBiasedRegions.count(R)) &&
"Must be truthy or falsy");
- auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
+ auto *BI = cast<CondBrInst>(R->getEntry()->getTerminator());
assert(BranchBiasMap.contains(R) && "Must be in the bias map");
BranchProbability Bias = BranchBiasMap[R];
assert(Bias >= getCHRBiasThreshold() && "Must be highly biased");
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index 894e2e6b4a0a9..648df1f545f01 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -836,7 +836,7 @@ class DFSanVisitor : public InstVisitor<DFSanVisitor> {
void visitSelectInst(SelectInst &I);
void visitMemSetInst(MemSetInst &I);
void visitMemTransferInst(MemTransferInst &I);
- void visitBranchInst(BranchInst &BR);
+ void visitCondBrInst(CondBrInst &BR);
void visitSwitchInst(SwitchInst &SW);
private:
@@ -1790,7 +1790,7 @@ bool DataFlowSanitizer::runImpl(
Value *PrimitiveShadow = DFSF.collapseToPrimitiveShadow(V, Pos);
Value *Ne =
IRB.CreateICmpNE(PrimitiveShadow, DFSF.DFS.ZeroPrimitiveShadow);
- BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
+ UncondBrInst *BI = cast<UncondBrInst>(SplitBlockAndInsertIfThen(
Ne, Pos, /*Unreachable=*/false, ColdCallWeights));
IRBuilder<> ThenIRB(BI);
ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn, {});
@@ -2976,10 +2976,7 @@ void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) {
}
}
-void DFSanVisitor::visitBranchInst(BranchInst &BR) {
- if (!BR.isConditional())
- return;
-
+void DFSanVisitor::visitCondBrInst(CondBrInst &BR) {
DFSF.addConditionalCallbacksIfEnabled(BR, BR.getCondition());
}
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index f7f7a26d39dbd..2fbdc3adb32d6 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1123,8 +1123,8 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite,
}
IRB.CreateCall(Asm, TCI.PtrLong);
if (Recover)
- cast<BranchInst>(CheckFailTerm)
- ->setSuccessor(0, TCI.TagMismatchTerm->getParent());
+ cast<UncondBrInst>(CheckFailTerm)
+ ->setSuccessor(TCI.TagMismatchTerm->getParent());
}
bool HWAddressSanitizer::ignoreMemIntrinsic(OptimizationRemarkEmitter &ORE,
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 991dd2e2d4db7..f7d383a26dcab 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2620,15 +2620,22 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// S = (S1 & S2) | (V1 & S2) | (S1 & V2)
Value *handleBitwiseAnd(IRBuilder<> &IRB, Value *V1, Value *V2, Value *S1,
Value *S2) {
+ // "The two arguments to the ‘and’ instruction must be integer or vector
+ // of integer values. Both arguments must have identical types."
+ //
+ // We enforce this condition for all callers to handleBitwiseAnd(); callers
+ // with non-integer types should call CreateAppToShadowCast() themselves.
+ assert(V1->getType()->isIntOrIntVectorTy());
+ assert(V1->getType() == V2->getType());
+
+ // Conveniently, getShadowTy() of Int/IntVector returns the original type.
+ assert(V1->getType() == S1->getType());
+ assert(V2->getType() == S2->getType());
+
Value *S1S2 = IRB.CreateAnd(S1, S2);
Value *V1S2 = IRB.CreateAnd(V1, S2);
Value *S1V2 = IRB.CreateAnd(S1, V2);
- if (V1->getType() != S1->getType()) {
- V1 = IRB.CreateIntCast(V1, S1->getType(), false);
- V2 = IRB.CreateIntCast(V2, S2->getType(), false);
- }
-
return IRB.CreateOr({S1S2, V1S2, S1V2});
}
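A scalar sketch of the propagation rule quoted above, S = (S1 & S2) | (V1 & S2) | (S1 & V2): a result bit is poisoned exactly when a poisoned operand bit is ANDed with something that is not a known zero.

  unsigned andShadow(unsigned V1, unsigned S1, unsigned V2, unsigned S2) {
    // Both bits poisoned, or a 1-valued bit meets a poisoned bit.
    return (S1 & S2) | (V1 & S2) | (S1 & V2);
  }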
@@ -2662,10 +2669,15 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *S2 = getShadow(&I, 1);
Value *V1 = I.getOperand(0);
Value *V2 = I.getOperand(1);
- if (V1->getType() != S1->getType()) {
- V1 = IRB.CreateIntCast(V1, S1->getType(), false);
- V2 = IRB.CreateIntCast(V2, S2->getType(), false);
- }
+
+ // "The two arguments to the ‘or’ instruction must be integer or vector
+ // of integer values. Both arguments must have identical types."
+ assert(V1->getType()->isIntOrIntVectorTy());
+ assert(V1->getType() == V2->getType());
+
+ // Conveniently, getShadowTy() of Int/IntVector returns the original type.
+ assert(V1->getType() == S1->getType());
+ assert(V2->getType() == S2->getType());
Value *NotV1 = IRB.CreateNot(V1);
Value *NotV2 = IRB.CreateNot(V2);
diff --git a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
index 1581ae4f3ce7e..5f895ad33d065 100644
--- a/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/NumericalStabilitySanitizer.cpp
@@ -1398,14 +1398,11 @@ const KnownIntrinsic::WidenedIntrinsic KnownIntrinsic::kWidenedIntrinsics[] = {
{"llvm.log2.f64", Intrinsic::log2, makeX86FP80X86FP80},
{"llvm.log2.f80", Intrinsic::log2, makeX86FP80X86FP80},
{"llvm.fma.f32", Intrinsic::fma, makeDoubleDoubleDoubleDouble},
-
- {"llvm.fmuladd.f32", Intrinsic::fmuladd, makeDoubleDoubleDoubleDouble},
-
{"llvm.fma.f64", Intrinsic::fma, makeX86FP80X86FP80X86FP80X86FP80},
-
- {"llvm.fmuladd.f64", Intrinsic::fma, makeX86FP80X86FP80X86FP80X86FP80},
-
{"llvm.fma.f80", Intrinsic::fma, makeX86FP80X86FP80X86FP80X86FP80},
+ {"llvm.fmuladd.f32", Intrinsic::fmuladd, makeDoubleDoubleDoubleDouble},
+ {"llvm.fmuladd.f64", Intrinsic::fmuladd, makeX86FP80X86FP80X86FP80X86FP80},
+ {"llvm.fmuladd.f80", Intrinsic::fmuladd, makeX86FP80X86FP80X86FP80X86FP80},
{"llvm.fabs.f32", Intrinsic::fabs, makeDoubleDouble},
{"llvm.fabs.f64", Intrinsic::fabs, makeX86FP80X86FP80},
{"llvm.fabs.f80", Intrinsic::fabs, makeX86FP80X86FP80},
@@ -1438,22 +1435,10 @@ const KnownIntrinsic::WidenedIntrinsic KnownIntrinsic::kWidenedIntrinsics[] = {
{"llvm.rint.f80", Intrinsic::rint, makeX86FP80X86FP80},
{"llvm.nearbyint.f32", Intrinsic::nearbyint, makeDoubleDouble},
{"llvm.nearbyint.f64", Intrinsic::nearbyint, makeX86FP80X86FP80},
- {"llvm.nearbyin80f64", Intrinsic::nearbyint, makeX86FP80X86FP80},
+ {"llvm.nearbyint.f80", Intrinsic::nearbyint, makeX86FP80X86FP80},
{"llvm.round.f32", Intrinsic::round, makeDoubleDouble},
{"llvm.round.f64", Intrinsic::round, makeX86FP80X86FP80},
{"llvm.round.f80", Intrinsic::round, makeX86FP80X86FP80},
- {"llvm.lround.f32", Intrinsic::lround, makeDoubleDouble},
- {"llvm.lround.f64", Intrinsic::lround, makeX86FP80X86FP80},
- {"llvm.lround.f80", Intrinsic::lround, makeX86FP80X86FP80},
- {"llvm.llround.f32", Intrinsic::llround, makeDoubleDouble},
- {"llvm.llround.f64", Intrinsic::llround, makeX86FP80X86FP80},
- {"llvm.llround.f80", Intrinsic::llround, makeX86FP80X86FP80},
- {"llvm.lrint.f32", Intrinsic::lrint, makeDoubleDouble},
- {"llvm.lrint.f64", Intrinsic::lrint, makeX86FP80X86FP80},
- {"llvm.lrint.f80", Intrinsic::lrint, makeX86FP80X86FP80},
- {"llvm.llrint.f32", Intrinsic::llrint, makeDoubleDouble},
- {"llvm.llrint.f64", Intrinsic::llrint, makeX86FP80X86FP80},
- {"llvm.llrint.f80", Intrinsic::llrint, makeX86FP80X86FP80},
};
const KnownIntrinsic::LFEntry KnownIntrinsic::kLibfuncIntrinsics[] = {
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 02f06bebb8f0d..0232d45e5b7bb 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -407,8 +407,8 @@ class FunctionInstrumenter final {
// Return a string describing the branch condition that can be
// used in static branch probability heuristics:
static std::string getBranchCondString(Instruction *TI) {
- BranchInst *BI = dyn_cast<BranchInst>(TI);
- if (!BI || !BI->isConditional())
+ CondBrInst *BI = dyn_cast<CondBrInst>(TI);
+ if (!BI)
return std::string();
Value *Cond = BI->getCondition();
@@ -1692,7 +1692,7 @@ void PGOUseFunc::setBranchWeights() {
Instruction *TI = BB.getTerminator();
if (TI->getNumSuccessors() < 2)
continue;
- if (!(isa<BranchInst>(TI) || isa<SwitchInst>(TI) ||
+ if (!(isa<CondBrInst>(TI) || isa<SwitchInst>(TI) ||
isa<IndirectBrInst>(TI) || isa<InvokeInst>(TI) ||
isa<CallBrInst>(TI)))
continue;
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 7f061e2259711..df9675a02824e 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -35,6 +35,7 @@
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/EscapeEnumerator.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
using namespace llvm;
@@ -43,6 +44,8 @@ using namespace llvm;
const char SanCovTracePCIndirName[] = "__sanitizer_cov_trace_pc_indir";
const char SanCovTracePCName[] = "__sanitizer_cov_trace_pc";
+const char SanCovTracePCEntryName[] = "__sanitizer_cov_trace_pc_entry";
+const char SanCovTracePCExitName[] = "__sanitizer_cov_trace_pc_exit";
const char SanCovTraceCmp1[] = "__sanitizer_cov_trace_cmp1";
const char SanCovTraceCmp2[] = "__sanitizer_cov_trace_cmp2";
const char SanCovTraceCmp4[] = "__sanitizer_cov_trace_cmp4";
@@ -99,6 +102,10 @@ static cl::opt<int> ClCoverageLevel(
static cl::opt<bool> ClTracePC("sanitizer-coverage-trace-pc",
cl::desc("Experimental pc tracing"), cl::Hidden);
+static cl::opt<bool> ClTracePCEntryExit(
+ "sanitizer-coverage-trace-pc-entry-exit",
+ cl::desc("pc tracing with separate entry/exit callbacks"), cl::Hidden);
+
static cl::opt<bool> ClTracePCGuard("sanitizer-coverage-trace-pc-guard",
cl::desc("pc tracing with a guard"),
cl::Hidden);
@@ -208,6 +215,7 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) {
Options.TraceDiv |= ClDIVTracing;
Options.TraceGep |= ClGEPTracing;
Options.TracePC |= ClTracePC;
+ Options.TracePCEntryExit |= ClTracePCEntryExit;
Options.TracePCGuard |= ClTracePCGuard;
Options.Inline8bitCounters |= ClInline8bitCounters;
Options.InlineBoolFlag |= ClInlineBoolFlag;
@@ -219,7 +227,7 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) {
Options.TraceLoads |= ClLoadTracing;
Options.TraceStores |= ClStoreTracing;
Options.GatedCallbacks |= ClGatedCallbacks;
- if (!Options.TracePCGuard && !Options.TracePC &&
+ if (!Options.TracePCGuard && !Options.TracePC && !Options.TracePCEntryExit &&
!Options.Inline8bitCounters && !Options.StackDepth &&
!Options.InlineBoolFlag && !Options.TraceLoads && !Options.TraceStores)
Options.TracePCGuard = true; // TracePCGuard is default.
@@ -256,6 +264,7 @@ class ModuleSanitizerCoverage {
ArrayRef<GetElementPtrInst *> GepTraceTargets);
void InjectTraceForLoadsAndStores(Function &F, ArrayRef<LoadInst *> Loads,
ArrayRef<StoreInst *> Stores);
+ void InjectTraceForExits(Function &F);
void InjectTraceForSwitch(Function &F,
ArrayRef<Instruction *> SwitchTraceTargets,
Value *&FunctionGateCmp);
@@ -288,6 +297,7 @@ class ModuleSanitizerCoverage {
FunctionCallee SanCovStackDepthCallback;
FunctionCallee SanCovTracePCIndir;
FunctionCallee SanCovTracePC, SanCovTracePCGuard;
+ FunctionCallee SanCovTracePCEntry, SanCovTracePCExit;
std::array<FunctionCallee, 4> SanCovTraceCmpFunction;
std::array<FunctionCallee, 4> SanCovTraceConstCmpFunction;
std::array<FunctionCallee, 5> SanCovLoadFunction;
@@ -527,6 +537,8 @@ bool ModuleSanitizerCoverage::instrumentModule() {
}
SanCovTracePC = M.getOrInsertFunction(SanCovTracePCName, VoidTy);
+ SanCovTracePCEntry = M.getOrInsertFunction(SanCovTracePCEntryName, VoidTy);
+ SanCovTracePCExit = M.getOrInsertFunction(SanCovTracePCExitName, VoidTy);
SanCovTracePCGuard =
M.getOrInsertFunction(SanCovTracePCGuardName, VoidTy, PtrTy);
@@ -646,7 +658,7 @@ static bool IsInterestingCmp(ICmpInst *CMP, const DominatorTree &DT,
const SanitizerCoverageOptions &Options) {
if (!Options.NoPrune)
if (CMP->hasOneUse())
- if (auto BR = dyn_cast<BranchInst>(CMP->user_back()))
+ if (auto BR = dyn_cast<CondBrInst>(CMP->user_back()))
for (BasicBlock *B : BR->successors())
if (IsBackEdge(BR->getParent(), B, DT))
return false;
@@ -752,6 +764,9 @@ void ModuleSanitizerCoverage::instrumentFunction(Function &F) {
InjectTraceForDiv(F, DivTraceTargets);
InjectTraceForGep(F, GepTraceTargets);
InjectTraceForLoadsAndStores(F, Loads, Stores);
+
+ if (Options.TracePCEntryExit)
+ InjectTraceForExits(F);
}
GlobalVariable *ModuleSanitizerCoverage::CreateFunctionLocalArrayInSection(
@@ -864,6 +879,7 @@ bool ModuleSanitizerCoverage::InjectCoverage(Function &F,
CreateFunctionLocalArrays(F, AllBlocks);
for (size_t i = 0, N = AllBlocks.size(); i < N; i++)
InjectCoverageAtBlock(F, *AllBlocks[i], i, FunctionGateCmp, IsLeafFunc);
+
return true;
}
@@ -878,7 +894,7 @@ void ModuleSanitizerCoverage::InjectCoverageForIndirectCalls(
Function &F, ArrayRef<Instruction *> IndirCalls) {
if (IndirCalls.empty())
return;
- assert(Options.TracePC || Options.TracePCGuard ||
+ assert(Options.TracePC || Options.TracePCEntryExit || Options.TracePCGuard ||
Options.Inline8bitCounters || Options.InlineBoolFlag);
for (auto *I : IndirCalls) {
InstrumentationIRBuilder IRB(I);
@@ -997,6 +1013,15 @@ void ModuleSanitizerCoverage::InjectTraceForLoadsAndStores(
}
}
+void ModuleSanitizerCoverage::InjectTraceForExits(Function &F) {
+ EscapeEnumerator EE(F, "sancov_exit");
+ while (IRBuilder<> *AtExit = EE.Next()) {
+ InstrumentationIRBuilder::ensureDebugInfo(*AtExit, F);
+ AtExit->CreateCall(SanCovTracePCExit, {})
+ ->setTailCallKind(CallInst::TCK_NoTail);
+ }
+}
+
void ModuleSanitizerCoverage::InjectTraceForCmp(
Function &F, ArrayRef<Instruction *> CmpTraceTargets,
Value *&FunctionGateCmp) {
@@ -1062,8 +1087,11 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
InstrumentationIRBuilder IRB(&*IP);
if (EntryLoc)
IRB.SetCurrentDebugLocation(EntryLoc);
- if (Options.TracePC) {
- IRB.CreateCall(SanCovTracePC)
+ if (Options.TracePC || (IsEntryBB && Options.TracePCEntryExit)) {
+ FunctionCallee Callee = IsEntryBB && Options.TracePCEntryExit
+ ? SanCovTracePCEntry
+ : SanCovTracePC;
+ IRB.CreateCall(Callee)
->setCannotMerge(); // gets the PC using GET_CALLER_PC.
}
if (Options.TracePCGuard) {
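
With -sanitizer-coverage-trace-pc-entry-exit, the pass emits a call to __sanitizer_cov_trace_pc_entry in each instrumented function's entry block (see InjectCoverageAtBlock above) and a call to __sanitizer_cov_trace_pc_exit on every return or unwind path found by EscapeEnumerator; non-entry blocks only get the plain __sanitizer_cov_trace_pc call when TracePC is also enabled. A tool linking such a module provides the two callbacks itself. A minimal sketch, assuming the same no-argument convention as __sanitizer_cov_trace_pc (the getOrInsertFunction calls above declare them with VoidTy only):

    #include <cstdio>

    // Hypothetical runtime stubs for the two new callbacks.
    extern "C" void __sanitizer_cov_trace_pc_entry() {
      // Fires once per instrumented function, from its entry block.
      std::fprintf(stderr, "enter %p\n", __builtin_return_address(0));
    }

    extern "C" void __sanitizer_cov_trace_pc_exit() {
      // Fires on every return/unwind edge that EscapeEnumerator visits; the
      // pass marks the call notail, presumably so the caller PC stays
      // recoverable here.
      std::fprintf(stderr, "exit  %p\n", __builtin_return_address(0));
    }

Heavier tooling (per-function counters, call-depth tracking, shadow stacks) would hang off these two hooks in the same way existing tools use the trace-pc callback.
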
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 386e48f81a93f..69c91b4327e5b 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -1654,20 +1654,18 @@ ComputePostOrders(Function &F,
BasicBlock *EntryBB = &F.getEntryBlock();
BBState &MyStates = BBStates[EntryBB];
MyStates.SetAsEntry();
- Instruction *EntryTI = EntryBB->getTerminator();
- SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI)));
+ SuccStack.push_back(std::make_pair(EntryBB, succ_begin(EntryBB)));
Visited.insert(EntryBB);
OnStack.insert(EntryBB);
do {
dfs_next_succ:
BasicBlock *CurrBB = SuccStack.back().first;
- succ_iterator SE(CurrBB->getTerminator(), false);
+ succ_iterator SE = succ_end(CurrBB->getTerminator());
while (SuccStack.back().second != SE) {
BasicBlock *SuccBB = *SuccStack.back().second++;
if (Visited.insert(SuccBB).second) {
- SuccStack.push_back(
- std::make_pair(SuccBB, succ_iterator(SuccBB->getTerminator())));
+ SuccStack.push_back(std::make_pair(SuccBB, succ_begin(SuccBB)));
BBStates[CurrBB].addSucc(SuccBB);
BBState &SuccStates = BBStates[SuccBB];
SuccStates.addPred(CurrBB);
diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp
index 985b9c0e53125..5eef1bb209c03 100644
--- a/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -212,11 +212,6 @@ ADCEChanged AggressiveDeadCodeElimination::performDeadCodeElimination() {
return removeDeadInstructions();
}
-static bool isUnconditionalBranch(Instruction *Term) {
- auto *BR = dyn_cast<BranchInst>(Term);
- return BR && BR->isUnconditional();
-}
-
void AggressiveDeadCodeElimination::initialize() {
auto NumBlocks = F.size();
@@ -232,7 +227,7 @@ void AggressiveDeadCodeElimination::initialize() {
auto &Info = BlockInfo[&BB];
Info.BB = &BB;
Info.Terminator = BB.getTerminator();
- Info.UnconditionalBranch = isUnconditionalBranch(Info.Terminator);
+ Info.UnconditionalBranch = isa<UncondBrInst>(Info.Terminator);
}
// Initialize instruction map and set pointers to block info.
@@ -339,7 +334,7 @@ bool AggressiveDeadCodeElimination::isAlwaysLive(Instruction &I) {
}
if (!I.isTerminator())
return false;
- if (RemoveControlFlowFlag && (isa<BranchInst>(I) || isa<SwitchInst>(I)))
+ if (RemoveControlFlowFlag && isa<UncondBrInst, CondBrInst, SwitchInst>(I))
return false;
return true;
}
@@ -682,8 +677,8 @@ void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB,
collectLiveScopes(*DL);
// Just mark live an existing unconditional branch
- if (isUnconditionalBranch(PredTerm)) {
- PredTerm->setSuccessor(0, Target);
+ if (auto *BI = dyn_cast<UncondBrInst>(PredTerm)) {
+ BI->setSuccessor(Target);
InstInfo[PredTerm].Live = true;
return;
}
diff --git a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
index 2e955e061111f..ce73c44959a0e 100644
--- a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -128,8 +128,8 @@ using ConditionsTy = SmallVector<ConditionTy, 2>;
/// if it is relevant to any argument at CB.
static void recordCondition(CallBase &CB, BasicBlock *From, BasicBlock *To,
ConditionsTy &Conditions) {
- auto *BI = dyn_cast<BranchInst>(From->getTerminator());
- if (!BI || !BI->isConditional())
+ auto *BI = dyn_cast<CondBrInst>(From->getTerminator());
+ if (!BI)
return;
CmpPredicate Pred;
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index b854e7902e900..c457ff74610ce 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -956,9 +956,9 @@ void State::addInfoForInductions(BasicBlock &BB) {
BasicBlock *InLoopSucc = nullptr;
if (Pred == CmpInst::ICMP_NE)
- InLoopSucc = cast<BranchInst>(BB.getTerminator())->getSuccessor(0);
+ InLoopSucc = cast<CondBrInst>(BB.getTerminator())->getSuccessor(0);
else if (Pred == CmpInst::ICMP_EQ)
- InLoopSucc = cast<BranchInst>(BB.getTerminator())->getSuccessor(1);
+ InLoopSucc = cast<CondBrInst>(BB.getTerminator())->getSuccessor(1);
else
return;
@@ -1248,8 +1248,8 @@ void State::addInfoFor(BasicBlock &BB) {
return;
}
- auto *Br = dyn_cast<BranchInst>(BB.getTerminator());
- if (!Br || !Br->isConditional())
+ auto *Br = dyn_cast<CondBrInst>(BB.getTerminator());
+ if (!Br)
return;
Value *Cond = Br->getCondition();
diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
index 2f1f59c1ff2a8..e3e6c106da0a9 100644
--- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp
@@ -207,18 +207,16 @@ void DFAJumpThreading::unfold(DomTreeUpdater *DTU, LoopInfo *LI,
assert(SI->hasOneUse());
// The select may come indirectly, instead of from where it is defined.
BasicBlock *StartBlock = SIUse->getIncomingBlock(*SI->use_begin());
- BranchInst *StartBlockTerm =
- dyn_cast<BranchInst>(StartBlock->getTerminator());
- assert(StartBlockTerm);
- if (StartBlockTerm->isUnconditional()) {
+ if (UncondBrInst *StartBlockTerm =
+ dyn_cast<UncondBrInst>(StartBlock->getTerminator())) {
BasicBlock *EndBlock = StartBlock->getUniqueSuccessor();
// Arbitrarily choose the 'false' side for a new input value to the PHI.
BasicBlock *NewBlock = BasicBlock::Create(
SI->getContext(), Twine(SI->getName(), ".si.unfold.false"),
EndBlock->getParent(), EndBlock);
NewBBs->push_back(NewBlock);
- BranchInst::Create(EndBlock, NewBlock);
+ UncondBrInst::Create(EndBlock, NewBlock);
DTU->applyUpdates({{DominatorTree::Insert, NewBlock, EndBlock}});
// StartBlock
@@ -268,7 +266,7 @@ void DFAJumpThreading::unfold(DomTreeUpdater *DTU, LoopInfo *LI,
// Insert the real conditional branch based on the original condition.
StartBlockTerm->eraseFromParent();
auto *BI =
- BranchInst::Create(EndBlock, NewBlock, SI->getCondition(), StartBlock);
+ CondBrInst::Create(SI->getCondition(), EndBlock, NewBlock, StartBlock);
if (!ProfcheckDisableMetadataFixes)
BI->setMetadata(LLVMContext::MD_prof,
SI->getMetadata(LLVMContext::MD_prof));
@@ -303,10 +301,10 @@ void DFAJumpThreading::unfold(DomTreeUpdater *DTU, LoopInfo *LI,
// | /
// EndBlock
// (Use)
- BranchInst::Create(EndBlock, NewBlockF);
+ UncondBrInst::Create(EndBlock, NewBlockF);
// Insert the real conditional branch based on the original condition.
auto *BI =
- BranchInst::Create(EndBlock, NewBlockF, SI->getCondition(), NewBlockT);
+ CondBrInst::Create(SI->getCondition(), EndBlock, NewBlockF, NewBlockT);
if (!ProfcheckDisableMetadataFixes)
BI->setMetadata(LLVMContext::MD_prof,
SI->getMetadata(LLVMContext::MD_prof));
@@ -346,8 +344,9 @@ void DFAJumpThreading::unfold(DomTreeUpdater *DTU, LoopInfo *LI,
// Update the appropriate successor of the start block to point to the new
// unfolded block.
- unsigned SuccNum = StartBlockTerm->getSuccessor(1) == EndBlock ? 1 : 0;
- StartBlockTerm->setSuccessor(SuccNum, NewBlockT);
+ CondBrInst *CondBr = cast<CondBrInst>(StartBlock->getTerminator());
+ unsigned SuccNum = CondBr->getSuccessor(1) == EndBlock ? 1 : 0;
+ CondBr->setSuccessor(SuccNum, NewBlockT);
DTU->applyUpdates({{DominatorTree::Delete, StartBlock, EndBlock},
{DominatorTree::Insert, StartBlock, NewBlockT}});
}
@@ -549,8 +548,8 @@ struct MainSwitch {
// Currently, we can only expand select instructions in basic blocks with
// one successor.
- BranchInst *SITerm = dyn_cast<BranchInst>(SIBB->getTerminator());
- if (!SITerm || !SITerm->isUnconditional())
+ UncondBrInst *SITerm = dyn_cast<UncondBrInst>(SIBB->getTerminator());
+ if (!SITerm)
return false;
// Only fold the select coming from directly where it is defined.
@@ -1339,10 +1338,9 @@ struct TransformDFA {
for (auto Entry : VMap) {
Instruction *Inst =
dyn_cast<Instruction>(const_cast<Value *>(Entry.first));
- if (!Inst || !Entry.second || isa<BranchInst>(Inst) ||
- isa<SwitchInst>(Inst)) {
+ if (!Inst || !Entry.second ||
+ isa<UncondBrInst, CondBrInst, SwitchInst>(Inst))
continue;
- }
Instruction *Cloned = dyn_cast<Instruction>(Entry.second);
if (!Cloned)
@@ -1389,7 +1387,7 @@ struct TransformDFA {
}
Switch->eraseFromParent();
- BranchInst::Create(NextCase, LastBlock);
+ UncondBrInst::Create(NextCase, LastBlock);
DTU->applyUpdates(DTUpdates);
}
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 85d7618b3239c..2714074dddafc 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -2256,8 +2256,8 @@ bool DSEState::dominatingConditionImpliesValue(MemoryDef *Def) {
if (!IDom)
return false;
- auto *BI = dyn_cast<BranchInst>(IDom->getBlock()->getTerminator());
- if (!BI || !BI->isConditional())
+ auto *BI = dyn_cast<CondBrInst>(IDom->getBlock()->getTerminator());
+ if (!BI)
return false;
// In case both blocks are the same, it is not possible to determine
diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index e30f3060c27a6..ff3d55e5747e5 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -937,7 +937,7 @@ class EarlyCSE {
bool processNode(DomTreeNode *Node);
- bool handleBranchCondition(Instruction *CondInst, const BranchInst *BI,
+ bool handleBranchCondition(Instruction *CondInst, const CondBrInst *BI,
const BasicBlock *BB, const BasicBlock *Pred);
Value *getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst,
@@ -1166,9 +1166,8 @@ bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) {
}
bool EarlyCSE::handleBranchCondition(Instruction *CondInst,
- const BranchInst *BI, const BasicBlock *BB,
+ const CondBrInst *BI, const BasicBlock *BB,
const BasicBlock *Pred) {
- assert(BI->isConditional() && "Should be a conditional branch!");
assert(BI->getCondition() == CondInst && "Wrong condition?");
assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
auto *TorF = (BI->getSuccessor(0) == BB)
@@ -1358,8 +1357,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// value. Since we're adding this to the scoped hash table (like any other
// def), it will have been popped if we encounter a future merge block.
if (BasicBlock *Pred = BB->getSinglePredecessor()) {
- auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
- if (BI && BI->isConditional()) {
+ if (auto *BI = dyn_cast<CondBrInst>(Pred->getTerminator())) {
auto *CondInst = dyn_cast<Instruction>(BI->getCondition());
if (CondInst && SimpleValue::canHandle(CondInst))
Changed |= handleBranchCondition(CondInst, BI, BB, Pred);
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index c18cafd75a0a5..7cab4be169123 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -2704,10 +2704,7 @@ bool GVNPass::processInstruction(Instruction *I) {
// For conditional branches, we can perform simple conditional propagation on
// the condition value itself.
- if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
- if (!BI->isConditional())
- return false;
-
+ if (CondBrInst *BI = dyn_cast<CondBrInst>(I)) {
if (isa<Constant>(BI->getCondition()))
return processFoldableCondBr(BI);
@@ -3309,10 +3306,7 @@ void GVNPass::addDeadBlock(BasicBlock *BB) {
// dead blocks with "UndefVal" in an hope these PHIs will optimized away.
//
// Return true iff *NEW* dead code are found.
-bool GVNPass::processFoldableCondBr(BranchInst *BI) {
- if (!BI || BI->isUnconditional())
- return false;
-
+bool GVNPass::processFoldableCondBr(CondBrInst *BI) {
// If a branch has two identical successors, we cannot declare either dead.
if (BI->getSuccessor(0) == BI->getSuccessor(1))
return false;
diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp
index 4dddb017a98ee..b371684416b0f 100644
--- a/llvm/lib/Transforms/Scalar/GVNSink.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -730,7 +730,7 @@ unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
if (!RPOTOrder.count(B))
return 0;
auto *T = B->getTerminator();
- if (isa<BranchInst>(T) || isa<SwitchInst>(T))
+ if (isa<UncondBrInst, CondBrInst, SwitchInst>(T))
Preds.push_back(B);
else
return 0;
diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
index ddb99a5af6089..9967221651472 100644
--- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp
@@ -87,7 +87,7 @@ static Value *getCondition(Instruction *I) {
if (parseWidenableBranch(I, Cond, WC, IfTrueBB, IfFalseBB))
return Cond;
- return cast<BranchInst>(I)->getCondition();
+ return cast<CondBrInst>(I)->getCondition();
}
// Set the condition for \p I to \p NewCond. \p I can either be a guard or a
@@ -99,7 +99,7 @@ static void setCondition(Instruction *I, Value *NewCond) {
GI->setArgOperand(0, NewCond);
return;
}
- cast<BranchInst>(I)->setCondition(NewCond);
+ cast<CondBrInst>(I)->setCondition(NewCond);
}
// Eliminates the guard instruction properly.
@@ -311,7 +311,7 @@ class GuardWideningImpl {
getCondition(ToWiden), *InsertPt);
if (isGuardAsWidenableBranch(ToWiden)) {
- setWidenableBranchCond(cast<BranchInst>(ToWiden), Result);
+ setWidenableBranchCond(cast<CondBrInst>(ToWiden), Result);
return;
}
setCondition(ToWiden, Result);
@@ -364,7 +364,7 @@ bool GuardWideningImpl::run() {
if (isSupportedGuardInstruction(I))
eliminateGuard(I, MSSAU);
else {
- assert(isa<BranchInst>(I) &&
+ assert(isa<CondBrInst>(I) &&
"Eliminated something other than guard or branch?");
++CondBranchEliminated;
}
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index 745880f1ca0cb..c92efadded635 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -305,11 +305,10 @@ maybeFloatingPointRecurrence(Loop *L, PHINode *PN) {
// of the loop. If not, the new IV can overflow and no one will notice.
// The branch block must be in the loop and one of the successors must be out
// of the loop.
- auto *BI = dyn_cast<BranchInst>(Compare->user_back());
+ auto *BI = dyn_cast<CondBrInst>(Compare->user_back());
if (!BI)
return std::nullopt;
- assert(BI->isConditional() && "Can't use fcmp if not conditional");
if (!L->contains(BI->getParent()) ||
(L->contains(BI->getSuccessor(0)) && L->contains(BI->getSuccessor(1))))
return std::nullopt;
@@ -432,7 +431,7 @@ static void canonicalizeToIntegerIV(Loop *L, PHINode *PN,
IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext());
auto *Incr = cast<BinaryOperator>(PN->getIncomingValue(BackEdge));
- auto *BI = cast<BranchInst>(FPIV.Compare->user_back());
+ auto *BI = cast<CondBrInst>(FPIV.Compare->user_back());
LLVM_DEBUG(dbgs() << "INDVARS: Rewriting floating-point IV to integer IV:\n"
<< " Init: " << IIV.InitValue << "\n"
@@ -572,7 +571,7 @@ bool IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
auto *TermInst = IncomingBB->getTerminator();
Value *Cond = nullptr;
- if (auto *BI = dyn_cast<BranchInst>(TermInst)) {
+ if (auto *BI = dyn_cast<CondBrInst>(TermInst)) {
// Must be a conditional branch, otherwise the block
// should not be in the loop.
Cond = BI->getCondition();
@@ -807,7 +806,7 @@ static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L) {
/// Whether the current loop exit test is based on this value. Currently this
/// is limited to a direct use in the loop condition.
static bool isLoopExitTestBasedOn(Value *V, BasicBlock *ExitingBB) {
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ CondBrInst *BI = cast<CondBrInst>(ExitingBB->getTerminator());
ICmpInst *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
// TODO: Allow non-icmp loop test.
if (!ICmp)
@@ -826,7 +825,7 @@ static bool needsLFTR(Loop *L, BasicBlock *ExitingBB) {
// test. This is critical for when SCEV's cached ExitCount is less precise
// than the current IR (such as after we've proven a particular exit is
// actually dead and thus the BE count never reaches our ExitCount.)
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ CondBrInst *BI = cast<CondBrInst>(ExitingBB->getTerminator());
if (L->isLoopInvariant(BI->getCondition()))
return false;
@@ -941,7 +940,7 @@ static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB,
ScalarEvolution *SE, DominatorTree *DT) {
uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());
- Value *Cond = cast<BranchInst>(ExitingBB->getTerminator())->getCondition();
+ Value *Cond = cast<CondBrInst>(ExitingBB->getTerminator())->getCondition();
// Loop over all of the PHI nodes, looking for a simple counter.
PHINode *BestPhi = nullptr;
@@ -1118,7 +1117,7 @@ linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
}
// Insert a new icmp_ne or icmp_eq instruction before the branch.
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ CondBrInst *BI = cast<CondBrInst>(ExitingBB->getTerminator());
ICmpInst::Predicate P;
if (L->contains(BI->getSuccessor(0)))
P = ICmpInst::ICMP_NE;
@@ -1276,7 +1275,7 @@ bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
return MadeAnyChanges;
}
-static void replaceExitCond(BranchInst *BI, Value *NewCond,
+static void replaceExitCond(CondBrInst *BI, Value *NewCond,
SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
auto *OldCond = BI->getCondition();
LLVM_DEBUG(dbgs() << "Replacing condition of loop-exiting branch " << *BI
@@ -1288,7 +1287,7 @@ static void replaceExitCond(BranchInst *BI, Value *NewCond,
static Constant *createFoldedExitCond(const Loop *L, BasicBlock *ExitingBB,
bool IsTaken) {
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ CondBrInst *BI = cast<CondBrInst>(ExitingBB->getTerminator());
bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
auto *OldCond = BI->getCondition();
return ConstantInt::get(OldCond->getType(),
@@ -1297,7 +1296,7 @@ static Constant *createFoldedExitCond(const Loop *L, BasicBlock *ExitingBB,
static void foldExit(const Loop *L, BasicBlock *ExitingBB, bool IsTaken,
SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ CondBrInst *BI = cast<CondBrInst>(ExitingBB->getTerminator());
auto *NewCond = createFoldedExitCond(L, ExitingBB, IsTaken);
replaceExitCond(BI, NewCond, DeadInsts);
}
@@ -1354,7 +1353,7 @@ createInvariantCond(const Loop *L, BasicBlock *ExitingBB,
if (ExitIfTrue)
InvariantPred = ICmpInst::getInversePredicate(InvariantPred);
IRBuilder<> Builder(Preheader->getTerminator());
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ CondBrInst *BI = cast<CondBrInst>(ExitingBB->getTerminator());
return Builder.CreateICmp(InvariantPred, LHSV, RHSV,
BI->getCondition()->getName());
}
@@ -1368,7 +1367,7 @@ createReplacement(ICmpInst *ICmp, const Loop *L, BasicBlock *ExitingBB,
Value *RHS = ICmp->getOperand(1);
// 'LHS pred RHS' should now mean that we stay in loop.
- auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ auto *BI = cast<CondBrInst>(ExitingBB->getTerminator());
if (Inverted)
Pred = ICmpInst::getInverseCmpPredicate(Pred);
@@ -1418,7 +1417,7 @@ createReplacement(ICmpInst *ICmp, const Loop *L, BasicBlock *ExitingBB,
}
static bool optimizeLoopExitWithUnknownExitCount(
- const Loop *L, BranchInst *BI, BasicBlock *ExitingBB, const SCEV *MaxIter,
+ const Loop *L, CondBrInst *BI, BasicBlock *ExitingBB, const SCEV *MaxIter,
bool SkipLastIter, ScalarEvolution *SE, SCEVExpander &Rewriter,
SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
assert(
@@ -1535,10 +1534,9 @@ bool IndVarSimplify::canonicalizeExitCondition(Loop *L) {
L->getExitingBlocks(ExitingBlocks);
bool Changed = false;
for (auto *ExitingBB : ExitingBlocks) {
- auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ auto *BI = dyn_cast<CondBrInst>(ExitingBB->getTerminator());
if (!BI)
continue;
- assert(BI->isConditional() && "exit branch must be conditional");
auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
if (!ICmp || !ICmp->hasOneUse())
@@ -1580,10 +1578,9 @@ bool IndVarSimplify::canonicalizeExitCondition(Loop *L) {
// Now that we've canonicalized the condition to match the extend,
// see if we can rotate the extend out of the loop.
for (auto *ExitingBB : ExitingBlocks) {
- auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ auto *BI = dyn_cast<CondBrInst>(ExitingBB->getTerminator());
if (!BI)
continue;
- assert(BI->isConditional() && "exit branch must be conditional");
auto *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
if (!ICmp || !ICmp->hasOneUse() || !ICmp->isUnsigned())
@@ -1671,7 +1668,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
return true;
// Can't rewrite non-branch yet.
- BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ CondBrInst *BI = dyn_cast<CondBrInst>(ExitingBB->getTerminator());
if (!BI)
return true;
@@ -1743,7 +1740,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
if (isa<SCEVCouldNotCompute>(ExactExitCount)) {
// Okay, we do not know the exit count here. Can we at least prove that it
// will remain the same within iteration space?
- auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ auto *BI = cast<CondBrInst>(ExitingBB->getTerminator());
auto OptimizeCond = [&](bool SkipLastIter) {
return optimizeLoopExitWithUnknownExitCount(L, BI, ExitingBB,
MaxBECount, SkipLastIter,
@@ -1879,7 +1876,7 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
return true;
// Can't rewrite non-branch yet.
- BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ CondBrInst *BI = dyn_cast<CondBrInst>(ExitingBB->getTerminator());
if (!BI)
return true;
@@ -2006,7 +2003,7 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
for (BasicBlock *ExitingBB : ExitingBlocks) {
const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
- auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ auto *BI = cast<CondBrInst>(ExitingBB->getTerminator());
if (HasThreadLocalSideEffects) {
const BasicBlock *Unreachable = nullptr;
for (const BasicBlock *Succ : BI->successors()) {
@@ -2135,7 +2132,7 @@ bool IndVarSimplify::run(Loop *L) {
L->getExitingBlocks(ExitingBlocks);
for (BasicBlock *ExitingBB : ExitingBlocks) {
// Can't rewrite non-branch yet.
- if (!isa<BranchInst>(ExitingBB->getTerminator()))
+ if (!isa<CondBrInst>(ExitingBB->getTerminator()))
continue;
// If our exitting block exits multiple loops, we can only rewrite the
diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 308c267969113..98da1e9225172 100644
--- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -226,7 +226,7 @@ class InductiveRangeCheck {
/// NB! There may be conditions feeding into \p BI that aren't inductive range
/// checks, and hence don't end up in \p Checks.
static void extractRangeChecksFromBranch(
- BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI,
+ CondBrInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI,
std::optional<uint64_t> EstimatedTripCount,
SmallVectorImpl<InductiveRangeCheck> &Checks, bool &Changed);
};
@@ -516,10 +516,10 @@ void InductiveRangeCheck::extractRangeChecksFromCond(
}
void InductiveRangeCheck::extractRangeChecksFromBranch(
- BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI,
+ CondBrInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI,
std::optional<uint64_t> EstimatedTripCount,
SmallVectorImpl<InductiveRangeCheck> &Checks, bool &Changed) {
- if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch())
+ if (BI->getParent() == L->getLoopLatch())
return;
unsigned IndexLoopSucc = L->contains(BI->getSuccessor(0)) ? 0 : 1;
@@ -972,7 +972,7 @@ InductiveRangeCheckElimination::estimatedTripCount(const Loop &L) {
auto *Latch = L.getLoopLatch();
if (!Latch)
return std::nullopt;
- auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
+ auto *LatchBr = dyn_cast<CondBrInst>(Latch->getTerminator());
if (!LatchBr)
return std::nullopt;
@@ -1012,7 +1012,7 @@ bool InductiveRangeCheckElimination::run(
bool Changed = false;
for (auto *BBI : L->getBlocks())
- if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator()))
+ if (CondBrInst *TBI = dyn_cast<CondBrInst>(BBI->getTerminator()))
InductiveRangeCheck::extractRangeChecksFromBranch(
TBI, L, SE, BPI, EstimatedTripCount, RangeChecks, Changed);
diff --git a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
index f926bafbe8b2c..a3e3b9a207ca1 100644
--- a/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpTableToSwitch.cpp
@@ -181,7 +181,7 @@ expandToSwitch(CallBase *CB, const JumpTableTy &JT, DomTreeUpdater &DTU,
// just some of the jump targets are taken (for the given profile).
BranchWeights.push_back(FctID == 0U ? 0U
: GuidToCounter.lookup_or(FctID, 0U));
- BranchInst::Create(Tail, B);
+ UncondBrInst::Create(Tail, B);
if (PHI)
PHI->addIncoming(Call, B);
}
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index e8b4ba5f5106f..7f8e10eb201a6 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -146,7 +146,7 @@ JumpThreadingPass::JumpThreadingPass(int T) {
// that P(t == true) is also unlikely.
//
static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
- BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ CondBrInst *CondBr = dyn_cast<CondBrInst>(BB->getTerminator());
if (!CondBr)
return;
@@ -169,8 +169,7 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
auto *SuccBB = PhiBB;
SmallPtrSet<BasicBlock *, 16> Visited;
while (true) {
- BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
- if (PredBr && PredBr->isConditional())
+ if (isa<CondBrInst>(PredBB->getTerminator()))
return {PredBB, SuccBB};
Visited.insert(PredBB);
auto *SinglePredBB = PredBB->getSinglePredecessor();
@@ -205,7 +204,7 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
return;
BasicBlock *PredBB = PredOutEdge.first;
- BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
+ CondBrInst *PredBr = dyn_cast<CondBrInst>(PredBB->getTerminator());
if (!PredBr)
return;
@@ -351,9 +350,8 @@ bool JumpThreadingPass::runImpl(Function &F_, FunctionAnalysisManager *FAM_,
// processBlock doesn't thread BBs with unconditional TIs. However, if BB
// is "almost empty", we attempt to merge BB with its sole successor.
- auto *BI = dyn_cast<BranchInst>(BB.getTerminator());
- if (BI && BI->isUnconditional()) {
- BasicBlock *Succ = BI->getSuccessor(0);
+ if (auto *BI = dyn_cast<UncondBrInst>(BB.getTerminator())) {
+ BasicBlock *Succ = BI->getSuccessor();
if (
// The terminator must be the only non-phi instruction in BB.
BB.getFirstNonPHIOrDbg(true)->isTerminator() &&
@@ -971,9 +969,7 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) {
// branch, if not we can't thread it.
Value *Condition;
Instruction *Terminator = BB->getTerminator();
- if (BranchInst *BI = dyn_cast<BranchInst>(Terminator)) {
- // Can't thread an unconditional jump.
- if (BI->isUnconditional()) return false;
+ if (CondBrInst *BI = dyn_cast<CondBrInst>(Terminator)) {
Condition = BI->getCondition();
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) {
Condition = SI->getCondition();
@@ -1023,7 +1019,8 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) {
LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
<< "' folding undef terminator: " << *BBTerm << '\n');
- Instruction *NewBI = BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm->getIterator());
+ Instruction *NewBI = UncondBrInst::Create(BBTerm->getSuccessor(BestSucc),
+ BBTerm->getIterator());
NewBI->setDebugLoc(BBTerm->getDebugLoc());
++NumFolds;
BBTerm->eraseFromParent();
@@ -1112,7 +1109,7 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) {
// Before threading, try to propagate profile data backwards:
if (PHINode *PN = dyn_cast<PHINode>(CondInst))
- if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+ if (PN->getParent() == BB && isa<CondBrInst>(BB->getTerminator()))
updatePredecessorProfileMetadata(PN, BB);
// Handle a variety of cases where we are branching on something derived from
@@ -1124,12 +1121,12 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) {
// If this is an otherwise-unfoldable branch on a phi node or freeze(phi) in
// the current block, see if we can simplify.
PHINode *PN = dyn_cast<PHINode>(CondWithoutFreeze);
- if (PN && PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+ if (PN && PN->getParent() == BB && isa<CondBrInst>(BB->getTerminator()))
return processBranchOnPHI(PN);
// If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
if (CondInst->getOpcode() == Instruction::Xor &&
- CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+ CondInst->getParent() == BB && isa<CondBrInst>(BB->getTerminator()))
return processBranchOnXOR(cast<BinaryOperator>(CondInst));
// Search for a stronger dominating condition that can be used to simplify a
@@ -1141,8 +1138,8 @@ bool JumpThreadingPass::processBlock(BasicBlock *BB) {
}
bool JumpThreadingPass::processImpliedCondition(BasicBlock *BB) {
- auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BI || !BI->isConditional())
+ auto *BI = dyn_cast<CondBrInst>(BB->getTerminator());
+ if (!BI)
return false;
Value *Cond = BI->getCondition();
@@ -1164,8 +1161,8 @@ bool JumpThreadingPass::processImpliedCondition(BasicBlock *BB) {
auto &DL = BB->getDataLayout();
while (CurrentPred && Iter++ < ImplicationSearchThreshold) {
- auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator());
- if (!PBI || !PBI->isConditional())
+ auto *PBI = dyn_cast<CondBrInst>(CurrentPred->getTerminator());
+ if (!PBI)
return false;
if (PBI->getSuccessor(0) != CurrentBB && PBI->getSuccessor(1) != CurrentBB)
return false;
@@ -1186,7 +1183,8 @@ bool JumpThreadingPass::processImpliedCondition(BasicBlock *BB) {
BasicBlock *KeepSucc = BI->getSuccessor(*Implication ? 0 : 1);
BasicBlock *RemoveSucc = BI->getSuccessor(*Implication ? 1 : 0);
RemoveSucc->removePredecessor(BB);
- BranchInst *UncondBI = BranchInst::Create(KeepSucc, BI->getIterator());
+ UncondBrInst *UncondBI =
+ UncondBrInst::Create(KeepSucc, BI->getIterator());
UncondBI->setDebugLoc(BI->getDebugLoc());
++NumFolds;
BI->eraseFromParent();
@@ -1605,7 +1603,7 @@ bool JumpThreadingPass::processThreadableEdges(Value *Cond, BasicBlock *BB,
BasicBlock *DestBB;
if (isa<UndefValue>(Val))
DestBB = nullptr;
- else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+ else if (CondBrInst *BI = dyn_cast<CondBrInst>(BB->getTerminator())) {
assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
@@ -1662,7 +1660,7 @@ bool JumpThreadingPass::processThreadableEdges(Value *Cond, BasicBlock *BB,
// Finally update the terminator.
Instruction *Term = BB->getTerminator();
- Instruction *NewBI = BranchInst::Create(OnlyDest, Term->getIterator());
+ Instruction *NewBI = UncondBrInst::Create(OnlyDest, Term->getIterator());
NewBI->setDebugLoc(Term->getDebugLoc());
++NumFolds;
Term->eraseFromParent();
@@ -1755,13 +1753,12 @@ bool JumpThreadingPass::processBranchOnPHI(PHINode *PN) {
// to br(icmp(freeze ...)).
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *PredBB = PN->getIncomingBlock(i);
- if (BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()))
- if (PredBr->isUnconditional()) {
- PredBBs[0] = PredBB;
- // Try to duplicate BB into PredBB.
- if (duplicateCondBranchOnPHIIntoPred(BB, PredBBs))
- return true;
- }
+ if (isa<UncondBrInst>(PredBB->getTerminator())) {
+ PredBBs[0] = PredBB;
+ // Try to duplicate BB into PredBB.
+ if (duplicateCondBranchOnPHIIntoPred(BB, PredBBs))
+ return true;
+ }
}
return false;
@@ -2142,7 +2139,7 @@ bool JumpThreadingPass::maybethreadThroughTwoBasicBlocks(BasicBlock *BB,
// PredBB. Then we can thread edges PredBB1->BB and PredBB2->BB through BB.
// Require that BB end with a Branch for simplicity.
- BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ CondBrInst *CondBr = dyn_cast<CondBrInst>(BB->getTerminator());
if (!CondBr)
return false;
@@ -2154,8 +2151,8 @@ bool JumpThreadingPass::maybethreadThroughTwoBasicBlocks(BasicBlock *BB,
// Require that PredBB end with a conditional Branch. If PredBB ends with an
// unconditional branch, we should be merging PredBB and BB instead. For
// simplicity, we don't deal with a switch.
- BranchInst *PredBBBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
- if (!PredBBBranch || PredBBBranch->isUnconditional())
+ CondBrInst *PredBBBranch = dyn_cast<CondBrInst>(PredBB->getTerminator());
+ if (!PredBBBranch)
return false;
// If PredBB has exactly one incoming edge, we don't gain anything by copying
@@ -2276,8 +2273,8 @@ void JumpThreadingPass::threadThroughTwoBasicBlocks(BasicBlock *PredPredBB,
auto *BFI = getOrCreateBFI(HasProfile);
auto *BPI = getOrCreateBPI(BFI != nullptr);
- BranchInst *CondBr = cast<BranchInst>(BB->getTerminator());
- BranchInst *PredBBBranch = cast<BranchInst>(PredBB->getTerminator());
+ CondBrInst *CondBr = cast<CondBrInst>(BB->getTerminator());
+ CondBrInst *PredBBBranch = cast<CondBrInst>(PredBB->getTerminator());
BasicBlock *NewBB =
BasicBlock::Create(PredBB->getContext(), PredBB->getName() + ".thread",
@@ -2429,7 +2426,7 @@ void JumpThreadingPass::threadEdge(BasicBlock *BB,
// We didn't copy the terminator from BB over to NewBB, because there is now
// an unconditional jump to SuccBB. Insert the unconditional jump.
- BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB);
+ UncondBrInst *NewBI = UncondBrInst::Create(SuccBB, NewBB);
NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc());
// Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
@@ -2550,10 +2547,10 @@ void JumpThreadingPass::updateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
// Collect updated outgoing edges' frequencies from BB and use them to update
// edge probabilities.
SmallVector<uint64_t, 4> BBSuccFreq;
- for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
- auto BB2SuccBBFreq =
- BBOrigFreq * BPI->getEdgeProbability(BB, I.getSuccessorIndex());
- auto SuccFreq = (*I == SuccBB) ? BB2SuccBBFreq - NewBBFreq : BB2SuccBBFreq;
+ for (auto It : enumerate(successors(BB))) {
+ auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, It.index());
+ auto SuccFreq =
+ (It.value() == SuccBB) ? BB2SuccBBFreq - NewBBFreq : BB2SuccBBFreq;
BBSuccFreq.push_back(SuccFreq.getFrequency());
}
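
The rewritten loop above drops the manual succ_iterator bookkeeping in favor of enumerate(successors(BB)), which yields each successor together with its index; the index is exactly what BPI->getEdgeProbability(BB, Idx) expects. A minimal sketch of the idiom on its own, assuming only llvm/ADT/STLExtras.h and llvm/IR/CFG.h:

    #include "llvm/ADT/STLExtras.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/CFG.h"
    using namespace llvm;

    // Walk the successors of a block while keeping the successor index
    // available for APIs that are keyed by outgoing edge number.
    static void forEachSuccessorWithIndex(BasicBlock *BB) {
      for (auto It : enumerate(successors(BB))) {
        unsigned Idx = It.index();        // position of the edge BB -> Succ
        BasicBlock *Succ = It.value();    // the successor block itself
        (void)Idx;
        (void)Succ;
      }
    }
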
@@ -2667,15 +2664,15 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred(
// Unless PredBB ends with an unconditional branch, split the edge so that we
// can just clone the bits from BB into the end of the new PredBB.
- BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
+ UncondBrInst *OldPredBranch = dyn_cast<UncondBrInst>(PredBB->getTerminator());
- if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
+ if (!OldPredBranch) {
BasicBlock *OldPredBB = PredBB;
PredBB = SplitEdge(OldPredBB, BB);
Updates.push_back({DominatorTree::Insert, OldPredBB, PredBB});
Updates.push_back({DominatorTree::Insert, PredBB, BB});
Updates.push_back({DominatorTree::Delete, OldPredBB, BB});
- OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
+ OldPredBranch = cast<UncondBrInst>(PredBB->getTerminator());
}
// We are going to have to map operands from the original BB block into the
@@ -2738,7 +2735,7 @@ bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred(
// Check to see if the targets of the branch had PHI nodes. If so, we need to
// add entries to the PHI nodes for branch from PredBB now.
- BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator());
+ CondBrInst *BBBranch = cast<CondBrInst>(BB->getTerminator());
addPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(0), BB, PredBB,
ValueMapping);
addPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB,
@@ -2781,14 +2778,14 @@ void JumpThreadingPass::unfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB,
// |-----
// v
// BB
- BranchInst *PredTerm = cast<BranchInst>(Pred->getTerminator());
+ UncondBrInst *PredTerm = cast<UncondBrInst>(Pred->getTerminator());
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold",
BB->getParent(), BB);
// Move the unconditional branch to NewBB.
PredTerm->removeFromParent();
PredTerm->insertInto(NewBB, NewBB->end());
// Create a conditional branch and update PHI nodes.
- auto *BI = BranchInst::Create(NewBB, BB, SI->getCondition(), Pred);
+ auto *BI = CondBrInst::Create(SI->getCondition(), NewBB, BB, Pred);
BI->applyMergedLocation(PredTerm->getDebugLoc(), SI->getDebugLoc());
BI->copyMetadata(*SI, {LLVMContext::MD_prof});
SIUse->setIncomingValue(Idx, SI->getFalseValue());
@@ -2848,8 +2845,8 @@ bool JumpThreadingPass::tryToUnfoldSelect(SwitchInst *SI, BasicBlock *BB) {
if (!PredSI || PredSI->getParent() != Pred || !PredSI->hasOneUse())
continue;
- BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
- if (!PredTerm || !PredTerm->isUnconditional())
+ UncondBrInst *PredTerm = dyn_cast<UncondBrInst>(Pred->getTerminator());
+ if (!PredTerm)
continue;
unfoldSelectInstr(Pred, BB, PredSI, CondPHI, I);
@@ -2871,12 +2868,11 @@ bool JumpThreadingPass::tryToUnfoldSelect(SwitchInst *SI, BasicBlock *BB) {
/// And expand the select into a branch structure if one of its arms allows %c
/// to be folded. This later enables threading from bb1 over bb2.
bool JumpThreadingPass::tryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
- BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ CondBrInst *CondBr = dyn_cast<CondBrInst>(BB->getTerminator());
PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0));
Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1));
- if (!CondBr || !CondBr->isConditional() || !CondLHS ||
- CondLHS->getParent() != BB)
+ if (!CondBr || !CondLHS || CondLHS->getParent() != BB)
return false;
for (unsigned I = 0, E = CondLHS->getNumIncomingValues(); I != E; ++I) {
@@ -2888,8 +2884,8 @@ bool JumpThreadingPass::tryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
if (!SI || SI->getParent() != Pred || !SI->hasOneUse())
continue;
- BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
- if (!PredTerm || !PredTerm->isUnconditional())
+ UncondBrInst *PredTerm = dyn_cast<UncondBrInst>(Pred->getTerminator());
+ if (!PredTerm)
continue;
// Now check if one of the select values would allow us to constant fold the
@@ -3057,7 +3053,7 @@ bool JumpThreadingPass::processGuards(BasicBlock *BB) {
if (!Parent || Parent != Pred2->getSinglePredecessor())
return false;
- if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
+ if (auto *BI = dyn_cast<CondBrInst>(Parent->getTerminator()))
for (auto &I : *BB)
if (isGuard(&I) && threadGuard(BB, cast<IntrinsicInst>(&I), BI))
return true;
@@ -3069,9 +3065,7 @@ bool JumpThreadingPass::processGuards(BasicBlock *BB) {
/// to one of its branches, in case if diamond's condition implies guard's
/// condition.
bool JumpThreadingPass::threadGuard(BasicBlock *BB, IntrinsicInst *Guard,
- BranchInst *BI) {
- assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
- assert(BI->isConditional() && "Unconditional branch has 2 successors?");
+ CondBrInst *BI) {
Value *GuardCond = Guard->getArgOperand(0);
Value *BranchCond = BI->getCondition();
BasicBlock *TrueDest = BI->getSuccessor(0);
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 1964aa72da55e..859bc4cf83898 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -665,17 +665,16 @@ class ControlFlowHoister {
// The branches that we can hoist, mapped to the block that marks a
// convergence point of their control flow.
- DenseMap<BranchInst *, BasicBlock *> HoistableBranches;
+ DenseMap<CondBrInst *, BasicBlock *> HoistableBranches;
public:
ControlFlowHoister(LoopInfo *LI, DominatorTree *DT, Loop *CurLoop,
MemorySSAUpdater &MSSAU)
: LI(LI), DT(DT), CurLoop(CurLoop), MSSAU(MSSAU) {}
- void registerPossiblyHoistableBranch(BranchInst *BI) {
+ void registerPossiblyHoistableBranch(CondBrInst *BI) {
// We can only hoist conditional branches with loop invariant operands.
- if (!ControlFlowHoisting || !BI->isConditional() ||
- !CurLoop->hasLoopInvariantOperands(BI))
+ if (!ControlFlowHoisting || !CurLoop->hasLoopInvariantOperands(BI))
return;
// The branch destinations need to be in the loop, and we don't gain
@@ -775,7 +774,7 @@ class ControlFlowHoister {
// Check if this block is conditional based on a pending branch
auto HasBBAsSuccessor =
- [&](DenseMap<BranchInst *, BasicBlock *>::value_type &Pair) {
+ [&](DenseMap<CondBrInst *, BasicBlock *>::value_type &Pair) {
return BB != Pair.second && (Pair.first->getSuccessor(0) == BB ||
Pair.first->getSuccessor(1) == BB);
};
@@ -791,7 +790,7 @@ class ControlFlowHoister {
HoistDestinationMap[BB] = InitialPreheader;
return InitialPreheader;
}
- BranchInst *BI = It->first;
+ CondBrInst *BI = It->first;
assert(std::none_of(std::next(It), HoistableBranches.end(),
HasBBAsSuccessor) &&
"BB is expected to be the target of at most one branch");
@@ -830,15 +829,15 @@ class ControlFlowHoister {
BasicBlock *TargetSucc = HoistTarget->getSingleSuccessor();
assert(TargetSucc && "Expected hoist target to have a single successor");
HoistCommonSucc->moveBefore(TargetSucc);
- BranchInst::Create(TargetSucc, HoistCommonSucc);
+ UncondBrInst::Create(TargetSucc, HoistCommonSucc);
}
if (!HoistTrueDest->getTerminator()) {
HoistTrueDest->moveBefore(HoistCommonSucc);
- BranchInst::Create(HoistCommonSucc, HoistTrueDest);
+ UncondBrInst::Create(HoistCommonSucc, HoistTrueDest);
}
if (!HoistFalseDest->getTerminator()) {
HoistFalseDest->moveBefore(HoistCommonSucc);
- BranchInst::Create(HoistCommonSucc, HoistFalseDest);
+ UncondBrInst::Create(HoistCommonSucc, HoistFalseDest);
}
// If BI is being cloned to what was originally the preheader then
@@ -861,7 +860,7 @@ class ControlFlowHoister {
// Now finally clone BI.
auto *NewBI =
- BranchInst::Create(HoistTrueDest, HoistFalseDest, BI->getCondition(),
+ CondBrInst::Create(BI->getCondition(), HoistTrueDest, HoistFalseDest,
HoistTarget->getTerminator()->getIterator());
HoistTarget->getTerminator()->eraseFromParent();
// md_prof should also come from the original branch - since the
@@ -1008,7 +1007,7 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
// Remember possibly hoistable branches so we can actually hoist them
// later if needed.
- if (BranchInst *BI = dyn_cast<BranchInst>(&I))
+ if (CondBrInst *BI = dyn_cast<CondBrInst>(&I))
CFH.registerPossiblyHoistableBranch(BI);
}
}
diff --git a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
index 691f544a2779c..a461e1f7fe074 100644
--- a/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopBoundSplit.cpp
@@ -27,7 +27,7 @@ using namespace PatternMatch;
namespace {
struct ConditionInfo {
/// Branch instruction with this condition
- BranchInst *BI = nullptr;
+ CondBrInst *BI = nullptr;
/// ICmp instruction with this condition
ICmpInst *ICmp = nullptr;
/// Preciate info
@@ -156,7 +156,7 @@ static bool hasProcessableCondition(const Loop &L, ScalarEvolution &SE,
}
static bool isProcessableCondBI(const ScalarEvolution &SE,
- const BranchInst *BI) {
+ const CondBrInst *BI) {
BasicBlock *TrueSucc = nullptr;
BasicBlock *FalseSucc = nullptr;
Value *LHS, *RHS;
@@ -201,7 +201,7 @@ static bool canSplitLoopBound(const Loop &L, const DominatorTree &DT,
if (!ExitingBB)
return false;
- BranchInst *ExitingBI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ CondBrInst *ExitingBI = dyn_cast<CondBrInst>(ExitingBB->getTerminator());
if (!ExitingBI)
return false;
@@ -218,7 +218,7 @@ static bool canSplitLoopBound(const Loop &L, const DominatorTree &DT,
return true;
}
-static bool isProfitableToTransform(const Loop &L, const BranchInst *BI) {
+static bool isProfitableToTransform(const Loop &L, const CondBrInst *BI) {
// If the conditional branch splits a loop into two halves, we could
// generally say it is profitable.
//
@@ -238,7 +238,7 @@ static bool isProfitableToTransform(const Loop &L, const BranchInst *BI) {
return true;
}
-static BranchInst *findSplitCandidate(const Loop &L, ScalarEvolution &SE,
+static CondBrInst *findSplitCandidate(const Loop &L, ScalarEvolution &SE,
ConditionInfo &ExitingCond,
ConditionInfo &SplitCandidateCond) {
for (auto *BB : L.blocks()) {
@@ -246,7 +246,7 @@ static BranchInst *findSplitCandidate(const Loop &L, ScalarEvolution &SE,
if (L.getLoopLatch() == BB)
continue;
- auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ auto *BI = dyn_cast<CondBrInst>(BB->getTerminator());
if (!BI)
continue;
@@ -415,8 +415,8 @@ static bool splitLoopBound(Loop &L, DominatorTree &DT, LoopInfo &LI,
SplitCandidateCond.BI->setCondition(ConstantInt::getTrue(Context));
// Replace cloned SplitCandidateCond.BI's condition in post-loop by False.
- BranchInst *ClonedSplitCandidateBI =
- cast<BranchInst>(VMap[SplitCandidateCond.BI]);
+ CondBrInst *ClonedSplitCandidateBI =
+ cast<CondBrInst>(VMap[SplitCandidateCond.BI]);
ClonedSplitCandidateBI->setCondition(ConstantInt::getFalse(Context));
// Replace exit branch target of pre-loop by post-loop's preheader.
diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
index ab292e833852f..00eac14845c57 100644
--- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -218,7 +218,7 @@ class InstPartition {
if (!VMap.empty())
NewInst = cast<Instruction>(VMap[NewInst]);
- assert(!isa<BranchInst>(NewInst) &&
+ assert((!isa<UncondBrInst, CondBrInst>(NewInst)) &&
"Branches are marked used early on");
Unused.push_back(NewInst);
}
diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index 2796482f97b73..e48c47f1b4b89 100644
--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -132,9 +132,9 @@ struct FlattenInfo {
BinaryOperator *InnerIncrement = nullptr; // Uses of induction variables in
BinaryOperator *OuterIncrement = nullptr; // loop control statements that
- BranchInst *InnerBranch = nullptr; // are safe to ignore.
+ CondBrInst *InnerBranch = nullptr; // are safe to ignore.
- BranchInst *OuterBranch = nullptr; // The instruction that needs to be
+ CondBrInst *OuterBranch = nullptr; // The instruction that needs to be
// updated with new tripcount.
SmallPtrSet<PHINode *, 4> InnerPHIsToTransform;
@@ -319,10 +319,10 @@ setLoopComponents(Value *&TC, Value *&TripCount, BinaryOperator *&Increment,
// complicated now. It is therefore worth revisiting what the additional
// benefits are of this (compared to relying on canonical loops and pattern
// matching).
-static bool verifyTripCount(Value *RHS, Loop *L,
- SmallPtrSetImpl<Instruction *> &IterationInstructions,
+static bool verifyTripCount(
+ Value *RHS, Loop *L, SmallPtrSetImpl<Instruction *> &IterationInstructions,
PHINode *&InductionPHI, Value *&TripCount, BinaryOperator *&Increment,
- BranchInst *&BackBranch, ScalarEvolution *SE, bool IsWidened) {
+ CondBrInst *&BackBranch, ScalarEvolution *SE, bool IsWidened) {
const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
LLVM_DEBUG(dbgs() << "Backedge-taken count is not predictable\n");
@@ -389,7 +389,7 @@ static bool verifyTripCount(Value *RHS, Loop *L,
static bool findLoopComponents(
Loop *L, SmallPtrSetImpl<Instruction *> &IterationInstructions,
PHINode *&InductionPHI, Value *&TripCount, BinaryOperator *&Increment,
- BranchInst *&BackBranch, ScalarEvolution *SE, bool IsWidened) {
+ CondBrInst *&BackBranch, ScalarEvolution *SE, bool IsWidened) {
LLVM_DEBUG(dbgs() << "Finding components of loop: " << L->getName() << "\n");
if (!L->isLoopSimplifyForm()) {
@@ -438,7 +438,7 @@ static bool findLoopComponents(
LLVM_DEBUG(dbgs() << "Could not find valid comparison\n");
return false;
}
- BackBranch = cast<BranchInst>(Latch->getTerminator());
+ BackBranch = cast<CondBrInst>(Latch->getTerminator());
IterationInstructions.insert(BackBranch);
LLVM_DEBUG(dbgs() << "Found back branch: "; BackBranch->dump());
IterationInstructions.insert(Compare);
@@ -580,9 +580,8 @@ checkOuterLoopInsts(FlattenInfo &FI,
continue;
// The unconditional branch to the inner loop's header will turn into
// a fall-through, so adds no cost.
- BranchInst *Br = dyn_cast<BranchInst>(&I);
- if (Br && Br->isUnconditional() &&
- Br->getSuccessor(0) == FI.InnerLoop->getHeader())
+ UncondBrInst *Br = dyn_cast<UncondBrInst>(&I);
+ if (Br && Br->getSuccessor() == FI.InnerLoop->getHeader())
continue;
// Multiplies of the outer iteration variable and inner iteration
// count will be optimised out.
@@ -785,7 +784,7 @@ static bool DoFlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
BasicBlock *InnerExitBlock = FI.InnerLoop->getExitBlock();
BasicBlock *InnerExitingBlock = FI.InnerLoop->getExitingBlock();
Instruction *Term = InnerExitingBlock->getTerminator();
- Instruction *BI = BranchInst::Create(InnerExitBlock, InnerExitingBlock);
+ Instruction *BI = UncondBrInst::Create(InnerExitBlock, InnerExitingBlock);
BI->setDebugLoc(Term->getDebugLoc());
Term->eraseFromParent();
@@ -973,9 +972,7 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
// Check for overflow by calculating the new tripcount using
// umul_with_overflow and then checking if it overflowed.
- BranchInst *Br = cast<BranchInst>(CheckBlock->getTerminator());
- assert(Br->isConditional() &&
- "Expected LoopVersioning to generate a conditional branch");
+ CondBrInst *Br = cast<CondBrInst>(CheckBlock->getTerminator());
assert(match(Br->getCondition(), m_Zero()) &&
"Expected branch condition to be false");
IRBuilder<> Builder(Br);
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index a5ea40370377d..2c0532ad8c7ea 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -166,7 +166,7 @@ struct FusionCandidate {
/// Are all of the members of this fusion candidate still valid
bool Valid;
/// Guard branch of the loop, if it exists
- BranchInst *GuardBranch;
+ CondBrInst *GuardBranch;
/// Peeling Paramaters of the Loop.
TTI::PeelingPreferences PP;
/// Can you Peel this Loop?
@@ -275,8 +275,6 @@ struct FusionCandidate {
/// This method is only valid for guarded loops.
BasicBlock *getNonLoopBlock() const {
assert(GuardBranch && "Only valid on guarded loops.");
- assert(GuardBranch->isConditional() &&
- "Expecting guard to be a conditional branch.");
if (Peeled)
return GuardBranch->getSuccessor(1);
return (GuardBranch->getSuccessor(0) == Preheader)
@@ -742,7 +740,7 @@ struct LoopFuser {
BasicBlock *Succ = CurrentBranch->getSuccessor(0);
if (Succ == BB)
Succ = CurrentBranch->getSuccessor(1);
- ReplaceInstWithInst(CurrentBranch, BranchInst::Create(Succ));
+ ReplaceInstWithInst(CurrentBranch, UncondBrInst::Create(Succ));
}
DTU.applyUpdates(TreeUpdates);
@@ -1473,13 +1471,12 @@ struct LoopFuser {
/// Modify the latch branch of FC to be unconditional since successors of the
/// branch are the same.
void simplifyLatchBranch(const FusionCandidate &FC) const {
- BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator());
+ CondBrInst *FCLatchBranch = dyn_cast<CondBrInst>(FC.Latch->getTerminator());
if (FCLatchBranch) {
- assert(FCLatchBranch->isConditional() &&
- FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) &&
+ assert(FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) &&
"Expecting the two successors of FCLatchBranch to be the same");
- BranchInst *NewBranch =
- BranchInst::Create(FCLatchBranch->getSuccessor(0));
+ UncondBrInst *NewBranch =
+ UncondBrInst::Create(FCLatchBranch->getSuccessor(0));
ReplaceInstWithInst(FCLatchBranch, NewBranch);
}
}
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index d6200b1909226..74bb9272080e3 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1581,7 +1581,7 @@ bool LoopIdiomRecognize::optimizeCRCLoop(const PolynomialInfo &Info) {
{
unsigned NewBTC = (Info.TripCount / 8) - 1;
BasicBlock *LoopBlk = CurLoop->getLoopLatch();
- BranchInst *BrInst = cast<BranchInst>(LoopBlk->getTerminator());
+ CondBrInst *BrInst = cast<CondBrInst>(LoopBlk->getTerminator());
CmpPredicate ExitPred = BrInst->getSuccessor(0) == LoopBlk
? ICmpInst::Predicate::ICMP_NE
: ICmpInst::Predicate::ICMP_EQ;
@@ -1707,11 +1707,8 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() {
/// behavior, the variable involved in the comparison is returned. This function
/// will be called to see if the precondition and postcondition of the loop are
/// in desirable form.
-static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
+static Value *matchCondition(CondBrInst *BI, BasicBlock *LoopEntry,
bool JmpOnZero = false) {
- if (!BI || !BI->isConditional())
- return nullptr;
-
ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
if (!Cond)
return nullptr;
@@ -1750,11 +1747,8 @@ class StrlenVerifier {
// It should have a preheader and a branch instruction.
BasicBlock *Preheader = CurLoop->getLoopPreheader();
- if (!Preheader)
- return false;
-
- BranchInst *EntryBI = dyn_cast<BranchInst>(Preheader->getTerminator());
- if (!EntryBI)
+ if (!Preheader ||
+ !isa<UncondBrInst, CondBrInst>(Preheader->getTerminator()))
return false;
// The loop exit must be conditioned on an icmp with 0 the null terminator.
@@ -1766,7 +1760,9 @@ class StrlenVerifier {
if (!LoopBody || LoopBody->size() >= 15)
return false;
- BranchInst *LoopTerm = dyn_cast<BranchInst>(LoopBody->getTerminator());
+ CondBrInst *LoopTerm = dyn_cast<CondBrInst>(LoopBody->getTerminator());
+ if (!LoopTerm)
+ return false;
Value *LoopCond = matchCondition(LoopTerm, LoopBody);
if (!LoopCond)
return false;
@@ -1923,8 +1919,8 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
BasicBlock *Preheader = CurLoop->getLoopPreheader();
BasicBlock *LoopBody = *CurLoop->block_begin();
BasicBlock *LoopExitBB = CurLoop->getExitBlock();
- BranchInst *LoopTerm = dyn_cast<BranchInst>(LoopBody->getTerminator());
- assert(Preheader && LoopBody && LoopExitBB && LoopTerm &&
+ CondBrInst *LoopTerm = cast<CondBrInst>(LoopBody->getTerminator());
+ assert(Preheader && LoopBody && LoopExitBB &&
"Should be verified to be valid by StrlenVerifier");
if (Verifier.OpWidth == 8) {
@@ -1987,8 +1983,7 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
// LoopDeletion only delete invariant loops with known trip-count. We can
// update the condition so it will reliably delete the invariant loop
- assert(LoopTerm->getNumSuccessors() == 2 &&
- (LoopTerm->getSuccessor(0) == LoopBody ||
+ assert((LoopTerm->getSuccessor(0) == LoopBody ||
LoopTerm->getSuccessor(1) == LoopBody) &&
"loop body must have a successor that is it self");
ConstantInt *NewLoopCond = LoopTerm->getSuccessor(0) == LoopBody
@@ -2012,11 +2007,8 @@ bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
/// comparison between a variable and a constant, and if the comparison is false
/// the control yields to the loop entry. If the branch matches the behaviour,
/// the variable involved in the comparison is returned.
-static Value *matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry,
+static Value *matchShiftULTCondition(CondBrInst *BI, BasicBlock *LoopEntry,
APInt &Threshold) {
- if (!BI || !BI->isConditional())
- return nullptr;
-
ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
if (!Cond)
return nullptr;
@@ -2087,9 +2079,10 @@ static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL,
LoopEntry = *(CurLoop->block_begin());
// step 1: Check if the loop-back branch is in desirable form.
- if (Value *T = matchShiftULTCondition(
- dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry,
- Threshold))
+ auto *EntryBI = dyn_cast<CondBrInst>(LoopEntry->getTerminator());
+ if (!EntryBI)
+ return false;
+ if (Value *T = matchShiftULTCondition(EntryBI, LoopEntry, Threshold))
DefX = dyn_cast<Instruction>(T);
else
return false;
@@ -2189,11 +2182,10 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
// step 1: Check if the loop-back branch is in desirable form.
{
- if (Value *T = matchCondition(
- dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
- DefX2 = dyn_cast<Instruction>(T);
- else
+ auto *LoopTerm = dyn_cast<CondBrInst>(LoopEntry->getTerminator());
+ if (!LoopTerm)
return false;
+ DefX2 = dyn_cast_or_null<Instruction>(matchCondition(LoopTerm, LoopEntry));
}
// step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
@@ -2265,7 +2257,9 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
// step 5: check if the precondition is in this form:
// "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
{
- auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ auto *PreCondBr = dyn_cast<CondBrInst>(PreCondBB->getTerminator());
+ if (!PreCondBr)
+ return false;
Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
return false;
@@ -2318,11 +2312,10 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
LoopEntry = *(CurLoop->block_begin());
// step 1: Check if the loop-back branch is in desirable form.
- if (Value *T = matchCondition(
- dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
- DefX = dyn_cast<Instruction>(T);
- else
+ auto *LoopTerm = dyn_cast<CondBrInst>(LoopEntry->getTerminator());
+ if (!LoopTerm)
return false;
+ DefX = dyn_cast_or_null<Instruction>(matchCondition(LoopTerm, LoopEntry));
// step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
if (!DefX || !DefX->isShift())
@@ -2438,7 +2431,7 @@ bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID,
auto *PreCondBB = PH->getSinglePredecessor();
if (!PreCondBB)
return false;
- auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ auto *PreCondBI = dyn_cast<CondBrInst>(PreCondBB->getTerminator());
if (!PreCondBI)
return false;
if (matchCondition(PreCondBI, PH) != InitX)
@@ -2520,7 +2513,7 @@ bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
auto *PreCondBB = PH->getSinglePredecessor();
if (!PreCondBB)
return false;
- auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ auto *PreCondBI = dyn_cast<CondBrInst>(PreCondBB->getTerminator());
if (!PreCondBI)
return false;
@@ -2577,8 +2570,8 @@ bool LoopIdiomRecognize::recognizePopcount() {
BasicBlock *PH = CurLoop->getLoopPreheader();
if (!PH || &PH->front() != PH->getTerminator())
return false;
- auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
- if (!EntryBI || EntryBI->isConditional())
+ auto *EntryBI = dyn_cast<UncondBrInst>(PH->getTerminator());
+ if (!EntryBI)
return false;
// It should have a precondition block where the generated popcount intrinsic
@@ -2586,8 +2579,8 @@ bool LoopIdiomRecognize::recognizePopcount() {
auto *PreCondBB = PH->getSinglePredecessor();
if (!PreCondBB)
return false;
- auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
- if (!PreCondBI || PreCondBI->isUnconditional())
+ auto *PreCondBI = dyn_cast<CondBrInst>(PreCondBB->getTerminator());
+ if (!PreCondBI)
return false;
Instruction *CntInst;
@@ -2658,10 +2651,8 @@ void LoopIdiomRecognize::transformLoopToCountable(
Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) {
- BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
-
// Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
- IRBuilder<> Builder(PreheaderBr);
+ IRBuilder<> Builder(Preheader->getTerminator());
Builder.SetCurrentDebugLocation(DL);
// If there are no uses of CntPhi create:
@@ -2717,7 +2708,7 @@ void LoopIdiomRecognize::transformLoopToCountable(
// ...
// Br: loop if (Dec != 0)
BasicBlock *Body = *(CurLoop->block_begin());
- auto *LbBr = cast<BranchInst>(Body->getTerminator());
+ auto *LbBr = cast<CondBrInst>(Body->getTerminator());
ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
PHINode *TcPhi = PHINode::Create(CountTy, 2, "tcphi");
@@ -2752,7 +2743,7 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
Instruction *CntInst,
PHINode *CntPhi, Value *Var) {
BasicBlock *PreHead = CurLoop->getLoopPreheader();
- auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
+ auto *PreCondBr = cast<CondBrInst>(PreCondBB->getTerminator());
const DebugLoc &DL = CntInst->getDebugLoc();
// Assuming before transformation, the loop is following:
@@ -2823,7 +2814,7 @@ void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
// do { cnt++; x &= x-1; t--) } while (t > 0);
BasicBlock *Body = *(CurLoop->block_begin());
{
- auto *LbBr = cast<BranchInst>(Body->getTerminator());
+ auto *LbBr = cast<CondBrInst>(Body->getTerminator());
ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
Type *Ty = TripCnt->getType();
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 2fd62aae3e0c2..1311acedb6c09 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -816,12 +816,7 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
// A perfectly nested loop will not have any branch in between the outer and
// inner block, i.e. the outer header will branch to either the inner preheader
// or the outer loop latch.
- BranchInst *OuterLoopHeaderBI =
- dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
- if (!OuterLoopHeaderBI)
- return false;
-
- for (BasicBlock *Succ : successors(OuterLoopHeaderBI))
+ for (BasicBlock *Succ : successors(OuterLoopHeader))
if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
Succ != OuterLoopLatch)
return false;
@@ -901,9 +896,9 @@ bool LoopInterchangeLegality::isLoopStructureUnderstood() {
// for(int i=0;i<N;i++)
// for(int j=0;j*i<N;j++)
BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
- BranchInst *InnerLoopLatchBI =
- dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
- if (!InnerLoopLatchBI->isConditional())
+ CondBrInst *InnerLoopLatchBI =
+ dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
+ if (!InnerLoopLatchBI)
return false;
if (CmpInst *InnerLoopCmp =
dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) {
@@ -1256,8 +1251,8 @@ bool LoopInterchangeLegality::currentLimitations() {
// blocks.
if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
- !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
- !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
+ !isa<CondBrInst>(InnerLoopLatch->getTerminator()) ||
+ !isa<CondBrInst>(OuterLoop->getLoopLatch()->getTerminator())) {
LLVM_DEBUG(
dbgs() << "Loops where the latch is not the exiting block are not"
<< " supported currently.\n");
@@ -1999,7 +1994,7 @@ bool LoopInterchangeTransform::transform(
// FIXME: Should we interchange when we have a constant condition?
Instruction *CondI = dyn_cast<Instruction>(
- cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator())
+ cast<CondBrInst>(InnerLoop->getLoopLatch()->getTerminator())
->getCondition());
if (CondI)
WorkList.insert(CondI);
@@ -2076,14 +2071,14 @@ static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
// Update BI to jump to NewBB instead of OldBB. Records updates to the
// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
// \p OldBB appears exactly once in BI's successor list.
-static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
+static void updateSuccessor(Instruction *Term, BasicBlock *OldBB,
BasicBlock *NewBB,
std::vector<DominatorTree::UpdateType> &DTUpdates,
bool MustUpdateOnce = true) {
- assert((!MustUpdateOnce || llvm::count(successors(BI), OldBB) == 1) &&
+ assert((!MustUpdateOnce || llvm::count(successors(Term), OldBB) == 1) &&
"BI must jump to OldBB exactly once.");
bool Changed = false;
- for (Use &Op : BI->operands())
+ for (Use &Op : Term->operands())
if (Op == OldBB) {
Op.set(NewBB);
Changed = true;
@@ -2091,9 +2086,9 @@ static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
if (Changed) {
DTUpdates.push_back(
- {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
+ {DominatorTree::UpdateKind::Insert, Term->getParent(), NewBB});
DTUpdates.push_back(
- {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
+ {DominatorTree::UpdateKind::Delete, Term->getParent(), OldBB});
}
assert(Changed && "Expected a successor to be updated");
}
@@ -2274,24 +2269,21 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
BasicBlock *InnerLoopLatchSuccessor;
BasicBlock *OuterLoopLatchSuccessor;
- BranchInst *OuterLoopLatchBI =
- dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
- BranchInst *InnerLoopLatchBI =
- dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
- BranchInst *OuterLoopHeaderBI =
- dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
- BranchInst *InnerLoopHeaderBI =
- dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
+ CondBrInst *OuterLoopLatchBI =
+ dyn_cast<CondBrInst>(OuterLoopLatch->getTerminator());
+ CondBrInst *InnerLoopLatchBI =
+ dyn_cast<CondBrInst>(InnerLoopLatch->getTerminator());
+ Instruction *OuterLoopHeaderBI = OuterLoopHeader->getTerminator();
+ Instruction *InnerLoopHeaderBI = InnerLoopHeader->getTerminator();
if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
!OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
!InnerLoopHeaderBI)
return false;
- BranchInst *InnerLoopLatchPredecessorBI =
- dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
- BranchInst *OuterLoopPredecessorBI =
- dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
+ Instruction *InnerLoopLatchPredecessorBI =
+ InnerLoopLatchPredecessor->getTerminator();
+ Instruction *OuterLoopPredecessorBI = OuterLoopPredecessor->getTerminator();
if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
return false;
@@ -2307,7 +2299,7 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
// The outer loop header might or might not branch to the outer latch.
// We are guaranteed to branch to the inner loop preheader.
- if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch)) {
+ if (llvm::is_contained(successors(OuterLoopHeaderBI), OuterLoopLatch)) {
// In this case the outerLoopHeader should branch to the InnerLoopLatch.
updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
DTUpdates,
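updateSuccessor now takes a plain Instruction and rewires block operands directly, which is what lets the header and predecessor terminators be handled without knowing their concrete branch class. A reduced sketch of that rewiring (dominator-tree updates omitted; the helper name is illustrative):

#include "llvm/IR/Instructions.h"

using namespace llvm;

// Redirect every edge Term -> OldBB so it targets NewBB instead. Works for
// any terminator because successor blocks are ordinary operands.
static bool retargetSuccessor(Instruction *Term, BasicBlock *OldBB,
                              BasicBlock *NewBB) {
  bool Changed = false;
  for (Use &Op : Term->operands())
    if (Op == OldBB) {
      Op.set(NewBB);
      Changed = true;
    }
  return Changed;
}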
diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
index b14900d6dd635..de5365271e233 100644
--- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -310,7 +310,8 @@ class LoopPredication {
SmallVectorImpl<Value *> &WidenedChecks,
SCEVExpander &Expander, Instruction *Guard);
bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
- bool widenWidenableBranchGuardConditions(BranchInst *Guard, SCEVExpander &Expander);
+ bool widenWidenableBranchGuardConditions(CondBrInst *Guard,
+ SCEVExpander &Expander);
// If the loop always exits through another block in the loop, we should not
// predicate based on the latch check. For example, the latch check can be a
// very coarse grained check and there can be more fine grained exit checks
@@ -755,7 +756,7 @@ bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
}
bool LoopPredication::widenWidenableBranchGuardConditions(
- BranchInst *BI, SCEVExpander &Expander) {
+ CondBrInst *BI, SCEVExpander &Expander) {
assert(isGuardAsWidenableBranch(BI) && "Must be!");
LLVM_DEBUG(dbgs() << "Processing guard:\n");
LLVM_DEBUG(BI->dump());
@@ -813,8 +814,8 @@ std::optional<LoopICmp> LoopPredication::parseLoopLatchICmp() {
return std::nullopt;
}
- auto *BI = dyn_cast<BranchInst>(LoopLatch->getTerminator());
- if (!BI || !BI->isConditional()) {
+ auto *BI = dyn_cast<CondBrInst>(LoopLatch->getTerminator());
+ if (!BI) {
LLVM_DEBUG(dbgs() << "Failed to match the latch terminator!\n");
return std::nullopt;
}
@@ -970,7 +971,7 @@ bool LoopPredication::isLoopProfitableToPredicate() {
/// If we can (cheaply) find a widenable branch which controls entry into the
/// loop, return it.
-static BranchInst *FindWidenableTerminatorAboveLoop(Loop *L, LoopInfo &LI) {
+static CondBrInst *FindWidenableTerminatorAboveLoop(Loop *L, LoopInfo &LI) {
// Walk back through any unconditionally executed blocks and see if we can find
// a widenable condition which seems to control execution of this loop. Note
// that we predict that maythrow calls are likely untaken and thus that it's
@@ -990,7 +991,7 @@ static BranchInst *FindWidenableTerminatorAboveLoop(Loop *L, LoopInfo &LI) {
} while (true);
if (BasicBlock *Pred = BB->getSinglePredecessor()) {
- if (auto *BI = dyn_cast<BranchInst>(Pred->getTerminator()))
+ if (auto *BI = dyn_cast<CondBrInst>(Pred->getTerminator()))
if (BI->getSuccessor(0) == BB && isWidenableBranch(BI))
return BI;
}
@@ -1076,7 +1077,7 @@ bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
if (LI->getLoopFor(ExitingBB) != L)
continue;
- auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ auto *BI = dyn_cast<CondBrInst>(ExitingBB->getTerminator());
if (!BI)
continue;
@@ -1121,7 +1122,7 @@ bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
continue;
// Can't rewrite non-branch yet.
- auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ auto *BI = dyn_cast<CondBrInst>(ExitingBB->getTerminator());
if (!BI)
continue;
@@ -1222,7 +1223,7 @@ bool LoopPredication::runOnLoop(Loop *Loop) {
// Collect all the guards into a vector and process later, so as not
// to invalidate the instruction iterator.
SmallVector<IntrinsicInst *, 4> Guards;
- SmallVector<BranchInst *, 4> GuardsAsWidenableBranches;
+ SmallVector<CondBrInst *, 4> GuardsAsWidenableBranches;
for (const auto BB : L->blocks()) {
for (auto &I : *BB)
if (isGuard(&I))
@@ -1230,7 +1231,7 @@ bool LoopPredication::runOnLoop(Loop *Loop) {
if (PredicateWidenableBranchGuards &&
isGuardAsWidenableBranch(BB->getTerminator()))
GuardsAsWidenableBranches.push_back(
- cast<BranchInst>(BB->getTerminator()));
+ cast<CondBrInst>(BB->getTerminator()));
}
SCEVExpander Expander(*SE, "loop-predication");
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index e902b71776973..328d842243f19 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -50,9 +50,7 @@ STATISTIC(NumLoopExitsDeleted,
/// return nullptr.
static BasicBlock *getOnlyLiveSuccessor(BasicBlock *BB) {
Instruction *TI = BB->getTerminator();
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- if (BI->isUnconditional())
- return nullptr;
+ if (CondBrInst *BI = dyn_cast<CondBrInst>(TI)) {
if (BI->getSuccessor(0) == BI->getSuccessor(1))
return BI->getSuccessor(0);
ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
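The getOnlyLiveSuccessor hunk is the constant-folding counterpart: once unconditional branches are excluded by the type, only the constant-condition and same-successor cases remain. A small sketch under the same CondBrInst assumption, with an illustrative helper name:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h" // assumed: CondBrInst

using namespace llvm;

// Return the single successor that can actually be taken, or null if both can.
static BasicBlock *takenSuccessor(CondBrInst *BI) {
  if (BI->getSuccessor(0) == BI->getSuccessor(1))
    return BI->getSuccessor(0);
  if (auto *Cond = dyn_cast<ConstantInt>(BI->getCondition()))
    return Cond->isZero() ? BI->getSuccessor(1) : BI->getSuccessor(0);
  return nullptr; // Non-constant condition: both successors are live.
}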
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 337500aec1dcf..3a60d6538d6a7 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1325,12 +1325,6 @@ class LSRUse {
/// changing the formula.
bool RigidFormula = false;
- /// This records the widest use type for any fixup using this
- /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
- /// fixup widths to be equivalent, because the narrower one may be relying on
- /// the implicit truncation to truncate away bogus bits.
- Type *WidestFixupType = nullptr;
-
/// A list of ways to build a value that can satisfy this user. After the
/// list is populated, one of these is selected heuristically and used to
/// formulate a replacement for OperandValToReplace in UserInst.
@@ -1791,9 +1785,6 @@ void LSRUse::print(raw_ostream &OS) const {
if (AllFixupsUnconditional)
OS << ", all-fixups-unconditional";
-
- if (WidestFixupType)
- OS << ", widest fixup type: " << *WidestFixupType;
}
LLVM_DUMP_METHOD void LSRUse::dump() const {
@@ -2631,8 +2622,8 @@ LSRInstance::OptimizeLoopTermCond() {
// induction variable, to allow coalescing the live ranges for the IV into
// one register value.
- BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
- if (!TermBr || TermBr->isUnconditional())
+ CondBrInst *TermBr = dyn_cast<CondBrInst>(ExitingBlock->getTerminator());
+ if (!TermBr)
continue;
Instruction *Cond = dyn_cast<Instruction>(TermBr->getCondition());
@@ -2872,10 +2863,8 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
// Ignore ICmpZero uses because they may contain formulae generated by
// GenerateICmpZeroScales, in which case adding fixup offsets may
// be invalid.
- if (&LU != &OrigLU &&
- LU.Kind != LSRUse::ICmpZero &&
+ if (&LU != &OrigLU && LU.Kind != LSRUse::ICmpZero &&
LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
- LU.WidestFixupType == OrigLU.WidestFixupType &&
LU.HasFormulaWithSameRegs(OrigF)) {
// Scan through this use's formulae.
for (const Formula &F : LU.Formulae) {
@@ -3641,11 +3630,6 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
VisitedLSRUse.insert(LUIdx);
}
- if (!LU.WidestFixupType ||
- SE.getTypeSizeInBits(LU.WidestFixupType) <
- SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
- LU.WidestFixupType = LF.OperandValToReplace->getType();
-
// If this is the first use of this LSRUse, give it a formula.
if (LU.Formulae.empty()) {
InsertInitialFormula(S, LU, LUIdx);
@@ -3836,10 +3820,6 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
LF.Offset = Offset;
LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
LU.AllFixupsUnconditional &= IsFixupExecutedEachIncrement(LF);
- if (!LU.WidestFixupType ||
- SE.getTypeSizeInBits(LU.WidestFixupType) <
- SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
- LU.WidestFixupType = LF.OperandValToReplace->getType();
InsertSupplementalFormula(US, LU, LUIdx);
CountRegisters(LU.Formulae.back(), Uses.size() - 1);
break;
@@ -4961,6 +4941,8 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
for (const Formula &F : LU.Formulae) {
if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1))
continue;
+ assert((LU.Kind == LSRUse::Address || LU.Kind == LSRUse::ICmpZero) &&
+ "Only address and cmp uses expected to have nonzero BaseOffset");
LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
if (!LUThatHas)
@@ -4982,6 +4964,13 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
}
+#ifndef NDEBUG
+ Type *FixupType = LUThatHas->Fixups[0].OperandValToReplace->getType();
+ for (LSRFixup &Fixup : LUThatHas->Fixups)
+ assert(Fixup.OperandValToReplace->getType() == FixupType &&
+ "Expected all fixups to have the same type");
+#endif
+
// Delete formulae from the new use which are no longer legal.
bool Any = false;
for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
diff --git a/llvm/lib/Transforms/Scalar/LoopTermFold.cpp b/llvm/lib/Transforms/Scalar/LoopTermFold.cpp
index 26964ab1e8745..e6eccdd7d616e 100644
--- a/llvm/lib/Transforms/Scalar/LoopTermFold.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopTermFold.cpp
@@ -67,8 +67,8 @@ canFoldTermCondOfLoop(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
}
BasicBlock *LoopLatch = L->getLoopLatch();
- BranchInst *BI = dyn_cast<BranchInst>(LoopLatch->getTerminator());
- if (!BI || BI->isUnconditional())
+ CondBrInst *BI = dyn_cast<CondBrInst>(LoopLatch->getTerminator());
+ if (!BI)
return std::nullopt;
auto *TermCond = dyn_cast<ICmpInst>(BI->getCondition());
if (!TermCond) {
@@ -274,7 +274,7 @@ static bool RunTermFold(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
<< *TermValue << "\n");
// Create new terminating condition at loop latch
- BranchInst *BI = cast<BranchInst>(LoopLatch->getTerminator());
+ CondBrInst *BI = cast<CondBrInst>(LoopLatch->getTerminator());
ICmpInst *OldTermCond = cast<ICmpInst>(BI->getCondition());
IRBuilder<> LatchBuilder(LoopLatch->getTerminator());
Value *NewTermCond =
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index cdd99fad10afa..5142f90ac7079 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -609,16 +609,14 @@ static std::optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
// Add in the live successors by first checking whether we have terminator
// that may be simplified based on the values simplified by this call.
BasicBlock *KnownSucc = nullptr;
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- if (BI->isConditional()) {
- if (auto *SimpleCond = getSimplifiedConstant(BI->getCondition())) {
- // Just take the first successor if condition is undef
- if (isa<UndefValue>(SimpleCond))
- KnownSucc = BI->getSuccessor(0);
- else if (ConstantInt *SimpleCondVal =
- dyn_cast<ConstantInt>(SimpleCond))
- KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
- }
+ if (CondBrInst *BI = dyn_cast<CondBrInst>(TI)) {
+ if (auto *SimpleCond = getSimplifiedConstant(BI->getCondition())) {
+ // Just take the first successor if condition is undef
+ if (isa<UndefValue>(SimpleCond))
+ KnownSucc = BI->getSuccessor(0);
+ else if (ConstantInt *SimpleCondVal =
+ dyn_cast<ConstantInt>(SimpleCond))
+ KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
}
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
if (auto *SimpleCond = getSimplifiedConstant(SI->getCondition())) {
diff --git a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
index dcbec927e5570..0c94ed27880f0 100644
--- a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
@@ -64,11 +64,9 @@ static bool replaceConditionalBranchesOnConstant(Instruction *II,
UnsimplifiedUsers.end());
for (auto &VH : Worklist) {
- BranchInst *BI = dyn_cast_or_null<BranchInst>(VH);
+ CondBrInst *BI = dyn_cast_or_null<CondBrInst>(VH);
if (!BI)
continue;
- if (BI->isUnconditional())
- continue;
BasicBlock *Target, *Other;
if (match(BI->getOperand(0), m_Zero())) {
@@ -85,7 +83,7 @@ static bool replaceConditionalBranchesOnConstant(Instruction *II,
BasicBlock *Source = BI->getParent();
Other->removePredecessor(Source);
- Instruction *NewBI = BranchInst::Create(Target, Source);
+ Instruction *NewBI = UncondBrInst::Create(Target, Source);
NewBI->setDebugLoc(BI->getDebugLoc());
BI->eraseFromParent();
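The LowerConstantIntrinsics hunk performs the standard fold from a conditional branch on a known constant to an unconditional branch. A sketch using only the calls visible in this patch (the helper name and the CondIsTrue parameter are illustrative):

#include "llvm/IR/Instructions.h" // assumed: CondBrInst/UncondBrInst

using namespace llvm;

// Replace 'br i1 <const>, %Taken, %Dead' with 'br %Taken', keeping PHI nodes
// in the dead successor consistent by removing this block as a predecessor.
static void foldConstantCondBr(CondBrInst *BI, bool CondIsTrue) {
  BasicBlock *Taken = BI->getSuccessor(CondIsTrue ? 0 : 1);
  BasicBlock *Dead = BI->getSuccessor(CondIsTrue ? 1 : 0);
  BasicBlock *Source = BI->getParent();
  Dead->removePredecessor(Source);
  Instruction *NewBI = UncondBrInst::Create(Taken, Source);
  NewBI->setDebugLoc(BI->getDebugLoc());
  BI->eraseFromParent();
}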
diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 17c5a4ee1fd0b..ed6a6aa52a23d 100644
--- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -193,18 +193,14 @@ static void handlePhiDef(CallInst *Expect) {
// Get the first dominating conditional branch of the operand
// i's incoming block.
- auto GetDomConditional = [&](unsigned i) -> BranchInst * {
+ auto GetDomConditional = [&](unsigned i) -> CondBrInst * {
BasicBlock *BB = PhiDef->getIncomingBlock(i);
- BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (BI && BI->isConditional())
+ if (CondBrInst *BI = dyn_cast<CondBrInst>(BB->getTerminator()))
return BI;
BB = BB->getSinglePredecessor();
if (!BB)
return nullptr;
- BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BI || BI->isUnconditional())
- return nullptr;
- return BI;
+ return dyn_cast<CondBrInst>(BB->getTerminator());
};
// Now walk through all Phi operands to find phi operands with values
@@ -226,7 +222,7 @@ static void handlePhiDef(CallInst *Expect) {
if (ExpectedValueIsLikely == (ExpectedPhiValue == CurrentPhiValue))
continue;
- BranchInst *BI = GetDomConditional(i);
+ CondBrInst *BI = GetDomConditional(i);
if (!BI)
continue;
@@ -272,7 +268,7 @@ static void handlePhiDef(CallInst *Expect) {
}
}
-// Handle both BranchInst and SelectInst.
+// Handle both CondBrInst and SelectInst.
template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
// Handle non-optimized IR code like:
@@ -354,20 +350,13 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
return true;
}
-static bool handleBranchExpect(BranchInst &BI) {
- if (BI.isUnconditional())
- return false;
-
- return handleBrSelExpect<BranchInst>(BI);
-}
-
static bool lowerExpectIntrinsic(Function &F) {
bool Changed = false;
for (BasicBlock &BB : F) {
// Create "block_weights" metadata.
- if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
- if (handleBranchExpect(*BI))
+ if (CondBrInst *BI = dyn_cast<CondBrInst>(BB.getTerminator())) {
+ if (handleBrSelExpect<CondBrInst>(*BI))
ExpectIntrinsicsHandled++;
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB.getTerminator())) {
if (handleSwitchExpect(*SI))
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index eaa97909695c7..0828ffe82af48 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1917,7 +1917,7 @@ class LowerMatrixIntrinsics {
"store.end",
GEPNoWrapFlags::inBounds() | GEPNoWrapFlags::noUnsignedWrap());
Value *LoadBegin = Load->getPointerOperand();
- BranchInst *BR1 = Builder.CreateCondBr(
+ CondBrInst *BR1 = Builder.CreateCondBr(
Builder.CreateICmpULT(LoadBegin, StoreEnd), Check1, Fusion);
setExplicitlyUnknownBranchWeightsIfProfiled(*BR1, DEBUG_TYPE);
@@ -1930,7 +1930,7 @@ class LowerMatrixIntrinsics {
LoadBegin, ConstantInt::get(AddrTy, LoadLoc.Size.getValue()),
"load.end",
GEPNoWrapFlags::inBounds() | GEPNoWrapFlags::noUnsignedWrap());
- BranchInst *BR2 = Builder.CreateCondBr(
+ CondBrInst *BR2 = Builder.CreateCondBr(
Builder.CreateICmpULT(StoreBegin, LoadEnd), Copy, Fusion);
setExplicitlyUnknownBranchWeightsIfProfiled(*BR2, DEBUG_TYPE);
diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp
index d4358c1a6a599..531d370b6d678 100644
--- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp
@@ -344,20 +344,17 @@ visitCmpBlock(Value *const Val, BasicBlock *const Block,
const BasicBlock *const PhiBlock, BaseIdentifier &BaseId) {
if (Block->empty())
return std::nullopt;
- auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
- if (!BranchI)
- return std::nullopt;
- LLVM_DEBUG(dbgs() << "branch\n");
+ auto *Term = Block->getTerminator();
Value *Cond;
ICmpInst::Predicate ExpectedPredicate;
- if (BranchI->isUnconditional()) {
+ if (isa<UncondBrInst>(Term)) {
// In this case, we expect an incoming value which is the result of the
// comparison. This is the last link in the chain of comparisons (note
// that this does not mean that this is the last incoming value, blocks
// can be reordered).
Cond = Val;
ExpectedPredicate = ICmpInst::ICMP_EQ;
- } else {
+ } else if (auto *BranchI = dyn_cast<CondBrInst>(Term)) {
// In this case, we expect a constant incoming value (the comparison is
// chained).
const auto *const Const = cast<ConstantInt>(Val);
@@ -370,7 +367,8 @@ visitCmpBlock(Value *const Val, BasicBlock *const Block,
Cond = BranchI->getCondition();
ExpectedPredicate =
FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
- }
+ } else
+ return std::nullopt;
auto *CmpI = dyn_cast<ICmpInst>(Cond);
if (!CmpI)
@@ -382,7 +380,7 @@ visitCmpBlock(Value *const Val, BasicBlock *const Block,
return std::nullopt;
BCECmpBlock::InstructionSet BlockInsts(
- {Result->Lhs.LoadI, Result->Rhs.LoadI, Result->CmpI, BranchI});
+ {Result->Lhs.LoadI, Result->Rhs.LoadI, Result->CmpI, Term});
if (Result->Lhs.GEP)
BlockInsts.insert(Result->Lhs.GEP);
if (Result->Rhs.GEP)
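visitCmpBlock now dispatches on the terminator's class rather than querying isUnconditional(). Its control flow, reduced to a skeleton under the same class assumptions (the helper name is illustrative):

#include "llvm/IR/Instructions.h" // assumed: CondBrInst/UncondBrInst

using namespace llvm;

// Which value feeds the comparison in this block? For the last block in a
// chain (unconditional terminator) it is the incoming phi value; for a
// chained block it is the branch condition; anything else is rejected.
static Value *comparedValue(BasicBlock *Block, Value *IncomingVal) {
  Instruction *Term = Block->getTerminator();
  if (isa<UncondBrInst>(Term))
    return IncomingVal;
  if (auto *Br = dyn_cast<CondBrInst>(Term))
    return Br->getCondition();
  return nullptr;
}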
diff --git a/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 60fbb689c33f3..fa86ad1f522b7 100644
--- a/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -137,8 +137,8 @@ BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
if (!BB)
return false;
- auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BI || !BI->isConditional())
+ auto *BI = dyn_cast<CondBrInst>(BB->getTerminator());
+ if (!BI)
return false;
BasicBlock *Succ0 = BI->getSuccessor(0);
diff --git a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index bcedc1ab2a3ca..f0426e916fb18 100644
--- a/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -70,9 +70,9 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
Builder.getTrue(), Call->getNextNode(), /*Unreachable=*/false,
/*BranchWeights*/ nullptr, DTU);
- auto *CurrBBTerm = cast<BranchInst>(CurrBB.getTerminator());
+ auto *CurrBBTerm = cast<CondBrInst>(CurrBB.getTerminator());
// We want an 'else' block though, not a 'then' block.
- cast<BranchInst>(CurrBBTerm)->swapSuccessors();
+ CurrBBTerm->swapSuccessors();
// Create phi that will merge results of either sqrt and replace all uses.
BasicBlock *JoinBB = LibCallTerm->getSuccessor(0);
diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index 3726f8dff9f17..ea6c394740f22 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -1626,7 +1626,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
if (CF->isNegative()) {
APFloat F(CF->getValueAPF());
F.changeSign();
- Factor = ConstantFP::get(CF->getContext(), F);
+ Factor = ConstantFP::get(CF->getType(), F);
if (!Duplicates.insert(Factor).second)
continue;
unsigned Occ = ++FactorOccurrences[Factor];
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 230df9f971a71..c0977ae1b4c6b 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -3115,9 +3115,8 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
// as long as all statepoints are in rare blocks. If we had in-register
// lowering for live values this would be a much safer transform.
auto getConditionInst = [](Instruction *TI) -> Instruction * {
- if (auto *BI = dyn_cast<BranchInst>(TI))
- if (BI->isConditional())
- return dyn_cast<Instruction>(BI->getCondition());
+ if (auto *BI = dyn_cast<CondBrInst>(TI))
+ return dyn_cast<Instruction>(BI->getCondition());
// TODO: Extend this to handle switches
return nullptr;
};
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index f0a1aa3367f5b..760b84000fe7b 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -1861,9 +1861,9 @@ static void rewriteMemOpOfSelect(SelectInst &SI, T &I,
SI.getMetadata(LLVMContext::MD_prof), &DTU,
/*LI=*/nullptr, /*ThenBlock=*/nullptr);
if (Spec.isSpeculatable(/*isTrueVal=*/true))
- cast<BranchInst>(Head->getTerminator())->swapSuccessors();
+ cast<CondBrInst>(Head->getTerminator())->swapSuccessors();
}
- auto *HeadBI = cast<BranchInst>(Head->getTerminator());
+ auto *HeadBI = cast<CondBrInst>(Head->getTerminator());
Spec = {}; // Do not use `Spec` beyond this point.
BasicBlock *Tail = I.getParent();
Tail->setName(Head->getName() + ".cont");
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index bc1f352186e13..5a38f11314b13 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -542,7 +542,7 @@ static bool canReorderAddSextToGEP(const GetElementPtrInst *GEP,
} else {
// (2^(N-1) + C) * stride
Threshold = (APInt::getSignedMinValue(N).zext(128) +
- CI->getValue().zextOrTrunc(128)) *
+ CI->getValue().sextOrTrunc(128)) *
APInt(128, Stride);
}
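The SeparateConstOffsetFromGEP change swaps zextOrTrunc for sextOrTrunc so a negative constant keeps its sign when widened to 128 bits; with zero-extension, a 32-bit C = -1 would enter the threshold formula as 2^32 - 1 instead of -1. A tiny APInt illustration (values chosen for this example only):

#include "llvm/ADT/APInt.h"

using llvm::APInt;

void widenExample() {
  APInt C(/*numBits=*/32, /*val=*/-1, /*isSigned=*/true);
  APInt Zext = C.zextOrTrunc(128); // 4294967295: wildly inflates the threshold
  APInt Sext = C.sextOrTrunc(128); // -1: preserves the intended negative offset
  (void)Zext;
  (void)Sext;
}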
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 531922babc50a..40a3a23814700 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -147,11 +147,11 @@ extern cl::opt<bool> ProfcheckDisableMetadataFixes;
AnalysisKey ShouldRunExtraSimpleLoopUnswitch::Key;
namespace {
struct CompareDesc {
- BranchInst *Term;
+ CondBrInst *Term;
Value *Invariant;
BasicBlock *InLoopSucc;
- CompareDesc(BranchInst *Term, Value *Invariant, BasicBlock *InLoopSucc)
+ CompareDesc(CondBrInst *Term, Value *Invariant, BasicBlock *InLoopSucc)
: Term(Term), Invariant(Invariant), InLoopSucc(InLoopSucc) {}
};
@@ -290,7 +290,7 @@ static void buildPartialUnswitchConditionalBranch(
BasicBlock &BB, ArrayRef<Value *> Invariants, bool Direction,
BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze,
const Instruction *I, AssumptionCache *AC, const DominatorTree &DT,
- const BranchInst &ComputeProfFrom) {
+ const CondBrInst &ComputeProfFrom) {
SmallVector<uint32_t> BranchWeights;
bool HasBranchWeights = EstimateProfile && !ProfcheckDisableMetadataFixes &&
@@ -337,7 +337,7 @@ static void buildPartialUnswitchConditionalBranch(
static void buildPartialInvariantUnswitchConditionalBranch(
BasicBlock &BB, ArrayRef<Value *> ToDuplicate, bool Direction,
BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, Loop &L,
- MemorySSAUpdater *MSSAU, const BranchInst &OriginalBranch) {
+ MemorySSAUpdater *MSSAU, const CondBrInst &OriginalBranch) {
ValueToValueMapTy VMap;
for (auto *Val : reverse(ToDuplicate)) {
Instruction *Inst = cast<Instruction>(Val);
@@ -566,10 +566,9 @@ static Loop *getTopMostExitingLoop(const BasicBlock *ExitBB,
///
/// If `SE` is not null, it will be updated based on the potential loop SCEVs
/// invalidated by this.
-static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
+static bool unswitchTrivialBranch(Loop &L, CondBrInst &BI, DominatorTree &DT,
LoopInfo &LI, ScalarEvolution *SE,
MemorySSAUpdater *MSSAU) {
- assert(BI.isConditional() && "Can only unswitch a conditional branch!");
LLVM_DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n");
// The loop invariant values that we want to unswitch.
@@ -692,7 +691,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
} else {
// Create a new unconditional branch that will continue the loop as a new
// terminator.
- Instruction *NewBI = BranchInst::Create(ContinueBB, ParentBB);
+ Instruction *NewBI = UncondBrInst::Create(ContinueBB, ParentBB);
NewBI->setDebugLoc(BI.getDebugLoc());
}
BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
@@ -730,7 +729,7 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
Instruction *Term = ParentBB->getTerminator();
// Remove the cloned branch instruction and create unconditional branch
// now.
- Instruction *NewBI = BranchInst::Create(ContinueBB, ParentBB);
+ Instruction *NewBI = UncondBrInst::Create(ContinueBB, ParentBB);
NewBI->setDebugLoc(Term->getDebugLoc());
Term->eraseFromParent();
MSSAU->removeEdge(ParentBB, LoopExitBB);
@@ -1037,7 +1036,7 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
/*KeepOneInputPHIs*/ true);
}
// Now nuke the switch and replace it with a direct branch.
- Instruction *NewBI = BranchInst::Create(CommonSuccBB, BB);
+ Instruction *NewBI = UncondBrInst::Create(CommonSuccBB, BB);
NewBI->setDebugLoc(SIW->getDebugLoc());
SIW.eraseFromParent();
} else if (DefaultExitBB) {
@@ -1155,15 +1154,15 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
// we can continue. The unswitching logic specifically works to fold any
// cases it can into an unconditional branch to make it easier to
// recognize here.
- auto *BI = dyn_cast<BranchInst>(CurrentBB->getTerminator());
- if (!BI || BI->isConditional())
+ auto *BI = dyn_cast<UncondBrInst>(CurrentBB->getTerminator());
+ if (!BI)
return Changed;
- CurrentBB = BI->getSuccessor(0);
+ CurrentBB = BI->getSuccessor();
continue;
}
- auto *BI = dyn_cast<BranchInst>(CurrentTerm);
+ auto *BI = dyn_cast<CondBrInst>(CurrentTerm);
if (!BI)
// We do not understand other terminator instructions.
return Changed;
@@ -1171,8 +1170,7 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
// Don't bother trying to unswitch past an unconditional branch or a branch
// with a constant value. These should be removed by simplifycfg prior to
// running this pass.
- if (!BI->isConditional() ||
- isa<Constant>(skipTrivialSelect(BI->getCondition())))
+ if (isa<Constant>(skipTrivialSelect(BI->getCondition())))
return Changed;
// Found a trivial condition candidate: non-foldable conditional branch. If
@@ -1185,12 +1183,11 @@ static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
// If we only unswitched some of the conditions feeding the branch, we won't
// have collapsed it to a single successor.
- BI = cast<BranchInst>(CurrentBB->getTerminator());
- if (BI->isConditional())
+ if (isa<CondBrInst>(CurrentBB->getTerminator()))
return Changed;
// Follow the newly unconditional branch into its successor.
- CurrentBB = BI->getSuccessor(0);
+ CurrentBB = cast<UncondBrInst>(CurrentBB->getTerminator())->getSuccessor();
// When continuing, if we exit the loop or reach a previously visited block,
// then we cannot reach any trivial condition candidates (unfoldable
@@ -1371,12 +1368,12 @@ static BasicBlock *buildClonedLoopBlocks(
// Trivial Simplification. If Terminator is a conditional branch and
// condition becomes dead - erase it.
Value *ClonedConditionToErase = nullptr;
- if (auto *BI = dyn_cast<BranchInst>(ClonedTerminator))
+ if (auto *BI = dyn_cast<CondBrInst>(ClonedTerminator))
ClonedConditionToErase = BI->getCondition();
else if (auto *SI = dyn_cast<SwitchInst>(ClonedTerminator))
ClonedConditionToErase = SI->getCondition();
- Instruction *BI = BranchInst::Create(ClonedSuccBB, ClonedParentBB);
+ Instruction *BI = UncondBrInst::Create(ClonedSuccBB, ClonedParentBB);
BI->setDebugLoc(ClonedTerminator->getDebugLoc());
ClonedTerminator->eraseFromParent();
@@ -2238,7 +2235,7 @@ static void unswitchNontrivialInvariants(
AssumptionCache &AC, ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
LPMUpdater &LoopUpdater, bool InsertFreeze, bool InjectedCondition) {
auto *ParentBB = TI.getParent();
- BranchInst *BI = dyn_cast<BranchInst>(&TI);
+ CondBrInst *BI = dyn_cast<CondBrInst>(&TI);
SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(&TI);
// Save the current loop name in a variable so that we can report it even
@@ -2248,8 +2245,7 @@ static void unswitchNontrivialInvariants(
// We can only unswitch switches, conditional branches with an invariant
// condition, or combining invariant conditions with an instruction or
// partially invariant instructions.
- assert((SI || (BI && BI->isConditional())) &&
- "Can only unswitch switches and conditional branch!");
+ assert((SI || BI) && "Can only unswitch switches and conditional branch!");
bool PartiallyInvariant = !PartialIVInfo.InstToDuplicate.empty();
bool FullUnswitch =
SI || (skipTrivialSelect(BI->getCondition()) == Invariants[0] &&
@@ -2422,7 +2418,7 @@ static void unswitchNontrivialInvariants(
Value *Cond = skipTrivialSelect(BI->getCondition());
if (InsertFreeze) {
// We don't give any debug location to the new freeze, because the
- // BI (`dyn_cast<BranchInst>(TI)`) is an in-loop instruction hoisted
+ // BI (`dyn_cast<CondBrInst>(TI)`) is an in-loop instruction hoisted
// out of the loop.
Cond = new FreezeInst(Cond, Cond->getName() + ".fr", BI->getIterator());
cast<Instruction>(Cond)->setDebugLoc(DebugLoc::getDropped());
@@ -2511,7 +2507,7 @@ static void unswitchNontrivialInvariants(
// Create a new unconditional branch to the continuing block (as opposed to
// the one cloned).
- Instruction *NewBI = BranchInst::Create(RetainedSuccBB, ParentBB);
+ Instruction *NewBI = UncondBrInst::Create(RetainedSuccBB, ParentBB);
NewBI->setDebugLoc(NewTI->getDebugLoc());
// After MSSAU update, remove the cloned terminator instruction NewTI.
@@ -2771,7 +2767,7 @@ static InstructionCost computeDomSubtreeCost(
///
/// It also makes all relevant DT and LI updates, so that all structures are in
/// valid state after this transform.
-static BranchInst *turnSelectIntoBranch(SelectInst *SI, DominatorTree &DT,
+static CondBrInst *turnSelectIntoBranch(SelectInst *SI, DominatorTree &DT,
LoopInfo &LI, MemorySSAUpdater *MSSAU,
AssumptionCache *AC) {
LLVM_DEBUG(dbgs() << "Turning " << *SI << " into a branch.\n");
@@ -2780,7 +2776,7 @@ static BranchInst *turnSelectIntoBranch(SelectInst *SI, DominatorTree &DT,
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
SplitBlockAndInsertIfThen(SI->getCondition(), SI, false,
SI->getMetadata(LLVMContext::MD_prof), &DTU, &LI);
- auto *CondBr = cast<BranchInst>(HeadBB->getTerminator());
+ auto *CondBr = cast<CondBrInst>(HeadBB->getTerminator());
BasicBlock *ThenBB = CondBr->getSuccessor(0),
*TailBB = CondBr->getSuccessor(1);
if (MSSAU)
@@ -2822,7 +2818,7 @@ static BranchInst *turnSelectIntoBranch(SelectInst *SI, DominatorTree &DT,
///
/// It also makes all relevant DT and LI updates, so that all structures are in
/// valid state after this transform.
-static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
+static CondBrInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
DominatorTree &DT, LoopInfo &LI,
MemorySSAUpdater *MSSAU) {
LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n");
@@ -2840,7 +2836,7 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
? MDBuilder(GI->getContext()).createUnlikelyBranchWeights()
: nullptr,
&DTU, &LI);
- BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
+ CondBrInst *CheckBI = cast<CondBrInst>(CheckBB->getTerminator());
// SplitBlockAndInsertIfThen inserts control flow that branches to
// DeoptBlockTerm if the condition is true. We want the opposite.
CheckBI->swapSuccessors();
@@ -3036,9 +3032,8 @@ static bool collectUnswitchCandidates(
continue;
}
- auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BI || !BI->isConditional() ||
- BI->getSuccessor(0) == BI->getSuccessor(1))
+ auto *BI = dyn_cast<CondBrInst>(BB->getTerminator());
+ if (!BI || BI->getSuccessor(0) == BI->getSuccessor(1))
continue;
AddUnswitchCandidatesForInst(BI, BI->getCondition());
@@ -3121,7 +3116,7 @@ static bool shouldTryInjectInvariantCondition(
/// TakenSucc via injection of invariant conditions. The branch should be not
/// enough and not previously unswitched, the information about this comes from
/// the metadata.
-bool shouldTryInjectBasingOnMetadata(const BranchInst *BI,
+bool shouldTryInjectBasingOnMetadata(const CondBrInst *BI,
const BasicBlock *TakenSucc) {
SmallVector<uint32_t> Weights;
if (!extractBranchWeights(*BI, Weights))
@@ -3174,7 +3169,7 @@ injectPendingInvariantConditions(NonTrivialUnswitchCandidate Candidate, Loop &L,
auto *LHS = Candidate.PendingInjection->LHS;
auto *RHS = Candidate.PendingInjection->RHS;
auto *InLoopSucc = Candidate.PendingInjection->InLoopSucc;
- auto *TI = cast<BranchInst>(Candidate.TI);
+ auto *TI = cast<CondBrInst>(Candidate.TI);
auto *BB = Candidate.TI->getParent();
auto *OutOfLoopSucc = InLoopSucc == TI->getSuccessor(0) ? TI->getSuccessor(1)
: TI->getSuccessor(0);
@@ -3339,11 +3334,11 @@ static bool collectUnswitchCandidatesWithInjections(
L);
if (!shouldTryInjectInvariantCondition(Pred, LHS, RHS, IfTrue, IfFalse, L))
continue;
- if (!shouldTryInjectBasingOnMetadata(cast<BranchInst>(Term), IfTrue))
+ if (!shouldTryInjectBasingOnMetadata(cast<CondBrInst>(Term), IfTrue))
continue;
// Strip ZEXT for unsigned predicate.
// TODO: once signed predicates are supported, also strip SEXT.
- CompareDesc Desc(cast<BranchInst>(Term), RHS, IfTrue);
+ CompareDesc Desc(cast<CondBrInst>(Term), RHS, IfTrue);
while (auto *Zext = dyn_cast<ZExtInst>(LHS))
LHS = Zext->getOperand(0);
CandidatesULT[LHS].push_back(Desc);
@@ -3474,7 +3469,7 @@ static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate(
// the successors is necessarily duplicated, so don't even try to remove
// its cost.
if (!FullUnswitch) {
- auto &BI = cast<BranchInst>(TI);
+ auto &BI = cast<CondBrInst>(TI);
Value *Cond = skipTrivialSelect(BI.getCondition());
if (match(Cond, m_LogicalAnd())) {
if (SuccBB == BI.getSuccessor(1))
@@ -3518,7 +3513,7 @@ static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate(
for (auto &Candidate : UnswitchCandidates) {
Instruction &TI = *Candidate.TI;
ArrayRef<Value *> Invariants = Candidate.Invariants;
- BranchInst *BI = dyn_cast<BranchInst>(&TI);
+ CondBrInst *BI = dyn_cast<CondBrInst>(&TI);
bool FullUnswitch =
!BI || Candidate.hasPendingInjection() ||
(Invariants.size() == 1 &&
@@ -3558,7 +3553,7 @@ static NonTrivialUnswitchCandidate findBestNonTrivialUnswitchCandidate(
// 3. The branch condition may be poison or undef
static bool shouldInsertFreeze(Loop &L, Instruction &TI, DominatorTree &DT,
AssumptionCache &AC) {
- assert(isa<BranchInst>(TI) || isa<SwitchInst>(TI));
+ assert(isa<CondBrInst>(TI) || isa<SwitchInst>(TI));
if (!FreezeLoopUnswitchCond)
return false;
@@ -3568,7 +3563,7 @@ static bool shouldInsertFreeze(Loop &L, Instruction &TI, DominatorTree &DT,
return false;
Value *Cond;
- if (BranchInst *BI = dyn_cast<BranchInst>(&TI))
+ if (CondBrInst *BI = dyn_cast<CondBrInst>(&TI))
Cond = skipTrivialSelect(BI->getCondition());
else
Cond = skipTrivialSelect(cast<SwitchInst>(&TI)->getCondition());
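unswitchAllTrivialConditions now walks through fully folded blocks using the index-free UncondBrInst::getSuccessor() shown above. A sketch of that walk under the same assumption (helper name illustrative):

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Instructions.h" // assumed: UncondBrInst::getSuccessor()

using namespace llvm;

// Follow unconditional edges from BB, stopping at a revisited block or at the
// first terminator that is not a plain unconditional branch.
static BasicBlock *skipUnconditionalChain(BasicBlock *BB) {
  SmallPtrSet<BasicBlock *, 8> Visited;
  while (auto *Br = dyn_cast<UncondBrInst>(BB->getTerminator())) {
    if (!Visited.insert(BB).second)
      break;
    BB = Br->getSuccessor();
  }
  return BB;
}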
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index edcdaea4c31da..debf033c70c41 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -148,7 +148,7 @@ performBlockTailMerging(Function &F, ArrayRef<BasicBlock *> BBs,
// And turn BB into a block that just unconditionally branches
// to the canonical block.
- Instruction *BI = BranchInst::Create(CanonicalBB, BB);
+ Instruction *BI = UncondBrInst::Create(CanonicalBB, BB);
BI->setDebugLoc(Term->getDebugLoc());
Term->eraseFromParent();
@@ -380,9 +380,13 @@ PreservedAnalyses SimplifyCFGPass::run(Function &F,
DT = &AM.getResult<DominatorTreeAnalysis>(F);
if (!simplifyFunctionCFG(F, TTI, DT, Options))
return PreservedAnalyses::all();
+ // If we removed some blocks, update block numbers to keep dense numbering.
+ F.renumberBlocks();
PreservedAnalyses PA;
- if (RequireAndPreserveDomTree)
+ if (RequireAndPreserveDomTree) {
+ DT->updateBlockNumbers();
PA.preserve<DominatorTreeAnalysis>();
+ }
return PA;
}
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index 23e1243def290..0a2a3f8a495bf 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -165,12 +165,10 @@ bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) {
}
bool SpeculativeExecutionPass::runOnBasicBlock(BasicBlock &B) {
- BranchInst *BI = dyn_cast<BranchInst>(B.getTerminator());
- if (BI == nullptr)
+ CondBrInst *BI = dyn_cast<CondBrInst>(B.getTerminator());
+ if (!BI)
return false;
- if (BI->getNumSuccessors() != 2)
- return false;
BasicBlock &Succ0 = *BI->getSuccessor(0);
BasicBlock &Succ1 = *BI->getSuccessor(1);
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 0a8f5ea2fdae1..136cb965f94c5 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -78,7 +78,7 @@ using BBValuePair = std::pair<BasicBlock *, Value *>;
using RNVector = SmallVector<RegionNode *, 8>;
using BBVector = SmallVector<BasicBlock *, 8>;
-using BranchVector = SmallVector<BranchInst *, 8>;
+using BranchVector = SmallVector<CondBrInst *, 8>;
using BBValueVector = SmallVector<BBValuePair, 2>;
using BBSet = SmallPtrSet<BasicBlock *, 8>;
@@ -97,9 +97,7 @@ class CondBranchWeights {
CondBranchWeights(uint32_t T, uint32_t F) : TrueWeight(T), FalseWeight(F) {}
public:
- static MaybeCondBranchWeights tryParse(const BranchInst &Br) {
- assert(Br.isConditional());
-
+ static MaybeCondBranchWeights tryParse(const CondBrInst &Br) {
uint64_t T, F;
if (!extractBranchWeights(Br, T, F))
return std::nullopt;
@@ -107,9 +105,8 @@ class CondBranchWeights {
return CondBranchWeights(T, F);
}
- static void setMetadata(BranchInst &Br,
+ static void setMetadata(CondBrInst &Br,
const MaybeCondBranchWeights &Weights) {
- assert(Br.isConditional());
if (!Weights)
return;
uint32_t Arr[] = {Weights->TrueWeight, Weights->FalseWeight};
@@ -316,7 +313,7 @@ class StructurizeCFG {
void analyzeLoops(RegionNode *N);
- PredInfo buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
+ PredInfo buildCondition(CondBrInst *Term, unsigned Idx, bool Invert);
void gatherPredicates(RegionNode *N);
@@ -479,6 +476,13 @@ void StructurizeCFG::hoistZeroCostElseBlockPhiValues(BasicBlock *ElseBB,
if (!ElseSucc || !CommonDominator)
return;
+ // Only hoist in a simple if-else: ThenBB must branch directly to ElseSucc
+ // and ElseSucc must have exactly 2 predecessors (ThenBB and ElseBB).
+ // simplifyHoistedPhis assumes this exact shape; with additional predecessors
+ // the hoisted value leaks into unrelated control-flow paths.
+ if (ThenBB->getSingleSuccessor() != ElseSucc ||
+ !ElseSucc->hasNPredecessors(2))
+ return;
Instruction *Term = CommonDominator->getTerminator();
for (PHINode &Phi : ElseSucc->phis()) {
Value *ElseVal = Phi.getIncomingValueForBlock(ElseBB);
@@ -557,29 +561,24 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
} else {
// Test for successors as back edge
+      // TODO: support terminators other than branches.
BasicBlock *BB = N->getNodeAs<BasicBlock>();
- if (BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator()))
- for (BasicBlock *Succ : Term->successors())
+ if (isa<UncondBrInst, CondBrInst>(BB->getTerminator()))
+ for (BasicBlock *Succ : successors(BB))
if (Visited.count(Succ))
Loops[Succ] = BB;
}
}
/// Build the condition for one edge
-PredInfo StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
+PredInfo StructurizeCFG::buildCondition(CondBrInst *Term, unsigned Idx,
bool Invert) {
- Value *Cond = Invert ? BoolFalse : BoolTrue;
- MaybeCondBranchWeights Weights;
-
- if (Term->isConditional()) {
- Cond = Term->getCondition();
- Weights = CondBranchWeights::tryParse(*Term);
-
- if (Idx != (unsigned)Invert) {
- Cond = invertCondition(Cond);
- if (Weights)
- Weights = Weights->invert();
- }
+ Value *Cond = Term->getCondition();
+ auto Weights = CondBranchWeights::tryParse(*Term);
+ if (Idx != (unsigned)Invert) {
+ Cond = invertCondition(Cond);
+ if (Weights)
+ Weights = Weights->invert();
}
return {Cond, Weights};
}
@@ -593,35 +592,32 @@ void StructurizeCFG::gatherPredicates(RegionNode *N) {
for (BasicBlock *P : predecessors(BB)) {
// Ignore it if it's a branch from outside into our region entry
- if (!ParentRegion->contains(P) || !dyn_cast<BranchInst>(P->getTerminator()))
+ if (!ParentRegion->contains(P))
continue;
Region *R = RI->getRegionFor(P);
if (R == ParentRegion) {
- // It's a top level block in our region
- BranchInst *Term = cast<BranchInst>(P->getTerminator());
- for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
- BasicBlock *Succ = Term->getSuccessor(i);
- if (Succ != BB)
- continue;
-
+ if (isa<UncondBrInst>(P->getTerminator())) {
+ if (Visited.count(P))
+ Pred[P] = {BoolTrue, std::nullopt};
+ else
+ LPred[P] = {BoolFalse, std::nullopt};
+ } else if (auto *CondBr = dyn_cast<CondBrInst>(P->getTerminator())) {
+ bool Idx = CondBr->getSuccessor(0) == BB ? 0 : 1;
if (Visited.count(P)) {
// Normal forward edge
- if (Term->isConditional()) {
- // Try to treat it like an ELSE block
- BasicBlock *Other = Term->getSuccessor(!i);
- if (Visited.count(Other) && !Loops.count(Other) &&
- !Pred.count(Other) && !Pred.count(P)) {
- hoistZeroCostElseBlockPhiValues(Succ, Other);
- Pred[Other] = {BoolFalse, std::nullopt};
- Pred[P] = {BoolTrue, std::nullopt};
- continue;
- }
- }
- Pred[P] = buildCondition(Term, i, false);
+ // Try to treat Other like an ELSE block
+ BasicBlock *Other = CondBr->getSuccessor(!Idx);
+ if (Visited.count(Other) && !Loops.count(Other) &&
+ !Pred.count(Other) && !Pred.count(P)) {
+ hoistZeroCostElseBlockPhiValues(BB, Other);
+ Pred[Other] = {BoolFalse, std::nullopt};
+ Pred[P] = {BoolTrue, std::nullopt};
+ } else
+ Pred[P] = buildCondition(CondBr, Idx, false);
} else {
// Back edge
- LPred[P] = buildCondition(Term, i, true);
+ LPred[P] = buildCondition(CondBr, Idx, true);
}
}
} else {
@@ -675,9 +671,7 @@ void StructurizeCFG::insertConditions(bool Loops, SSAUpdaterBulk &PhiInserter) {
BranchVector &Conds = Loops ? LoopConds : Conditions;
Value *Default = Loops ? BoolTrue : BoolFalse;
- for (BranchInst *Term : Conds) {
- assert(Term->isConditional());
-
+ for (CondBrInst *Term : Conds) {
BasicBlock *Parent = Term->getParent();
BasicBlock *SuccTrue = Term->getSuccessor(0);
BasicBlock *SuccFalse = Term->getSuccessor(1);
@@ -1079,7 +1073,7 @@ void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
} else {
BasicBlock *BB = Node->getNodeAs<BasicBlock>();
DebugLoc DL = killTerminator(BB);
- BranchInst *Br = BranchInst::Create(NewExit, BB);
+ UncondBrInst *Br = UncondBrInst::Create(NewExit, BB);
Br->setDebugLoc(DL);
addPhiValues(BB, NewExit);
if (IncludeDominator)
@@ -1188,7 +1182,7 @@ void StructurizeCFG::wireFlow(bool ExitUseAllowed,
BasicBlock *Next = needPostfix(Flow, ExitUseAllowed);
// let it point to entry and next block
- BranchInst *Br = BranchInst::Create(Entry, Next, BoolPoison, Flow);
+ CondBrInst *Br = CondBrInst::Create(BoolPoison, Entry, Next, Flow);
Br->setDebugLoc(DL);
Conditions.push_back(Br);
addPhiValues(Flow, Entry);
@@ -1230,7 +1224,7 @@ void StructurizeCFG::handleLoops(bool ExitUseAllowed,
DebugLoc DL;
std::tie(LoopEnd, DL) = needPrefix(false);
BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed);
- BranchInst *Br = BranchInst::Create(Next, LoopStart, BoolPoison, LoopEnd);
+ CondBrInst *Br = CondBrInst::Create(BoolPoison, Next, LoopStart, LoopEnd);
Br->setDebugLoc(DL);
LoopConds.push_back(Br);
addPhiValues(LoopEnd, LoopStart);
@@ -1304,8 +1298,8 @@ static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID,
for (auto *E : R->elements()) {
if (!E->isSubRegion()) {
- auto Br = dyn_cast<BranchInst>(E->getEntry()->getTerminator());
- if (!Br || !Br->isConditional())
+ auto Br = dyn_cast<CondBrInst>(E->getEntry()->getTerminator());
+ if (!Br)
continue;
if (!UA.isUniform(Br))
@@ -1327,8 +1321,8 @@ static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID,
// subregions are uniform or not. However, this requires a very careful
// look at SIAnnotateControlFlow to make sure nothing breaks there.
for (auto *BB : E->getNodeAs<Region>()->blocks()) {
- auto Br = dyn_cast<BranchInst>(BB->getTerminator());
- if (!Br || !Br->isConditional())
+ auto Br = dyn_cast<CondBrInst>(BB->getTerminator());
+ if (!Br)
continue;
if (!Br->getMetadata(UniformMDKindID)) {
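[Not part of the patch -- reviewer-oriented sketch.] The StructurizeCFG hunks above all follow the same dispatch idiom once BranchInst is split: an isa/dyn_cast on UncondBrInst vs. CondBrInst replaces the old BranchInst-plus-isConditional() checks. A minimal illustration, assuming the UncondBrInst/CondBrInst interface used throughout this diff (inspectTerminator is a hypothetical helper, not code from the patch):

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  static void inspectTerminator(BasicBlock &BB) {
    Instruction *T = BB.getTerminator();
    if (auto *UBr = dyn_cast<UncondBrInst>(T)) {
      // Unconditional branch: exactly one successor, no condition to read.
      BasicBlock *Succ = UBr->getSuccessor();
      (void)Succ;
    } else if (auto *CBr = dyn_cast<CondBrInst>(T)) {
      // Conditional branch: always has a condition and two successors,
      // so the old BI->isConditional() guard disappears.
      Value *Cond = CBr->getCondition();
      BasicBlock *IfTrue = CBr->getSuccessor(0);
      BasicBlock *IfFalse = CBr->getSuccessor(1);
      (void)Cond; (void)IfTrue; (void)IfFalse;
    }
    // Switch/return/unreachable terminators are untouched by this change.
  }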
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 89d41f3e40de7..3f7eca1fc7513 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -525,7 +525,7 @@ void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) {
BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "", &F, HeaderBB);
NewEntry->takeName(HeaderBB);
HeaderBB->setName("tailrecurse");
- auto *BI = BranchInst::Create(HeaderBB, NewEntry);
+ auto *BI = UncondBrInst::Create(HeaderBB, NewEntry);
BI->setDebugLoc(DebugLoc::getCompilerGenerated());
// If the new branch preserves the debug location of CI, it could result in
// misleading stepping, if CI is located in a conditional branch.
@@ -749,7 +749,7 @@ bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
// Now that all of the PHI nodes are in place, remove the call and
// ret instructions, replacing them with an unconditional branch.
- BranchInst *NewBI = BranchInst::Create(HeaderBB, Ret->getIterator());
+ UncondBrInst *NewBI = UncondBrInst::Create(HeaderBB, Ret->getIterator());
NewBI->setDebugLoc(CI->getDebugLoc());
Ret->eraseFromParent(); // Remove return.
@@ -860,11 +860,8 @@ void TailRecursionEliminator::cleanupAndFinalize() {
bool TailRecursionEliminator::processBlock(BasicBlock &BB) {
Instruction *TI = BB.getTerminator();
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- if (BI->isConditional())
- return false;
-
- BasicBlock *Succ = BI->getSuccessor(0);
+ if (UncondBrInst *BI = dyn_cast<UncondBrInst>(TI)) {
+ BasicBlock *Succ = BI->getSuccessor();
ReturnInst *Ret = dyn_cast<ReturnInst>(Succ->getFirstNonPHIOrDbg(true));
if (!Ret)
diff --git a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
index 0d85b16ef1c31..c52d62985677b 100644
--- a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
+++ b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
@@ -119,7 +119,7 @@ static Value *getStrlenWithNull(IRBuilder<> &Builder, Value *Str) {
Builder.SetInsertPoint(Prev);
auto CmpNull =
Builder.CreateICmpEQ(Str, Constant::getNullValue(Str->getType()));
- BranchInst::Create(Join, While, CmpNull, Prev);
+ Builder.CreateCondBr(CmpNull, Join, While);
// Entry to the while loop.
Builder.SetInsertPoint(While);
@@ -141,7 +141,7 @@ static Value *getStrlenWithNull(IRBuilder<> &Builder, Value *Str) {
Len = Builder.CreateAdd(Len, One);
// Final join.
- BranchInst::Create(Join, WhileDone);
+ UncondBrInst::Create(Join, WhileDone);
Builder.SetInsertPoint(Join, Join->begin());
auto LenPhi = Builder.CreatePHI(Len->getType(), 2);
LenPhi->addIncoming(Len, WhileDone);
@@ -459,7 +459,7 @@ Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args,
BasicBlock *ArgPush = BasicBlock::Create(
Ctx, "argpush.block", Builder.GetInsertBlock()->getParent());
- BranchInst::Create(ArgPush, End, Cmp, Builder.GetInsertBlock());
+ CondBrInst::Create(Cmp, ArgPush, End, Builder.GetInsertBlock());
Builder.SetInsertPoint(ArgPush);
// Create controlDWord and store as the first entry, format as follows
@@ -512,7 +512,7 @@ Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder, ArrayRef<Value *> Args,
IsConstFmtStr);
// End block, returns -1 on failure
- BranchInst::Create(End, ArgPush);
+ UncondBrInst::Create(End, ArgPush);
Builder.SetInsertPoint(End);
return Builder.CreateSExt(Builder.CreateNot(Cmp), Int32Ty, "printf_result");
}
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 6472e1771ec73..17aa7cc185b66 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -250,16 +250,16 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
// Currently only allow PredBB to have two predecessors, one being BB.
// Update BI to branch to BB's only successor instead of BB.
- BranchInst *PredBB_BI;
+ CondBrInst *PredBB_BI;
BasicBlock *NewSucc = nullptr;
unsigned FallThruPath;
if (PredecessorWithTwoSuccessors) {
- if (!(PredBB_BI = dyn_cast<BranchInst>(PTI)))
+ if (!(PredBB_BI = dyn_cast<CondBrInst>(PTI)))
return false;
- BranchInst *BB_JmpI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BB_JmpI || !BB_JmpI->isUnconditional())
+ UncondBrInst *BB_JmpI = dyn_cast<UncondBrInst>(BB->getTerminator());
+ if (!BB_JmpI)
return false;
- NewSucc = BB_JmpI->getSuccessor(0);
+ NewSucc = BB_JmpI->getSuccessor();
FallThruPath = PredBB_BI->getSuccessor(0) == BB ? 0 : 1;
}
@@ -346,6 +346,8 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
if (PredecessorWithTwoSuccessors) {
// Delete the unconditional branch from BB.
BB->back().eraseFromParent();
+ // Add unreachable to now empty BB.
+ new UnreachableInst(BB->getContext(), BB);
// Update branch in the predecessor.
PredBB_BI->setSuccessor(FallThruPath, NewSucc);
@@ -355,6 +357,8 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
// Move terminator instruction.
BB->back().moveBeforePreserving(*PredBB, PredBB->end());
+ // Add unreachable to now empty BB.
+ new UnreachableInst(BB->getContext(), BB);
// Terminator may be a memory accessing instruction too.
if (MSSAU)
@@ -362,8 +366,6 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
MSSAU->getMemorySSA()->getMemoryAccess(PredBB->getTerminator())))
MSSAU->moveToPlace(MUD, PredBB, MemorySSA::End);
}
- // Add unreachable to now empty BB.
- new UnreachableInst(BB->getContext(), BB);
// Inherit predecessors name if it exists.
if (!PredBB->hasName())
@@ -746,7 +748,7 @@ BasicBlock *llvm::SplitCallBrEdge(BasicBlock *CallBrBlock, BasicBlock *Succ,
// Rewire control flow from callbr to the new target block.
CallBr->setSuccessor(SuccIdx, CallBrTarget);
// Jump from the new target block to the original successor.
- BranchInst::Create(Succ, CallBrTarget);
+ UncondBrInst::Create(Succ, CallBrTarget);
bool Updated =
updateCycleLoopInfo<LoopInfo, Loop>(LI, CallBrBlock, CallBrTarget, Succ);
@@ -853,7 +855,7 @@ BasicBlock *llvm::ehAwareSplitEdge(BasicBlock *BB, BasicBlock *Succ,
if (LandingPadReplacement) {
auto *NewLP = OriginalPad->clone();
- auto *Terminator = BranchInst::Create(Succ, NewBB);
+ auto *Terminator = UncondBrInst::Create(Succ, NewBB);
NewLP->insertBefore(Terminator->getIterator());
LandingPadReplacement->addIncoming(NewLP, NewBB);
} else {
@@ -1190,7 +1192,7 @@ BasicBlock *llvm::splitBlockBefore(BasicBlock *Old,
/// Update the PHI nodes in OrigBB to include the values coming from NewBB.
/// This also updates AliasAnalysis, if available.
static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
- ArrayRef<BasicBlock *> Preds, BranchInst *BI,
+ ArrayRef<BasicBlock *> Preds, Instruction *BI,
bool HasLoopExit) {
// Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB.
SmallPtrSet<BasicBlock *, 16> PredSet(llvm::from_range, Preds);
@@ -1283,7 +1285,7 @@ SplitBlockPredecessorsImpl(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
BB->getContext(), BB->getName() + Suffix, BB->getParent(), BB);
// The new block unconditionally branches to the old block.
- BranchInst *BI = BranchInst::Create(BB, NewBB);
+ UncondBrInst *BI = UncondBrInst::Create(BB, NewBB);
Loop *L = nullptr;
BasicBlock *OldLatch = nullptr;
@@ -1381,7 +1383,7 @@ static void SplitLandingPadPredecessorsImpl(
NewBBs.push_back(NewBB1);
// The new block unconditionally branches to the old block.
- BranchInst *BI1 = BranchInst::Create(OrigBB, NewBB1);
+ UncondBrInst *BI1 = UncondBrInst::Create(OrigBB, NewBB1);
BI1->setDebugLoc(OrigBB->getFirstNonPHIIt()->getDebugLoc());
// Move the edges from Preds to point to NewBB1 instead of OrigBB.
@@ -1422,7 +1424,7 @@ static void SplitLandingPadPredecessorsImpl(
NewBBs.push_back(NewBB2);
// The new block unconditionally branches to the old block.
- BranchInst *BI2 = BranchInst::Create(OrigBB, NewBB2);
+ UncondBrInst *BI2 = UncondBrInst::Create(OrigBB, NewBB2);
BI2->setDebugLoc(OrigBB->getFirstNonPHIIt()->getDebugLoc());
// Move the remaining edges from OrigBB to point to NewBB2.
@@ -1617,7 +1619,7 @@ void llvm::SplitBlockAndInsertIfThenElse(
if (Unreachable)
(void)new UnreachableInst(C, BB);
else {
- (void)BranchInst::Create(Tail, BB);
+ (void)UncondBrInst::Create(Tail, BB);
ToTailEdge = true;
}
BB->getTerminator()->setDebugLoc(SplitBefore->getDebugLoc());
@@ -1630,8 +1632,7 @@ void llvm::SplitBlockAndInsertIfThenElse(
handleBlock(ElseBlock, UnreachableElse, FalseBlock, ElseToTailEdge);
Instruction *HeadOldTerm = Head->getTerminator();
- BranchInst *HeadNewTerm =
- BranchInst::Create(/*ifTrue*/ TrueBlock, /*ifFalse*/ FalseBlock, Cond);
+ CondBrInst *HeadNewTerm = CondBrInst::Create(Cond, TrueBlock, FalseBlock);
HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
@@ -1733,7 +1734,7 @@ void llvm::SplitBlockAndInsertForEachLane(
}
}
-BranchInst *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
+CondBrInst *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
BasicBlock *&IfFalse) {
PHINode *SomePHI = dyn_cast<PHINode>(BB->begin());
BasicBlock *Pred1 = nullptr;
@@ -1756,28 +1757,29 @@ BranchInst *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
return nullptr;
}
- // We can only handle branches. Other control flow will be lowered to
- // branches if possible anyway.
- BranchInst *Pred1Br = dyn_cast<BranchInst>(Pred1->getTerminator());
- BranchInst *Pred2Br = dyn_cast<BranchInst>(Pred2->getTerminator());
- if (!Pred1Br || !Pred2Br)
- return nullptr;
+ Instruction *Pred1Term = Pred1->getTerminator();
+ Instruction *Pred2Term = Pred2->getTerminator();
// Eliminate code duplication by ensuring that Pred1Br is conditional if
// either are.
- if (Pred2Br->isConditional()) {
+ if (isa<CondBrInst>(Pred2Term)) {
// If both branches are conditional, we don't have an "if statement". In
// reality, we could transform this case, but since the condition will be
// required anyway, we stand no chance of eliminating it, so the xform is
// probably not profitable.
- if (Pred1Br->isConditional())
+ if (isa<CondBrInst>(Pred1Term))
return nullptr;
std::swap(Pred1, Pred2);
- std::swap(Pred1Br, Pred2Br);
+ std::swap(Pred1Term, Pred2Term);
}
- if (Pred1Br->isConditional()) {
+ // We can only handle branches. Other control flow will be lowered to
+ // branches if possible anyway.
+ if (!isa<UncondBrInst>(Pred2Term))
+ return nullptr;
+
+ if (auto *Pred1Br = dyn_cast<CondBrInst>(Pred1Term)) {
// The only thing we have to watch out for here is to make sure that Pred2
// doesn't have incoming edges from other blocks. If it does, the condition
// doesn't dominate BB.
@@ -1803,6 +1805,9 @@ BranchInst *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
return Pred1Br;
}
+ if (!isa<UncondBrInst>(Pred1Term))
+ return nullptr;
+
// Ok, if we got here, both predecessors end with an unconditional branch to
// BB. Don't panic! If both blocks only have a single (identical)
// predecessor, and THAT is a conditional branch, then we're all ok!
@@ -1811,10 +1816,9 @@ BranchInst *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
return nullptr;
// Otherwise, if this is a conditional branch, then we can use it!
- BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator());
+ CondBrInst *BI = dyn_cast<CondBrInst>(CommonPred->getTerminator());
if (!BI) return nullptr;
- assert(BI->isConditional() && "Two successors but not conditional?");
if (BI->getSuccessor(0) == Pred1) {
IfTrue = Pred1;
IfFalse = Pred2;
@@ -1825,7 +1829,7 @@ BranchInst *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
return BI;
}
-void llvm::InvertBranch(BranchInst *PBI, IRBuilderBase &Builder) {
+void llvm::InvertBranch(CondBrInst *PBI, IRBuilderBase &Builder) {
Value *NewCond = PBI->getCondition();
// If this is a "cmp" instruction, only used for branching (and nowhere
// else), then we can simply invert the predicate.
@@ -1842,8 +1846,7 @@ void llvm::InvertBranch(BranchInst *PBI, IRBuilderBase &Builder) {
bool llvm::hasOnlySimpleTerminator(const Function &F) {
for (auto &BB : F) {
auto *Term = BB.getTerminator();
- if (!(isa<ReturnInst>(Term) || isa<UnreachableInst>(Term) ||
- isa<BranchInst>(Term)))
+ if (!isa<ReturnInst, UnreachableInst, UncondBrInst, CondBrInst>(Term))
return false;
}
return true;
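[Not part of the patch.] The BasicBlockUtils hunks also show the creation-side API change: unconditional and conditional branches get separate Create calls, and the condition moves to the front of the argument list. A hedged before/after sketch, assuming the signatures used in this diff; Pred and Latch are assumed to be blocks that do not yet have a terminator:

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  void emitBranches(BasicBlock *Pred, BasicBlock *Latch, BasicBlock *TrueBB,
                    BasicBlock *FalseBB, Value *Cond) {
    // Before: BranchInst::Create(TrueBB, FalseBB, Cond, Pred);
    CondBrInst::Create(Cond, TrueBB, FalseBB, Pred);   // br i1 %Cond, ...
    // Before: BranchInst::Create(TrueBB, Latch);
    UncondBrInst::Create(TrueBB, Latch);               // br label %TrueBB
  }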
diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 78f2ab5dffa98..32d3886284a6c 100644
--- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -173,7 +173,7 @@ llvm::SplitKnownCriticalEdge(Instruction *TI, unsigned SuccNum,
DestBB->getName() +
"_crit_edge");
// Create our unconditional branch.
- BranchInst *NewBI = BranchInst::Create(DestBB, NewBB);
+ UncondBrInst *NewBI = UncondBrInst::Create(DestBB, NewBB);
NewBI->setDebugLoc(TI->getDebugLoc());
if (auto *LoopMD = TI->getMetadata(LLVMContext::MD_loop))
NewBI->setMetadata(LLVMContext::MD_loop, LoopMD);
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 192d313b23798..031ce978d21ce 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -644,25 +644,23 @@ void PruningFunctionCloner::CloneBlock(
// Finally, clone over the terminator.
const Instruction *OldTI = BB->getTerminator();
bool TerminatorDone = false;
- if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) {
- if (BI->isConditional()) {
- // If the condition was a known constant in the callee...
- ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
- // Or is a known constant in the caller...
- if (!Cond) {
- Value *V = VMap.lookup(BI->getCondition());
- Cond = dyn_cast_or_null<ConstantInt>(V);
- }
+ if (const CondBrInst *BI = dyn_cast<CondBrInst>(OldTI)) {
+ // If the condition was a known constant in the callee...
+ ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+ // Or is a known constant in the caller...
+ if (!Cond) {
+ Value *V = VMap.lookup(BI->getCondition());
+ Cond = dyn_cast_or_null<ConstantInt>(V);
+ }
- // Constant fold to uncond branch!
- if (Cond) {
- BasicBlock *Dest = BI->getSuccessor(!Cond->getZExtValue());
- auto *NewBI = BranchInst::Create(Dest, NewBB);
- NewBI->setDebugLoc(BI->getDebugLoc());
- VMap[OldTI] = NewBI;
- ToClone.push_back(Dest);
- TerminatorDone = true;
- }
+ // Constant fold to uncond branch!
+ if (Cond) {
+ BasicBlock *Dest = BI->getSuccessor(!Cond->getZExtValue());
+ auto *NewBI = UncondBrInst::Create(Dest, NewBB);
+ NewBI->setDebugLoc(BI->getDebugLoc());
+ VMap[OldTI] = NewBI;
+ ToClone.push_back(Dest);
+ TerminatorDone = true;
}
} else if (const SwitchInst *SI = dyn_cast<SwitchInst>(OldTI)) {
// If switching on a value known constant in the caller.
@@ -674,7 +672,7 @@ void PruningFunctionCloner::CloneBlock(
if (Cond) { // Constant fold to uncond branch!
SwitchInst::ConstCaseHandle Case = *SI->findCaseValue(Cond);
BasicBlock *Dest = const_cast<BasicBlock *>(Case.getCaseSuccessor());
- auto *NewBI = BranchInst::Create(Dest, NewBB);
+ auto *NewBI = UncondBrInst::Create(Dest, NewBB);
NewBI->setDebugLoc(SI->getDebugLoc());
VMap[OldTI] = NewBI;
ToClone.push_back(Dest);
@@ -959,13 +957,13 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
// uncond branches, and this code folds them.
Function::iterator I = Begin;
while (I != NewFunc->end()) {
- BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator());
- if (!BI || BI->isConditional()) {
+ UncondBrInst *BI = dyn_cast<UncondBrInst>(I->getTerminator());
+ if (!BI) {
++I;
continue;
}
- BasicBlock *Dest = BI->getSuccessor(0);
+ BasicBlock *Dest = BI->getSuccessor();
if (!Dest->getSinglePredecessor() || Dest->hasAddressTaken()) {
++I;
continue;
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index b298a8ae144d8..db4d2011687af 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -791,7 +791,7 @@ void CodeExtractor::severSplitPHINodesOfExits() {
for (BasicBlock *PredBB : Preds)
if (Blocks.count(PredBB))
PredBB->getTerminator()->replaceUsesOfWith(ExitBB, NewBB);
- BranchInst::Create(ExitBB, NewBB);
+ UncondBrInst::Create(ExitBB, NewBB);
Blocks.insert(NewBB);
}
@@ -1742,7 +1742,7 @@ void CodeExtractor::emitFunctionBody(
}
// Connect newFunction entry block to new header.
- BranchInst *BranchI = BranchInst::Create(header, newFuncRoot);
+ UncondBrInst *BranchI = UncondBrInst::Create(header, newFuncRoot);
applyFirstDebugLoc(oldFunction, Blocks.getArrayRef(), BranchI);
// Store the arguments right after the definition of output value.
@@ -1982,15 +1982,15 @@ CallInst *CodeExtractor::emitReplacerCall(
case 1:
// Only a single destination, change the switch into an unconditional
// branch.
- BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch->getIterator());
+ UncondBrInst::Create(TheSwitch->getSuccessor(1), TheSwitch->getIterator());
TheSwitch->eraseFromParent();
break;
case 2:
// Only two destinations, convert to a condition branch.
// Remark: This also swaps the target branches:
// 0 -> false -> getSuccessor(2); 1 -> true -> getSuccessor(1)
- BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch->getSuccessor(2),
- call, TheSwitch->getIterator());
+ CondBrInst::Create(call, TheSwitch->getSuccessor(1),
+ TheSwitch->getSuccessor(2), TheSwitch->getIterator());
TheSwitch->eraseFromParent();
break;
default:
diff --git a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
index 8de83cc3700cc..cf3bacc08f5a5 100644
--- a/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
+++ b/llvm/lib/Transforms/Utils/ControlFlowUtils.cpp
@@ -34,27 +34,25 @@ using EdgeDescriptor = ControlFlowHub::BranchDescriptor;
// branch to the FirstGuardBlock.
static Value *redirectToHub(BasicBlock *BB, BasicBlock *Succ0,
BasicBlock *Succ1, BasicBlock *FirstGuardBlock) {
- assert(isa<BranchInst>(BB->getTerminator()) &&
- "Only support branch terminator.");
- auto *Branch = cast<BranchInst>(BB->getTerminator());
- auto *Condition = Branch->isConditional() ? Branch->getCondition() : nullptr;
-
- assert(Succ0 || Succ1);
-
- if (Branch->isUnconditional()) {
+ if (auto *Branch = dyn_cast<UncondBrInst>(BB->getTerminator())) {
assert(Succ0 == Branch->getSuccessor(0));
assert(!Succ1);
+ Branch->setSuccessor(FirstGuardBlock);
+ return nullptr;
+ }
+
+ auto *Branch = cast<CondBrInst>(BB->getTerminator());
+ auto *Condition = Branch->getCondition();
+
+ assert(Succ0 || Succ1);
+ assert(!Succ1 || Succ1 == Branch->getSuccessor(1));
+ if (Succ0 && !Succ1) {
Branch->setSuccessor(0, FirstGuardBlock);
+ } else if (Succ1 && !Succ0) {
+ Branch->setSuccessor(1, FirstGuardBlock);
} else {
- assert(!Succ1 || Succ1 == Branch->getSuccessor(1));
- if (Succ0 && !Succ1) {
- Branch->setSuccessor(0, FirstGuardBlock);
- } else if (Succ1 && !Succ0) {
- Branch->setSuccessor(1, FirstGuardBlock);
- } else {
- Branch->eraseFromParent();
- BranchInst::Create(FirstGuardBlock, BB);
- }
+ Branch->eraseFromParent();
+ UncondBrInst::Create(FirstGuardBlock, BB);
}
return Condition;
@@ -73,11 +71,11 @@ static void setupBranchForGuard(ArrayRef<BasicBlock *> GuardBlocks,
int I = 0;
for (int E = GuardBlocks.size() - 1; I != E; ++I) {
BasicBlock *Out = Outgoing[I];
- BranchInst::Create(Out, GuardBlocks[I + 1], GuardPredicates[Out],
+ CondBrInst::Create(GuardPredicates[Out], Out, GuardBlocks[I + 1],
GuardBlocks[I]);
}
BasicBlock *Out = Outgoing[I];
- BranchInst::Create(Out, Outgoing[I + 1], GuardPredicates[Out],
+ CondBrInst::Create(GuardPredicates[Out], Out, Outgoing[I + 1],
GuardBlocks[I]);
}
diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp
index b2ee1da143ba7..2b3dfdc4f01f5 100644
--- a/llvm/lib/Transforms/Utils/Evaluator.cpp
+++ b/llvm/lib/Transforms/Utils/Evaluator.cpp
@@ -526,16 +526,13 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst, BasicBlock *&NextBB,
} else if (CurInst->isTerminator()) {
LLVM_DEBUG(dbgs() << "Found a terminator instruction.\n");
- if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
- if (BI->isUnconditional()) {
- NextBB = BI->getSuccessor(0);
- } else {
- ConstantInt *Cond =
- dyn_cast<ConstantInt>(getVal(BI->getCondition()));
- if (!Cond) return false; // Cannot determine.
-
- NextBB = BI->getSuccessor(!Cond->getZExtValue());
- }
+ if (UncondBrInst *BI = dyn_cast<UncondBrInst>(CurInst)) {
+ NextBB = BI->getSuccessor(0);
+ } else if (CondBrInst *BI = dyn_cast<CondBrInst>(CurInst)) {
+ ConstantInt *Cond = dyn_cast<ConstantInt>(getVal(BI->getCondition()));
+ if (!Cond)
+ return false; // Cannot determine.
+ NextBB = BI->getSuccessor(!Cond->getZExtValue());
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurInst)) {
ConstantInt *Val =
dyn_cast<ConstantInt>(getVal(SI->getCondition()));
diff --git a/llvm/lib/Transforms/Utils/FixIrreducible.cpp b/llvm/lib/Transforms/Utils/FixIrreducible.cpp
index 804af22daa5af..8e6425adc2855 100644
--- a/llvm/lib/Transforms/Utils/FixIrreducible.cpp
+++ b/llvm/lib/Transforms/Utils/FixIrreducible.cpp
@@ -290,7 +290,13 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,
}
for (BasicBlock *P : Predecessors) {
- if (BranchInst *Branch = dyn_cast<BranchInst>(P->getTerminator())) {
+ if (isa<UncondBrInst>(P->getTerminator())) {
+ assert(P->getTerminator()->getSuccessor(0) == Header);
+ CHub.addBranch(P, Header);
+
+ LLVM_DEBUG(dbgs() << "Added internal branch: " << printBasicBlock(P)
+ << " -> " << printBasicBlock(Header) << '\n');
+ } else if (CondBrInst *Branch = dyn_cast<CondBrInst>(P->getTerminator())) {
// Exactly one of the two successors is the header.
BasicBlock *Succ0 = Branch->getSuccessor(0) == Header ? Header : nullptr;
BasicBlock *Succ1 = Succ0 ? nullptr : Header;
@@ -328,12 +334,18 @@ static bool fixIrreducible(Cycle &C, CycleInfo &CI, DominatorTree &DT,
}
for (BasicBlock *P : Predecessors) {
- if (BranchInst *Branch = dyn_cast<BranchInst>(P->getTerminator()); Branch) {
+ if (UncondBrInst *Branch = dyn_cast<UncondBrInst>(P->getTerminator())) {
+ BasicBlock *Succ0 = Branch->getSuccessor();
+ Succ0 = C.contains(Succ0) ? Succ0 : nullptr;
+ CHub.addBranch(P, Succ0);
+
+ LLVM_DEBUG(dbgs() << "Added external branch: " << printBasicBlock(P)
+ << " -> " << printBasicBlock(Succ0) << '\n');
+ } else if (CondBrInst *Branch = dyn_cast<CondBrInst>(P->getTerminator())) {
BasicBlock *Succ0 = Branch->getSuccessor(0);
Succ0 = C.contains(Succ0) ? Succ0 : nullptr;
- BasicBlock *Succ1 =
- Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1);
- Succ1 = Succ1 && C.contains(Succ1) ? Succ1 : nullptr;
+ BasicBlock *Succ1 = Branch->getSuccessor(1);
+ Succ1 = C.contains(Succ1) ? Succ1 : nullptr;
CHub.addBranch(P, Succ0, Succ1);
LLVM_DEBUG(dbgs() << "Added external branch: " << printBasicBlock(P)
diff --git a/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/llvm/lib/Transforms/Utils/FlattenCFG.cpp
index 1d9408d6db433..2837fb57b44ac 100644
--- a/llvm/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/llvm/lib/Transforms/Utils/FlattenCFG.cpp
@@ -146,15 +146,9 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
// Check predecessors of \param BB.
SmallPtrSet<BasicBlock *, 16> Preds(llvm::from_range, predecessors(BB));
for (BasicBlock *Pred : Preds) {
- BranchInst *PBI = dyn_cast<BranchInst>(Pred->getTerminator());
-
- // All predecessors should terminate with a branch.
- if (!PBI)
- return false;
-
BasicBlock *PP = Pred->getSinglePredecessor();
- if (PBI->isUnconditional()) {
+ if (isa<UncondBrInst>(Pred->getTerminator())) {
// Case 1: Pred (BB3) is an unconditional block, it should
// have a single predecessor (BB2) that is also a predecessor
// of \param BB (BB4) and should not have address-taken.
@@ -169,7 +163,9 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
}
// Only conditional branches are allowed beyond this point.
- assert(PBI->isConditional());
+ CondBrInst *PBI = dyn_cast<CondBrInst>(Pred->getTerminator());
+ if (!PBI)
+ return false;
// Condition's unique use should be the branch instruction.
Value *PC = PBI->getCondition();
@@ -216,13 +212,9 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
if (!Preds.contains(PS)) {
// Case 2.
LastCondBlock = Pred;
- } else {
- // Case 1
- BranchInst *BPS = dyn_cast<BranchInst>(PS->getTerminator());
- if (BPS && BPS->isUnconditional()) {
- // Case 1: PS(BB3) should be an unconditional branch.
- LastCondBlock = Pred;
- }
+ } else if (isa<UncondBrInst>(PS->getTerminator())) {
+ // Case 1: PS(BB3) should be an unconditional branch.
+ LastCondBlock = Pred;
}
}
@@ -232,16 +224,14 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
Instruction *TBB = LastCondBlock->getTerminator();
BasicBlock *PS1 = TBB->getSuccessor(0);
BasicBlock *PS2 = TBB->getSuccessor(1);
- BranchInst *PBI1 = dyn_cast<BranchInst>(PS1->getTerminator());
- BranchInst *PBI2 = dyn_cast<BranchInst>(PS2->getTerminator());
+ UncondBrInst *PBI1 = dyn_cast<UncondBrInst>(PS1->getTerminator());
+ UncondBrInst *PBI2 = dyn_cast<UncondBrInst>(PS2->getTerminator());
// If PS1 does not jump into PS2, but PS2 jumps into PS1,
// attempt branch inversion.
- if (!PBI1 || !PBI1->isUnconditional() ||
- (PS1->getTerminator()->getSuccessor(0) != PS2)) {
+ if (!PBI1 || (PS1->getTerminator()->getSuccessor(0) != PS2)) {
// Check whether PS2 jumps into PS1.
- if (!PBI2 || !PBI2->isUnconditional() ||
- (PS2->getTerminator()->getSuccessor(0) != PS1))
+ if (!PBI2 || (PS2->getTerminator()->getSuccessor(0) != PS1))
return false;
// Do branch inversion.
@@ -249,7 +239,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
bool EverChanged = false;
for (; CurrBlock != FirstCondBlock;
CurrBlock = CurrBlock->getSinglePredecessor()) {
- auto *BI = cast<BranchInst>(CurrBlock->getTerminator());
+ auto *BI = cast<CondBrInst>(CurrBlock->getTerminator());
auto *CI = dyn_cast<CmpInst>(BI->getCondition());
if (!CI)
continue;
@@ -266,7 +256,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
}
// PS1 must end with an unconditional branch.
- if (!PBI1 || !PBI1->isUnconditional())
+ if (!PBI1)
return false;
// PS2 should not contain PHI node.
@@ -276,7 +266,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
// Do the transformation.
BasicBlock *CB;
- BranchInst *PBI = cast<BranchInst>(FirstCondBlock->getTerminator());
+ CondBrInst *PBI = cast<CondBrInst>(FirstCondBlock->getTerminator());
bool Iteration = true;
IRBuilder<>::InsertPointGuard Guard(Builder);
Value *PC = PBI->getCondition();
@@ -286,7 +276,7 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
// Delete the conditional branch.
FirstCondBlock->back().eraseFromParent();
FirstCondBlock->splice(FirstCondBlock->end(), CB);
- PBI = cast<BranchInst>(FirstCondBlock->getTerminator());
+ PBI = cast<CondBrInst>(FirstCondBlock->getTerminator());
Value *CC = PBI->getCondition();
// Merge conditions.
Builder.SetInsertPoint(PBI);
@@ -412,7 +402,7 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {
return false;
BasicBlock *IfTrue2, *IfFalse2;
- BranchInst *DomBI2 = GetIfCondition(BB, IfTrue2, IfFalse2);
+ CondBrInst *DomBI2 = GetIfCondition(BB, IfTrue2, IfFalse2);
if (!DomBI2)
return false;
Instruction *CInst2 = dyn_cast<Instruction>(DomBI2->getCondition());
@@ -424,7 +414,7 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {
return false;
BasicBlock *IfTrue1, *IfFalse1;
- BranchInst *DomBI1 = GetIfCondition(SecondEntryBlock, IfTrue1, IfFalse1);
+ CondBrInst *DomBI1 = GetIfCondition(SecondEntryBlock, IfTrue1, IfFalse1);
if (!DomBI1)
return false;
Instruction *CInst1 = dyn_cast<Instruction>(DomBI1->getCondition());
@@ -485,7 +475,7 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {
// Merge \param SecondEntryBlock into \param FirstEntryBlock.
FirstEntryBlock->back().eraseFromParent();
FirstEntryBlock->splice(FirstEntryBlock->end(), SecondEntryBlock);
- BranchInst *PBI = cast<BranchInst>(FirstEntryBlock->getTerminator());
+ CondBrInst *PBI = cast<CondBrInst>(FirstEntryBlock->getTerminator());
assert(PBI->getCondition() == CInst2);
BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
diff --git a/llvm/lib/Transforms/Utils/GuardUtils.cpp b/llvm/lib/Transforms/Utils/GuardUtils.cpp
index 46ad951d0a812..299518ebb630e 100644
--- a/llvm/lib/Transforms/Utils/GuardUtils.cpp
+++ b/llvm/lib/Transforms/Utils/GuardUtils.cpp
@@ -36,7 +36,7 @@ void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic,
auto *DeoptBlockTerm =
SplitBlockAndInsertIfThen(Guard->getArgOperand(0), Guard, true);
- auto *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
+ auto *CheckBI = cast<CondBrInst>(CheckBB->getTerminator());
// SplitBlockAndInsertIfThen inserts control flow that branches to
// DeoptBlockTerm if the condition is true. We want the opposite.
@@ -78,8 +78,7 @@ void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic,
}
}
-
-void llvm::widenWidenableBranch(BranchInst *WidenableBR, Value *NewCond) {
+void llvm::widenWidenableBranch(CondBrInst *WidenableBR, Value *NewCond) {
assert(isWidenableBranch(WidenableBR) && "precondition");
// The tempting trivially option is to produce something like this:
@@ -105,7 +104,7 @@ void llvm::widenWidenableBranch(BranchInst *WidenableBR, Value *NewCond) {
assert(isWidenableBranch(WidenableBR) && "preserve widenabiliy");
}
-void llvm::setWidenableBranchCond(BranchInst *WidenableBR, Value *NewCond) {
+void llvm::setWidenableBranchCond(CondBrInst *WidenableBR, Value *NewCond) {
assert(isWidenableBranch(WidenableBR) && "precondition");
Use *C, *WC;
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index a779a25f7378e..8866c99453318 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -240,7 +240,7 @@ void LandingPadInliningInfo::forwardResume(
BasicBlock *Dest = getInnerResumeDest();
BasicBlock *Src = RI->getParent();
- auto *BI = BranchInst::Create(Dest, Src);
+ auto *BI = UncondBrInst::Create(Dest, Src);
BI->setDebugLoc(RI->getDebugLoc());
// Update the PHIs in the destination. They were inserted in an order which
@@ -1550,9 +1550,9 @@ static AttrBuilder IdentifyValidPoisonGeneratingAttributes(CallBase &CB) {
static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap,
ClonedCodeInfo &InlinedFunctionInfo) {
- AttrBuilder ValidUB = IdentifyValidUBGeneratingAttributes(CB);
- AttrBuilder ValidPG = IdentifyValidPoisonGeneratingAttributes(CB);
- if (!ValidUB.hasAttributes() && !ValidPG.hasAttributes())
+ AttrBuilder CallSiteValidUB = IdentifyValidUBGeneratingAttributes(CB);
+ AttrBuilder CallSiteValidPG = IdentifyValidPoisonGeneratingAttributes(CB);
+ if (!CallSiteValidUB.hasAttributes() && !CallSiteValidPG.hasAttributes())
return;
auto *CalledFunction = CB.getCalledFunction();
auto &Context = CalledFunction->getContext();
@@ -1600,6 +1600,8 @@ static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap,
// with a differing value, the AttributeList's merge API honours the already
// existing attribute value (i.e. attributes such as dereferenceable,
// dereferenceable_or_null etc). See AttrBuilder::merge for more details.
+ AttrBuilder ValidUB = IdentifyValidUBGeneratingAttributes(CB);
+ AttrBuilder ValidPG = IdentifyValidPoisonGeneratingAttributes(CB);
AttributeList AL = NewRetVal->getAttributes();
if (ValidUB.getDereferenceableBytes() < AL.getRetDereferenceableBytes())
ValidUB.removeAttribute(Attribute::Dereferenceable);
@@ -3236,7 +3238,8 @@ void llvm::InlineFunctionImpl(CallBase &CB, InlineFunctionInfo &IFI,
// If the call site was an invoke instruction, add a branch to the normal
// destination.
if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
- BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), CB.getIterator());
+ UncondBrInst *NewBr =
+ UncondBrInst::Create(II->getNormalDest(), CB.getIterator());
NewBr->setDebugLoc(Returns[0]->getDebugLoc());
}
@@ -3269,11 +3272,12 @@ void llvm::InlineFunctionImpl(CallBase &CB, InlineFunctionInfo &IFI,
// "starter" and "ender" blocks. How we accomplish this depends on whether
// this is an invoke instruction or a call instruction.
BasicBlock *AfterCallBB;
- BranchInst *CreatedBranchToNormalDest = nullptr;
+ UncondBrInst *CreatedBranchToNormalDest = nullptr;
if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
// Add an unconditional branch to make this look like the CallInst case...
- CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), CB.getIterator());
+ CreatedBranchToNormalDest =
+ UncondBrInst::Create(II->getNormalDest(), CB.getIterator());
// We intend to replace this DebugLoc with another later.
CreatedBranchToNormalDest->setDebugLoc(DebugLoc::getTemporary());
@@ -3301,10 +3305,8 @@ void llvm::InlineFunctionImpl(CallBase &CB, InlineFunctionInfo &IFI,
// Change the branch that used to go to AfterCallBB to branch to the first
// basic block of the inlined function.
//
- Instruction *Br = OrigBB->getTerminator();
- assert(Br && Br->getOpcode() == Instruction::UncondBr &&
- "splitBasicBlock broken!");
- Br->setOperand(0, &*FirstNewBlock);
+ UncondBrInst *Br = cast<UncondBrInst>(OrigBB->getTerminator());
+ Br->setSuccessor(&*FirstNewBlock);
// Now that the function is correct, make it a little bit nicer. In
// particular, move the basic blocks inserted from the end of the function
@@ -3341,7 +3343,7 @@ void llvm::InlineFunctionImpl(CallBase &CB, InlineFunctionInfo &IFI,
// Add a branch to the merge points and remove return instructions.
DebugLoc Loc;
for (ReturnInst *RI : Returns) {
- BranchInst *BI = BranchInst::Create(AfterCallBB, RI->getIterator());
+ UncondBrInst *BI = UncondBrInst::Create(AfterCallBB, RI->getIterator());
Loc = RI->getDebugLoc();
BI->setDebugLoc(Loc);
RI->eraseFromParent();
@@ -3398,8 +3400,7 @@ void llvm::InlineFunctionImpl(CallBase &CB, InlineFunctionInfo &IFI,
// We should always be able to fold the entry block of the function into the
// single predecessor of the block...
- assert(cast<BranchInst>(Br)->isUnconditional() && "splitBasicBlock broken!");
- BasicBlock *CalleeEntry = cast<BranchInst>(Br)->getSuccessor(0);
+ BasicBlock *CalleeEntry = Br->getSuccessor();
// Splice the code entry block into calling block, right before the
// unconditional branch.
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index bd617cd003a76..84c0989a7fe07 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -138,9 +138,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
IRBuilder<> Builder(T);
// Branch - See if we are conditional jumping on constant
- if (auto *BI = dyn_cast<BranchInst>(T)) {
- if (BI->isUnconditional()) return false; // Can't optimize uncond branch
-
+ if (auto *BI = dyn_cast<CondBrInst>(T)) {
BasicBlock *Dest1 = BI->getSuccessor(0);
BasicBlock *Dest2 = BI->getSuccessor(1);
@@ -154,7 +152,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
Dest1->removePredecessor(BI->getParent());
// Replace the conditional branch with an unconditional one.
- BranchInst *NewBI = Builder.CreateBr(Dest1);
+ UncondBrInst *NewBI = Builder.CreateBr(Dest1);
// Transfer the metadata to the new branch instruction.
NewBI->copyMetadata(*BI, {LLVMContext::MD_loop, LLVMContext::MD_dbg,
@@ -178,7 +176,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
OldDest->removePredecessor(BB);
// Replace the conditional branch with an unconditional one.
- BranchInst *NewBI = Builder.CreateBr(Destination);
+ UncondBrInst *NewBI = Builder.CreateBr(Destination);
// Transfer the metadata to the new branch instruction.
NewBI->copyMetadata(*BI, {LLVMContext::MD_loop, LLVMContext::MD_dbg,
@@ -317,9 +315,8 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
FirstCase.getCaseValue(), "cond");
// Insert the new branch.
- BranchInst *NewBr = Builder.CreateCondBr(Cond,
- FirstCase.getCaseSuccessor(),
- SI->getDefaultDest());
+ CondBrInst *NewBr = Builder.CreateCondBr(
+ Cond, FirstCase.getCaseSuccessor(), SI->getDefaultDest());
SmallVector<uint32_t> Weights;
if (extractBranchWeights(*SI, Weights) && Weights.size() == 2) {
uint32_t DefWeight = Weights[0];
@@ -1161,7 +1158,7 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB,
"TryToSimplifyUncondBranchFromEmptyBlock called on entry block!");
// We can't simplify infinite loops.
- BasicBlock *Succ = cast<BranchInst>(BB->getTerminator())->getSuccessor(0);
+ BasicBlock *Succ = cast<UncondBrInst>(BB->getTerminator())->getSuccessor(0);
if (BB == Succ)
return false;
@@ -2602,7 +2599,7 @@ CallInst *llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) {
// Follow the call by a branch to the normal destination.
BasicBlock *NormalDestBB = II->getNormalDest();
- auto *BI = BranchInst::Create(NormalDestBB, II->getIterator());
+ auto *BI = UncondBrInst::Create(NormalDestBB, II->getIterator());
// Although it takes place after the call itself, the new branch is still
// performing part of the control-flow functionality of the invoke, so we use
// II's DebugLoc.
@@ -2661,13 +2658,12 @@ BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
return Split;
}
-static bool markAliveBlocks(Function &F,
- SmallPtrSetImpl<BasicBlock *> &Reachable,
+static bool markAliveBlocks(Function &F, SmallVectorImpl<bool> &Reachable,
DomTreeUpdater *DTU = nullptr) {
SmallVector<BasicBlock*, 128> Worklist;
BasicBlock *BB = &F.front();
Worklist.push_back(BB);
- Reachable.insert(BB);
+ Reachable[BB->getNumber()] = true;
bool Changed = false;
do {
BB = Worklist.pop_back_val();
@@ -2773,6 +2769,7 @@ static bool markAliveBlocks(Function &F,
BasicBlock *UnreachableNormalDest = BasicBlock::Create(
Ctx, OrigNormalDest->getName() + ".unreachable",
II->getFunction(), OrigNormalDest);
+ Reachable.resize(II->getFunction()->getMaxBlockNumber());
auto *UI = new UnreachableInst(Ctx, UnreachableNormalDest);
UI->setDebugLoc(DebugLoc::getTemporary());
II->setNormalDest(UnreachableNormalDest);
@@ -2787,7 +2784,7 @@ static bool markAliveBlocks(Function &F,
// jump to the normal destination branch.
BasicBlock *NormalDestBB = II->getNormalDest();
BasicBlock *UnwindDestBB = II->getUnwindDest();
- BranchInst::Create(NormalDestBB, II->getIterator());
+ UncondBrInst::Create(NormalDestBB, II->getIterator());
UnwindDestBB->removePredecessor(II->getParent());
II->eraseFromParent();
if (DTU)
@@ -2853,9 +2850,12 @@ static bool markAliveBlocks(Function &F,
}
Changed |= ConstantFoldTerminator(BB, true, nullptr, DTU);
- for (BasicBlock *Successor : successors(BB))
- if (Reachable.insert(Successor).second)
+ for (BasicBlock *Successor : successors(BB)) {
+ if (!Reachable[Successor->getNumber()]) {
Worklist.push_back(Successor);
+ Reachable[Successor->getNumber()] = true;
+ }
+ }
} while (!Worklist.empty());
return Changed;
}
@@ -2900,20 +2900,14 @@ Instruction *llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) {
/// otherwise.
bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU,
MemorySSAUpdater *MSSAU) {
- SmallPtrSet<BasicBlock *, 16> Reachable;
+ SmallVector<bool, 16> Reachable(F.getMaxBlockNumber());
bool Changed = markAliveBlocks(F, Reachable, DTU);
- // If there are unreachable blocks in the CFG...
- if (Reachable.size() == F.size())
- return Changed;
-
- assert(Reachable.size() < F.size());
-
// Are there any blocks left to actually delete?
SmallSetVector<BasicBlock *, 8> BlocksToRemove;
for (BasicBlock &BB : F) {
// Skip reachable basic blocks
- if (Reachable.count(&BB))
+ if (Reachable[BB.getNumber()])
continue;
// Skip already-deleted blocks
if (DTU && DTU->isBBPendingDeletion(&BB))
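[Not part of the patch.] Independent of the branch-class split, the Local.cpp hunk above changes removeUnreachableBlocks to track reachability in a bit vector indexed by basic-block numbers instead of a pointer set. A rough sketch of that pattern (assuming block numbers stay valid during the walk; the actual patch additionally resizes the vector when markAliveBlocks creates new blocks):

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/CFG.h"
  #include "llvm/IR/Function.h"
  using namespace llvm;

  static SmallVector<bool, 16> computeReachable(Function &F) {
    SmallVector<bool, 16> Reachable(F.getMaxBlockNumber());
    SmallVector<BasicBlock *, 128> Worklist;
    BasicBlock *Entry = &F.front();
    Reachable[Entry->getNumber()] = true;
    Worklist.push_back(Entry);
    while (!Worklist.empty()) {
      BasicBlock *BB = Worklist.pop_back_val();
      for (BasicBlock *Succ : successors(BB))
        if (!Reachable[Succ->getNumber()]) {
          Reachable[Succ->getNumber()] = true;
          Worklist.push_back(Succ);
        }
    }
    return Reachable;
  }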
diff --git a/llvm/lib/Transforms/Utils/LoopConstrainer.cpp b/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
index fff6a7bca26a3..aa2f842a39728 100644
--- a/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
+++ b/llvm/lib/Transforms/Utils/LoopConstrainer.cpp
@@ -156,8 +156,8 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, Loop &L,
return std::nullopt;
}
- BranchInst *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
- if (!LatchBr || LatchBr->isUnconditional()) {
+ CondBrInst *LatchBr = dyn_cast<CondBrInst>(Latch->getTerminator());
+ if (!LatchBr) {
FailureReason = "latch terminator not conditional branch";
return std::nullopt;
}
@@ -603,7 +603,7 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F,
BBInsertLocation);
- BranchInst *PreheaderJump = cast<BranchInst>(Preheader->getTerminator());
+ Instruction *PreheaderJump = Preheader->getTerminator();
bool Increasing = LS.IndVarIncreasing;
bool IsSignedPredicate = LS.IsSignedPredicate;
@@ -647,8 +647,8 @@ LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
Value *IterationsLeft = B.CreateICmp(Pred, IndVarBase, LoopExitAt);
B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit);
- BranchInst *BranchToContinuation =
- BranchInst::Create(ContinuationBlock, RRI.PseudoExit);
+ UncondBrInst *BranchToContinuation =
+ UncondBrInst::Create(ContinuationBlock, RRI.PseudoExit);
// We emit PHI nodes into `RRI.PseudoExit' that compute the "latest" value of
// each of the PHI nodes in the loop header. This feeds into the initial
@@ -690,7 +690,7 @@ BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS,
BasicBlock *OldPreheader,
const char *Tag) const {
BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header);
- BranchInst::Create(LS.Header, Preheader);
+ UncondBrInst::Create(LS.Header, Preheader);
LS.Header->replacePhiUsesWith(OldPreheader, Preheader);
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index fa089f8292654..91737b5c30533 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -708,8 +708,8 @@ countToEliminateCompares(Loop &L, unsigned MaxPeelCount, ScalarEvolution &SE,
ComputePeelCountMinMax(MinMax);
}
- auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BI || BI->isUnconditional())
+ auto *BI = dyn_cast<CondBrInst>(BB->getTerminator());
+ if (!BI)
continue;
// Ignore loop exit condition.
@@ -731,8 +731,8 @@ static bool violatesLegacyMultiExitLoopCheck(Loop *L) {
if (!Latch)
return true;
- BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
- if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))
+ CondBrInst *LatchBR = dyn_cast<CondBrInst>(Latch->getTerminator());
+ if (!LatchBR || !L->isLoopExiting(Latch))
return true;
assert((LatchBR->getSuccessor(0) == L->getHeader() ||
@@ -972,7 +972,7 @@ static void cloneLoopBlocks(
// This is the last iteration and we definitely will go to the exit. Just
// set both successors to InsertBot and let the branch be simplified later.
assert(IterNumber == 0 && "Only peeling a single iteration implemented.");
- auto *LatchTerm = cast<BranchInst>(NewLatch->getTerminator());
+ auto *LatchTerm = cast<CondBrInst>(NewLatch->getTerminator());
LatchTerm->setSuccessor(0, InsertBot);
LatchTerm->setSuccessor(1, InsertBot);
} else {
@@ -1217,7 +1217,7 @@ void llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI,
NewPreHeader = SplitEdge(PreHeader, Header, &DT, LI);
SCEVExpander Expander(*SE, "loop-peel");
- BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
+ Instruction *PreHeaderBR = PreHeader->getTerminator();
Value *BTCValue =
Expander.expandCodeFor(BTC, BTC->getType(), PreHeaderBR);
IRBuilder<> B(PreHeaderBR);
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 3e11db7dd6e6b..bf236d48f58f9 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -183,8 +183,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
// This means that rotating the loop can remove the phi.
static bool profitableToRotateLoopExitingLatch(Loop *L) {
BasicBlock *Header = L->getHeader();
- BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator());
- assert(BI && BI->isConditional() && "need header with conditional exit");
+ CondBrInst *BI = cast<CondBrInst>(Header->getTerminator());
BasicBlock *HeaderExit = BI->getSuccessor(0);
if (L->contains(HeaderExit))
HeaderExit = BI->getSuccessor(1);
@@ -200,7 +199,7 @@ static bool profitableToRotateLoopExitingLatch(Loop *L) {
return false;
}
-static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI,
+static void updateBranchWeights(CondBrInst &PreHeaderBI, CondBrInst &LoopBI,
bool HasConditionalPreHeader,
bool SuccsSwapped) {
MDNode *WeightMD = getBranchWeightMDNode(PreHeaderBI);
@@ -346,8 +345,8 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
BasicBlock *OrigHeader = L->getHeader();
BasicBlock *OrigLatch = L->getLoopLatch();
- BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
- if (!BI || BI->isUnconditional())
+ CondBrInst *BI = dyn_cast<CondBrInst>(OrigHeader->getTerminator());
+ if (!BI)
return Rotated;
// If the loop header is not one of the loop exiting blocks then
@@ -741,8 +740,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// then we fold away the cond branch to an uncond branch. This simplifies the
// loop in cases important for nested loops, and it also means we don't have
// to split as many edges.
- BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
- assert(PHBI->isConditional() && "Should be clone of BI condbr!");
+ CondBrInst *PHBI = cast<CondBrInst>(OrigPreheader->getTerminator());
const Value *Cond = PHBI->getCondition();
const bool HasConditionalPreHeader =
!isa<ConstantInt>(Cond) ||
@@ -787,7 +785,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// We can fold the conditional branch in the preheader, this makes things
// simpler. The first step is to remove the extra edge to the Exit block.
Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
- BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI->getIterator());
+ UncondBrInst *NewBI = UncondBrInst::Create(NewHeader, PHBI->getIterator());
NewBI->setDebugLoc(PHBI->getDebugLoc());
PHBI->eraseFromParent();
@@ -903,16 +901,15 @@ bool LoopRotate::simplifyLoopLatch(Loop *L) {
if (!Latch || Latch->hasAddressTaken())
return false;
- BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
- if (!Jmp || !Jmp->isUnconditional())
+ UncondBrInst *Jmp = dyn_cast<UncondBrInst>(Latch->getTerminator());
+ if (!Jmp)
return false;
BasicBlock *LastExit = Latch->getSinglePredecessor();
if (!LastExit || !L->isLoopExiting(LastExit))
return false;
- BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
- if (!BI)
+ if (!isa<UncondBrInst, CondBrInst>(LastExit->getTerminator()))
return false;
if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index 798a11af774a2..efd43c250af9a 100644
--- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -381,7 +381,7 @@ static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
// Create and insert the new backedge block.
BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(),
Header->getName() + ".backedge", F);
- BranchInst *BETerminator = BranchInst::Create(Header, BEBlock);
+ UncondBrInst *BETerminator = UncondBrInst::Create(Header, BEBlock);
BETerminator->setDebugLoc(Header->getFirstNonPHIIt()->getDebugLoc());
LLVM_DEBUG(dbgs() << "LoopSimplify: Inserting unique backedge block "
@@ -518,20 +518,19 @@ static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
SmallVector<BasicBlock*, 8> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
for (BasicBlock *ExitingBlock : ExitingBlocks)
- if (BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator()))
- if (BI->isConditional()) {
- if (UndefValue *Cond = dyn_cast<UndefValue>(BI->getCondition())) {
+ if (CondBrInst *BI = dyn_cast<CondBrInst>(ExitingBlock->getTerminator())) {
+ if (UndefValue *Cond = dyn_cast<UndefValue>(BI->getCondition())) {
- LLVM_DEBUG(dbgs()
- << "LoopSimplify: Resolving \"br i1 undef\" to exit in "
- << ExitingBlock->getName() << "\n");
+ LLVM_DEBUG(
+ dbgs() << "LoopSimplify: Resolving \"br i1 undef\" to exit in "
+ << ExitingBlock->getName() << "\n");
- BI->setCondition(ConstantInt::get(Cond->getType(),
- !L->contains(BI->getSuccessor(0))));
+ BI->setCondition(ConstantInt::get(Cond->getType(),
+ !L->contains(BI->getSuccessor(0))));
- Changed = true;
- }
+ Changed = true;
}
+ }
// Does the loop already have a preheader? If so, don't insert one.
BasicBlock *Preheader = L->getLoopPreheader();
@@ -629,8 +628,9 @@ static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
if (HasUniqueExitBlock()) {
for (BasicBlock *ExitingBlock : ExitingBlocks) {
if (!ExitingBlock->getSinglePredecessor()) continue;
- BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
- if (!BI || !BI->isConditional()) continue;
+ CondBrInst *BI = dyn_cast<CondBrInst>(ExitingBlock->getTerminator());
+ if (!BI)
+ continue;
CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition());
if (!CI || CI->getParent() != ExitingBlock) continue;
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index c2821f36fb4d2..6a2ccbea996bf 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -521,7 +521,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
for (auto *ExitingBlock : ExitingBlocks) {
// The folding code is not prepared to deal with non-branch instructions
// right now.
- auto *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+ auto *BI = dyn_cast<CondBrInst>(ExitingBlock->getTerminator());
if (!BI)
continue;
@@ -572,12 +572,13 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
// (2b) latch is conditional and is an exiting block
// FIXME: The implementation can be extended to work with more complicated
// cases, e.g. loops with multiple latches.
- BranchInst *LatchBI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+ Instruction *LatchTerm = LatchBlock->getTerminator();
// A conditional branch which exits the loop, which can be optimized to an
// unconditional branch in the unrolled loop in some cases.
bool LatchIsExiting = L->isLoopExiting(LatchBlock);
- if (!LatchBI || (LatchBI->isConditional() && !LatchIsExiting)) {
+ if (!isa<UncondBrInst>(LatchTerm) &&
+ !(isa<CondBrInst>(LatchTerm) && LatchIsExiting)) {
LLVM_DEBUG(
dbgs() << "Can't unroll; a conditional latch must exit the loop");
return LoopUnrollResult::Unmodified;
@@ -952,7 +953,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
SmallVector<DominatorTree::UpdateType> DTUpdates;
auto SetDest = [&](BasicBlock *Src, bool WillExit, bool ExitOnTrue) {
- auto *Term = cast<BranchInst>(Src->getTerminator());
+ auto *Term = cast<CondBrInst>(Src->getTerminator());
const unsigned Idx = ExitOnTrue ^ WillExit;
BasicBlock *Dest = Term->getSuccessor(Idx);
BasicBlock *DeadSucc = Term->getSuccessor(1-Idx);
@@ -961,7 +962,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
DeadSucc->removePredecessor(Src, /* KeepOneInputPHIs */ true);
// Replace the conditional branch with an unconditional one.
- auto *BI = BranchInst::Create(Dest, Term->getIterator());
+ auto *BI = UncondBrInst::Create(Dest, Term->getIterator());
BI->setDebugLoc(Term->getDebugLoc());
Term->eraseFromParent();
@@ -1064,13 +1065,12 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
// Merge adjacent basic blocks, if possible.
for (BasicBlock *Latch : Latches) {
- BranchInst *Term = dyn_cast<BranchInst>(Latch->getTerminator());
- assert((Term ||
+ assert((isa<UncondBrInst, CondBrInst>(Latch->getTerminator()) ||
(CompletelyUnroll && !LatchIsExiting && Latch == Latches.back())) &&
"Need a branch as terminator, except when fully unrolling with "
"unconditional latch");
- if (Term && Term->isUnconditional()) {
- BasicBlock *Dest = Term->getSuccessor(0);
+ if (auto *Term = dyn_cast<UncondBrInst>(Latch->getTerminator())) {
+ BasicBlock *Dest = Term->getSuccessor();
BasicBlock *Fold = Dest->getUniquePredecessor();
if (MergeBlockIntoPredecessor(Dest, /*DTU=*/DTUToUse, LI,
/*MSSAU=*/nullptr, /*MemDep=*/nullptr,
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index 1e614bd29ee6e..1a08a3d115d9e 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -293,8 +293,7 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
BasicBlock *LatchBlock = L->getLoopLatch();
assert(Preheader && "No preheader");
assert(LatchBlock && "No latch block");
- BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
- assert(BI && !BI->isUnconditional());
+ CondBrInst *BI = cast<CondBrInst>(LatchBlock->getTerminator());
bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
bool SubLoopContinueOnTrue = SubLoop->contains(
@@ -485,10 +484,9 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
LastValueMap);
// Update ForeBlocks successors and phi nodes
- BranchInst *ForeTerm =
- cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
- assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
- ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]);
+ UncondBrInst *ForeTerm =
+ cast<UncondBrInst>(ForeBlocksLast.back()->getTerminator());
+ ForeTerm->setSuccessor(SubLoopBlocksFirst[0]);
if (CompletelyUnroll) {
while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
@@ -503,15 +501,14 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
for (unsigned It = 1; It != Count; It++) {
// Remap ForeBlock successors from previous iteration to this
- BranchInst *ForeTerm =
- cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
- assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
- ForeTerm->setSuccessor(0, ForeBlocksFirst[It]);
+ UncondBrInst *ForeTerm =
+ cast<UncondBrInst>(ForeBlocksLast[It - 1]->getTerminator());
+ ForeTerm->setSuccessor(ForeBlocksFirst[It]);
}
// Subloop successors and phis
- BranchInst *SubTerm =
- cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
+ CondBrInst *SubTerm =
+ cast<CondBrInst>(SubLoopBlocksLast.back()->getTerminator());
SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
SubLoopBlocksFirst[0]->replacePhiUsesWith(ForeBlocksLast[0],
@@ -522,9 +519,9 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
for (unsigned It = 1; It != Count; It++) {
// Replace the conditional branch of the previous iteration subloop with an
// unconditional one to this one
- BranchInst *SubTerm =
- cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
- BranchInst::Create(SubLoopBlocksFirst[It], SubTerm->getIterator());
+ CondBrInst *SubTerm =
+ cast<CondBrInst>(SubLoopBlocksLast[It - 1]->getTerminator());
+ UncondBrInst::Create(SubLoopBlocksFirst[It], SubTerm->getIterator());
SubTerm->eraseFromParent();
SubLoopBlocksFirst[It]->replacePhiUsesWith(ForeBlocksLast[It],
@@ -535,9 +532,9 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
}
// Aft blocks successors and phis
- BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
+ CondBrInst *AftTerm = cast<CondBrInst>(AftBlocksLast.back()->getTerminator());
if (CompletelyUnroll) {
- BranchInst::Create(LoopExit, AftTerm->getIterator());
+ UncondBrInst::Create(LoopExit, AftTerm->getIterator());
AftTerm->eraseFromParent();
} else {
AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
@@ -550,9 +547,9 @@ llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
for (unsigned It = 1; It != Count; It++) {
// Replace the conditional branch of the previous iteration subloop with an
// unconditional one to this one
- BranchInst *AftTerm =
- cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
- BranchInst::Create(AftBlocksFirst[It], AftTerm->getIterator());
+ CondBrInst *AftTerm =
+ cast<CondBrInst>(AftBlocksLast[It - 1]->getTerminator());
+ UncondBrInst::Create(AftBlocksFirst[It], AftTerm->getIterator());
AftTerm->eraseFromParent();
AftBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 0cfd4a59bb4e0..68751cf95bc7a 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -376,7 +376,7 @@ static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
MDBuilder MDB(B.getContext());
BranchWeights = MDB.createBranchWeights(1, Count - 1);
}
- BranchInst *RemainderLoopGuard =
+ CondBrInst *RemainderLoopGuard =
B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit, BranchWeights);
if (!OriginalLoopProb.isUnknown()) {
setBranchProbability(RemainderLoopGuard,
@@ -454,7 +454,7 @@ static Loop *CloneLoopBlocks(Loop *L, Value *NewIter,
// Subtle: NewIter can be 0 if we wrapped when computing the trip count,
// thus we must compare the post-increment (wrapping) value.
BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
- BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
+ CondBrInst *LatchBR = cast<CondBrInst>(NewBB->getTerminator());
IRBuilder<> Builder(LatchBR);
PHINode *NewIdx =
PHINode::Create(NewIter->getType(), 2, suffix + ".iter");
@@ -484,7 +484,7 @@ static Loop *CloneLoopBlocks(Loop *L, Value *NewIter,
MDBuilder MDB(Builder.getContext());
BranchWeights = MDB.createBranchWeights(BackEdgeWeight, ExitWeight);
}
- BranchInst *RemainderLoopLatch =
+ CondBrInst *RemainderLoopLatch =
Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot, BranchWeights);
if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) {
// Compute the total frequency of the original loop body from the
@@ -693,9 +693,9 @@ bool llvm::UnrollRuntimeLoopRemainder(
BasicBlock *Latch = L->getLoopLatch();
BasicBlock *Header = L->getHeader();
- BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+ CondBrInst *LatchBR = dyn_cast<CondBrInst>(Latch->getTerminator());
- if (!LatchBR || LatchBR->isUnconditional()) {
+ if (!LatchBR) {
// The loop-rotate pass can be helpful to avoid this in many cases.
LLVM_DEBUG(
dbgs()
@@ -770,7 +770,7 @@ bool llvm::UnrollRuntimeLoopRemainder(
}
BasicBlock *PreHeader = L->getLoopPreheader();
- BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
+ Instruction *PreHeaderBR = PreHeader->getTerminator();
SCEVExpander Expander(*SE, "loop-unroll");
if (!AllowExpensiveTripCount &&
Expander.isHighCostExpansion(TripCountSC, L, SCEVExpansionBudget, TTI,
@@ -861,7 +861,7 @@ bool llvm::UnrollRuntimeLoopRemainder(
// in epilog case and around prolog remainder loop in prolog case.
// Compute the number of extra iterations required, which is:
// extra iterations = run-time trip count % loop unroll factor
- PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
+ PreHeaderBR = PreHeader->getTerminator();
IRBuilder<> B(PreHeaderBR);
Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
PreHeaderBR);
@@ -902,7 +902,7 @@ bool llvm::UnrollRuntimeLoopRemainder(
MDBuilder MDB(B.getContext());
BranchWeights = MDB.createBranchWeights(EpilogHeaderWeights);
}
- BranchInst *UnrollingLoopGuard =
+ CondBrInst *UnrollingLoopGuard =
B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop, BranchWeights);
if (!OriginalLoopProb.isUnknown() && UseEpilogRemainder) {
// The original loop's first iteration always happens. Compute the
@@ -1050,7 +1050,7 @@ bool llvm::UnrollRuntimeLoopRemainder(
// thus we must compare the post-increment (wrapping) value.
IRBuilder<> B2(NewPreHeader->getTerminator());
Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");
- BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+ CondBrInst *LatchBR = cast<CondBrInst>(Latch->getTerminator());
PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter");
NewIdx->insertBefore(Header->getFirstNonPHIIt());
B2.SetInsertPoint(LatchBR);
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 2d7cb2a035957..c68f478339090 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -730,14 +730,12 @@ void llvm::breakLoopBackedge(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
// Update the CFG and domtree. We chose to special case a couple
// of common cases for code quality and test readability reasons.
[&]() -> void {
- if (auto *BI = dyn_cast<BranchInst>(Latch->getTerminator())) {
- if (!BI->isConditional()) {
- DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Eager);
- (void)changeToUnreachable(BI, /*PreserveLCSSA*/ true, &DTU,
- MSSAU.get());
- return;
- }
-
+ if (auto *BI = dyn_cast<UncondBrInst>(Latch->getTerminator())) {
+ DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Eager);
+ (void)changeToUnreachable(BI, /*PreserveLCSSA*/ true, &DTU, MSSAU.get());
+ return;
+ }
+ if (auto *BI = dyn_cast<CondBrInst>(Latch->getTerminator())) {
// Conditional latch/exit - note that latch can be shared by inner
// and outer loop so the other target doesn't need to be an exit
if (L->isLoopExiting(Latch)) {
@@ -793,13 +791,13 @@ void llvm::breakLoopBackedge(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
/// Checks if \p L has an exiting latch branch. There may also be other
/// exiting blocks. Returns branch instruction terminating the loop
/// latch if above check is successful, nullptr otherwise.
-static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
+static CondBrInst *getExpectedExitLoopLatchBranch(Loop *L) {
BasicBlock *Latch = L->getLoopLatch();
if (!Latch)
return nullptr;
- BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
- if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))
+ CondBrInst *LatchBR = dyn_cast<CondBrInst>(Latch->getTerminator());
+ if (!LatchBR || !L->isLoopExiting(Latch))
return nullptr;
assert((LatchBR->getSuccessor(0) == L->getHeader() ||
@@ -827,7 +825,7 @@ static std::optional<unsigned> estimateLoopTripCount(Loop *L) {
// ignoring other exiting blocks. This can overestimate the trip count
// if we exit through another exit, but can never underestimate it.
// TODO: incorporate information from other exits
- BranchInst *ExitingBranch = getExpectedExitLoopLatchBranch(L);
+ CondBrInst *ExitingBranch = getExpectedExitLoopLatchBranch(L);
if (!ExitingBranch) {
LLVM_DEBUG(dbgs() << "estimateLoopTripCount: Failed to find exiting "
<< "latch branch of required form in " << DbgLoop(L)
@@ -895,7 +893,7 @@ llvm::getLoopEstimatedTripCount(Loop *L,
// - To satisfy the condition for the outer loop, the latch must have a third
// successor that is an exit for the outer loop. But that violates the
// condition for both loops.
- BranchInst *ExitingBranch = getExpectedExitLoopLatchBranch(L);
+ CondBrInst *ExitingBranch = getExpectedExitLoopLatchBranch(L);
if (!ExitingBranch)
return std::nullopt;
@@ -952,7 +950,7 @@ bool llvm::setLoopEstimatedTripCount(
//
// FIXME: See comments in getLoopEstimatedTripCount for why this is required
// here regardless of EstimatedLoopInvocationWeight.
- BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+ CondBrInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
if (!LatchBranch)
return false;
@@ -990,7 +988,7 @@ bool llvm::setLoopEstimatedTripCount(
}
BranchProbability llvm::getLoopProbability(Loop *L) {
- BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+ CondBrInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
if (!LatchBranch)
return BranchProbability::getUnknown();
bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader();
@@ -998,17 +996,16 @@ BranchProbability llvm::getLoopProbability(Loop *L) {
}
bool llvm::setLoopProbability(Loop *L, BranchProbability P) {
- BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+ CondBrInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
if (!LatchBranch)
return false;
bool FirstTargetIsLoop = LatchBranch->getSuccessor(0) == L->getHeader();
- return setBranchProbability(LatchBranch, P, FirstTargetIsLoop);
+ setBranchProbability(LatchBranch, P, FirstTargetIsLoop);
+ return true;
}
-BranchProbability llvm::getBranchProbability(BranchInst *B,
+BranchProbability llvm::getBranchProbability(CondBrInst *B,
bool ForFirstTarget) {
- if (B->getNumSuccessors() != 2)
- return BranchProbability::getUnknown();
uint64_t Weight0, Weight1;
if (!extractBranchWeights(*B, Weight0, Weight1))
return BranchProbability::getUnknown();
@@ -1054,17 +1051,14 @@ BranchProbability llvm::getBranchProbability(BasicBlock *Src, BasicBlock *Dst) {
return BranchProbability(Numerator, Total);
}
-bool llvm::setBranchProbability(BranchInst *B, BranchProbability P,
+void llvm::setBranchProbability(CondBrInst *B, BranchProbability P,
bool ForFirstTarget) {
- if (B->getNumSuccessors() != 2)
- return false;
BranchProbability Prob0 = P;
BranchProbability Prob1 = P.getCompl();
if (!ForFirstTarget)
std::swap(Prob0, Prob1);
setBranchWeights(*B, {Prob0.getNumerator(), Prob1.getNumerator()},
/*IsExpected=*/false);
- return true;
}
bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
@@ -2244,8 +2238,8 @@ Value *llvm::addDiffRuntimeChecks(
std::optional<IVConditionInfo>
llvm::hasPartialIVCondition(const Loop &L, unsigned MSSAThreshold,
const MemorySSA &MSSA, AAResults &AA) {
- auto *TI = dyn_cast<BranchInst>(L.getHeader()->getTerminator());
- if (!TI || !TI->isConditional())
+ auto *TI = dyn_cast<CondBrInst>(L.getHeader()->getTerminator());
+ if (!TI)
return {};
auto *CondI = dyn_cast<Instruction>(TI->getCondition());
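
A minimal caller-side sketch of the reworked probability helpers above (assuming the CondBrInst class and the void-returning setBranchProbability from this series; the loop and the 7/8 probability are made up for illustration):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

static void biasLatchTowardsHeader(llvm::Loop *L) {
  using namespace llvm;
  BasicBlock *Latch = L->getLoopLatch();
  if (!Latch)
    return;
  // A rotated loop normally ends in a conditional latch branch; with the new
  // class split that is a CondBrInst, so no isConditional() check is needed.
  auto *LatchBR = dyn_cast<CondBrInst>(Latch->getTerminator());
  if (!LatchBR)
    return;
  bool FirstTargetIsLoop = LatchBR->getSuccessor(0) == L->getHeader();
  // The two-successor precondition is implied by the type, which is why
  // setBranchProbability above can return void instead of bool.
  setBranchProbability(LatchBR, BranchProbability(7, 8), FirstTargetIsLoop);
  // Reads back ~7/8 because the call above attached branch weights.
  BranchProbability BackEdgeProb =
      getBranchProbability(LatchBR, FirstTargetIsLoop);
  (void)BackEdgeProb;
}
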
diff --git a/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp b/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp
index df3a2a94b7ddb..d33025adb520c 100644
--- a/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp
+++ b/llvm/lib/Transforms/Utils/LowerGlobalDtors.cpp
@@ -210,7 +210,7 @@ static bool runImpl(Module &M) {
Value *Res = CallInst::Create(AtExit, Args, "call", EntryBB);
Value *Cmp = new ICmpInst(EntryBB, ICmpInst::ICMP_NE, Res,
Constant::getNullValue(Res->getType()));
- BranchInst::Create(FailBB, RetBB, Cmp, EntryBB);
+ CondBrInst::Create(Cmp, FailBB, RetBB, EntryBB);
// If `__cxa_atexit` hits out-of-memory, trap, so that we don't misbehave.
// This should be very rare, because if the process is running out of
diff --git a/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/llvm/lib/Transforms/Utils/LowerInvoke.cpp
index cecb6629773f1..488745886ec37 100644
--- a/llvm/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/llvm/lib/Transforms/Utils/LowerInvoke.cpp
@@ -60,7 +60,7 @@ static bool runImpl(Function &F) {
II->replaceAllUsesWith(NewCall);
// Insert an unconditional branch to the normal destination.
- BranchInst::Create(II->getNormalDest(), II->getIterator());
+ UncondBrInst::Create(II->getNormalDest(), II->getIterator());
// Remove any PHI node entries from the exception destination.
II->getUnwindDest()->removePredecessor(&BB);
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 452de63ca76b8..c934940264c36 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -204,7 +204,7 @@ insertLoopExpansion(Instruction *InsertBefore, Value *Len,
"At least one of the loops must be generated");
BasicBlock *MainLoopBB = nullptr;
- BranchInst *MainLoopBr = nullptr;
+ CondBrInst *MainLoopBr = nullptr;
// Construct the main loop unless we statically known that it is not taken.
if (MayTakeMainLoop) {
@@ -313,7 +313,7 @@ insertLoopExpansion(Instruction *InsertBefore, Value *Len,
LEI.ResidualLoopIP = cast<Instruction>(ResNewIndex);
// Stay in the residual loop until all ResidualUnits are handled.
- BranchInst *BR = ResBuilder.CreateCondBr(
+ CondBrInst *BR = ResBuilder.CreateCondBr(
ResBuilder.CreateICmpULT(ResNewIndex, ResidualUnits), ResidualLoopBB,
PostLoopBB);
@@ -822,9 +822,9 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
ResidualLoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
// How to get to the residual:
- BranchInst *BrInst =
- BranchInst::Create(IntermediateBB, ResidualLoopBB,
- SkipResidualCondition, ThenTerm->getIterator());
+ CondBrInst *BrInst =
+ CondBrInst::Create(SkipResidualCondition, IntermediateBB,
+ ResidualLoopBB, ThenTerm->getIterator());
BrInst->setDebugLoc(DbgLoc);
ThenTerm->eraseFromParent();
@@ -852,8 +852,8 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
// How to get to the main loop:
Instruction *PredBBTerm = PredBB->getTerminator();
- BranchInst *BrInst = BranchInst::Create(
- ExitBB, MainLoopBB, SkipMainCondition, PredBBTerm->getIterator());
+ CondBrInst *BrInst = CondBrInst::Create(
+ SkipMainCondition, ExitBB, MainLoopBB, PredBBTerm->getIterator());
BrInst->setDebugLoc(DbgLoc);
PredBBTerm->eraseFromParent();
}
@@ -891,8 +891,8 @@ static void createMemMoveLoopUnknownSize(Instruction *InsertBefore,
MainLoopBB);
// getting in or skipping the main loop
- BranchInst *BrInst =
- BranchInst::Create(SuccessorBB, MainLoopBB, SkipMainCondition,
+ CondBrInst *BrInst =
+ CondBrInst::Create(SkipMainCondition, SuccessorBB, MainLoopBB,
CopyFwdBBTerm->getIterator());
BrInst->setDebugLoc(DbgLoc);
CopyFwdBBTerm->eraseFromParent();
diff --git a/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/llvm/lib/Transforms/Utils/LowerSwitch.cpp
index 6acbce884fcc0..9313f313cb02d 100644
--- a/llvm/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/llvm/lib/Transforms/Utils/LowerSwitch.cpp
@@ -191,7 +191,7 @@ BasicBlock *NewLeafBlock(CaseRange &Leaf, Value *Val, ConstantInt *LowerBound,
// Make the conditional branch...
BasicBlock *Succ = Leaf.BB;
- BranchInst::Create(Succ, Default, Comp, NewLeaf);
+ CondBrInst::Create(Comp, Succ, Default, NewLeaf);
// Update the PHI incoming value/block for the default.
for (auto &I : Default->phis()) {
@@ -297,7 +297,7 @@ BasicBlock *SwitchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound,
F->insert(++OrigBlock->getIterator(), NewNode);
Comp->insertInto(NewNode, NewNode->end());
- BranchInst::Create(LBranch, RBranch, Comp, NewNode);
+ CondBrInst::Create(Comp, LBranch, RBranch, NewNode);
return NewNode;
}
@@ -377,7 +377,7 @@ void ProcessSwitchInst(SwitchInst *SI,
// If there is only the default destination, just branch.
if (Cases.empty()) {
- BranchInst::Create(Default, OrigBlock);
+ UncondBrInst::Create(Default, OrigBlock);
// Remove all the references from Default's PHIs to OrigBlock, but one.
FixPhis(Default, OrigBlock, OrigBlock, UnsignedMax);
SI->eraseFromParent();
@@ -492,7 +492,7 @@ void ProcessSwitchInst(SwitchInst *SI,
// If there are no cases left, just branch.
if (Cases.empty()) {
- BranchInst::Create(Default, OrigBlock);
+ UncondBrInst::Create(Default, OrigBlock);
SI->eraseFromParent();
// As all the cases have been replaced with a single branch, only keep
// one entry in the PHI nodes.
@@ -521,7 +521,7 @@ void ProcessSwitchInst(SwitchInst *SI,
FixPhis(Default, OrigBlock, nullptr, UnsignedMax);
// Branch to our shiny new if-then stuff...
- BranchInst::Create(SwitchBlock, OrigBlock);
+ UncondBrInst::Create(SwitchBlock, OrigBlock);
// We are now done with the switch instruction, delete it.
BasicBlock *OldDefault = SI->getDefaultDest();
diff --git a/llvm/lib/Transforms/Utils/MatrixUtils.cpp b/llvm/lib/Transforms/Utils/MatrixUtils.cpp
index cc4326dd1a071..b04003b5090fa 100644
--- a/llvm/lib/Transforms/Utils/MatrixUtils.cpp
+++ b/llvm/lib/Transforms/Utils/MatrixUtils.cpp
@@ -40,8 +40,8 @@ BasicBlock *TileInfo::CreateLoop(BasicBlock *Preheader, BasicBlock *Exit,
Header->getParent(), Exit);
Type *I32Ty = Type::getInt64Ty(Ctx);
- BranchInst::Create(Body, Header);
- BranchInst::Create(Latch, Body);
+ UncondBrInst::Create(Body, Header);
+ UncondBrInst::Create(Latch, Body);
PHINode *IV =
PHINode::Create(I32Ty, 2, Name + ".iv", Header->getTerminator()->getIterator());
IV->addIncoming(ConstantInt::get(I32Ty, 0), Preheader);
@@ -49,7 +49,7 @@ BasicBlock *TileInfo::CreateLoop(BasicBlock *Preheader, BasicBlock *Exit,
B.SetInsertPoint(Latch);
Value *Inc = B.CreateAdd(IV, Step, Name + ".step");
Value *Cond = B.CreateICmpNE(Inc, Bound, Name + ".cond");
- auto *BR = BranchInst::Create(Header, Exit, Cond, Latch);
+ auto *BR = B.CreateCondBr(Cond, Header, Exit);
if (!ProfcheckDisableMetadataFixes) {
assert(Step->getZExtValue() != 0 &&
"Expected a non-zero step size. This is chosen by the pass and "
@@ -60,8 +60,8 @@ BasicBlock *TileInfo::CreateLoop(BasicBlock *Preheader, BasicBlock *Exit,
}
IV->addIncoming(Inc, Latch);
- BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
- BasicBlock *Tmp = PreheaderBr->getSuccessor(0);
+ UncondBrInst *PreheaderBr = cast<UncondBrInst>(Preheader->getTerminator());
+ BasicBlock *Tmp = PreheaderBr->getSuccessor();
PreheaderBr->setSuccessor(0, Header);
DTU.applyUpdatesPermissive({
{DominatorTree::Delete, Preheader, Tmp},
diff --git a/llvm/lib/Transforms/Utils/MisExpect.cpp b/llvm/lib/Transforms/Utils/MisExpect.cpp
index 1585e9e509f89..3adecdcaef0ab 100644
--- a/llvm/lib/Transforms/Utils/MisExpect.cpp
+++ b/llvm/lib/Transforms/Utils/MisExpect.cpp
@@ -73,7 +73,7 @@ static uint32_t getMisExpectTolerance(const LLVMContext &Ctx) {
static const Instruction *getInstCondition(const Instruction *I) {
assert(I != nullptr && "MisExpect target Instruction cannot be nullptr");
const Instruction *Ret = nullptr;
- if (auto *B = dyn_cast<BranchInst>(I)) {
+ if (auto *B = dyn_cast<CondBrInst>(I)) {
Ret = dyn_cast<Instruction>(B->getCondition());
}
// TODO: Find a way to resolve condition location for switches
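
Most of the mechanical rewrites in this file and the ones below reduce to a single dispatch pattern; a minimal sketch, assuming the index-free UncondBrInst::getSuccessor() accessor used elsewhere in this patch:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

static void visitTerminator(llvm::BasicBlock &BB) {
  using namespace llvm;
  Instruction *TI = BB.getTerminator();
  // Previously: dyn_cast<BranchInst>(TI) followed by isConditional() /
  // isUnconditional() checks. The split types make those checks implicit.
  if (auto *CBI = dyn_cast<CondBrInst>(TI)) {
    // Always exactly two successors and a condition.
    Value *Cond = CBI->getCondition();
    BasicBlock *TrueBB = CBI->getSuccessor(0);
    BasicBlock *FalseBB = CBI->getSuccessor(1);
    (void)Cond; (void)TrueBB; (void)FalseBB;
  } else if (auto *UBI = dyn_cast<UncondBrInst>(TI)) {
    // Exactly one successor, addressed without an index.
    BasicBlock *Succ = UBI->getSuccessor();
    (void)Succ;
  }
}
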
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index a5fff94b60d18..3d3c70522fb55 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -220,7 +220,7 @@ class PredicateInfoBuilder {
void processAssume(AssumeInst *, BasicBlock *,
SmallVectorImpl<Value *> &OpsToRename);
- void processBranch(BranchInst *, BasicBlock *,
+ void processBranch(CondBrInst *, BasicBlock *,
SmallVectorImpl<Value *> &OpsToRename);
void processSwitch(SwitchInst *, BasicBlock *,
SmallVectorImpl<Value *> &OpsToRename);
@@ -409,7 +409,7 @@ void PredicateInfoBuilder::processAssume(
// Process a block terminating branch, and place relevant operations to be
// renamed into OpsToRename.
void PredicateInfoBuilder::processBranch(
- BranchInst *BI, BasicBlock *BranchBB,
+ CondBrInst *BI, BasicBlock *BranchBB,
SmallVectorImpl<Value *> &OpsToRename) {
BasicBlock *FirstBB = BI->getSuccessor(0);
BasicBlock *SecondBB = BI->getSuccessor(1);
@@ -490,9 +490,7 @@ void PredicateInfoBuilder::buildPredicateInfo() {
if (!DT.isReachableFromEntry(&BB))
continue;
- if (auto *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
- if (!BI->isConditional())
- continue;
+ if (auto *BI = dyn_cast<CondBrInst>(BB.getTerminator())) {
// Can't insert conditional information if they all go to the same place.
if (BI->getSuccessor(0) == BI->getSuccessor(1))
continue;
diff --git a/llvm/lib/Transforms/Utils/ProfileVerify.cpp b/llvm/lib/Transforms/Utils/ProfileVerify.cpp
index ce7bcff9c4e0b..89d2656e60d91 100644
--- a/llvm/lib/Transforms/Utils/ProfileVerify.cpp
+++ b/llvm/lib/Transforms/Utils/ProfileVerify.cpp
@@ -58,7 +58,7 @@ class ProfileInjector {
if (succ_size(&BB) < 2)
return nullptr;
auto *Term = BB.getTerminator();
- return (isa<BranchInst>(Term) || isa<SwitchInst>(Term) ||
+ return (isa<CondBrInst>(Term) || isa<SwitchInst>(Term) ||
isa<IndirectBrInst>(Term) || isa<CallBrInst>(Term))
? Term
: nullptr;
diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index fd315c14df866..9ba1002533997 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -393,8 +393,7 @@ bool SCCPSolver::removeNonFeasibleEdges(BasicBlock *BB, DomTreeUpdater &DTU,
// SCCP can only determine non-feasible edges for br, switch and indirectbr.
Instruction *TI = BB->getTerminator();
- assert((isa<BranchInst>(TI) || isa<SwitchInst>(TI) ||
- isa<IndirectBrInst>(TI)) &&
+ assert((isa<UncondBrInst, CondBrInst, SwitchInst, IndirectBrInst>(TI)) &&
"Terminator must be a br, switch or indirectbr");
if (FeasibleSuccessors.size() == 0) {
@@ -426,7 +425,7 @@ bool SCCPSolver::removeNonFeasibleEdges(BasicBlock *BB, DomTreeUpdater &DTU,
Updates.push_back({DominatorTree::Delete, BB, Succ});
}
- Instruction *BI = BranchInst::Create(OnlyFeasibleSuccessor, BB);
+ Instruction *BI = UncondBrInst::Create(OnlyFeasibleSuccessor, BB);
BI->setDebugLoc(TI->getDebugLoc());
TI->eraseFromParent();
DTU.applyUpdatesPermissive(Updates);
@@ -1251,12 +1250,12 @@ bool SCCPInstVisitor::markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
void SCCPInstVisitor::getFeasibleSuccessors(Instruction &TI,
SmallVectorImpl<bool> &Succs) {
Succs.resize(TI.getNumSuccessors());
- if (auto *BI = dyn_cast<BranchInst>(&TI)) {
- if (BI->isUnconditional()) {
- Succs[0] = true;
- return;
- }
+ if (isa<UncondBrInst>(TI)) {
+ Succs[0] = true;
+ return;
+ }
+ if (auto *BI = dyn_cast<CondBrInst>(&TI)) {
const ValueLatticeElement &BCValue = getValueState(BI->getCondition());
ConstantInt *CI = getConstantInt(BCValue, BI->getCondition()->getType());
if (!CI) {
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 809856fe9f1fd..e0887a61812ee 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -297,7 +297,7 @@ class SimplifyCFGOpt {
bool simplifyIndirectBr(IndirectBrInst *IBI);
bool simplifyUncondBranch(UncondBrInst *BI, IRBuilder<> &Builder);
bool simplifyCondBranch(CondBrInst *BI, IRBuilder<> &Builder);
- bool foldCondBranchOnValueKnownInPredecessor(BranchInst *BI);
+ bool foldCondBranchOnValueKnownInPredecessor(CondBrInst *BI);
bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
IRBuilder<> &Builder);
@@ -309,11 +309,11 @@ class SimplifyCFGOpt {
Instruction *TI, Instruction *I1,
SmallVectorImpl<Instruction *> &OtherSuccTIs,
ArrayRef<BasicBlock *> UniqueSuccessors);
- bool speculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB);
+ bool speculativelyExecuteBB(CondBrInst *BI, BasicBlock *ThenBB);
bool simplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
BasicBlock *TrueBB, BasicBlock *FalseBB,
uint32_t TrueWeight, uint32_t FalseWeight);
- bool simplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
+ bool simplifyBranchOnICmpChain(CondBrInst *BI, IRBuilder<> &Builder,
const DataLayout &DL);
bool simplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select);
bool simplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI);
@@ -480,8 +480,8 @@ static bool dominatesMergePoint(
// If this instruction is defined in a block that contains an unconditional
// branch to BB, then it must be in the 'conditional' part of the "if
// statement". If not, it definitely dominates the region.
- BranchInst *BI = dyn_cast<BranchInst>(PBB->getTerminator());
- if (!BI || BI->isConditional() || BI->getSuccessor(0) != BB)
+ UncondBrInst *BI = dyn_cast<UncondBrInst>(PBB->getTerminator());
+ if (!BI || BI->getSuccessor() != BB)
return true;
// If we have seen this instruction before, don't count it again.
@@ -849,9 +849,8 @@ static void eraseTerminatorAndDCECond(Instruction *TI,
Instruction *Cond = nullptr;
if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
Cond = dyn_cast<Instruction>(SI->getCondition());
- } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- if (BI->isConditional())
- Cond = dyn_cast<Instruction>(BI->getCondition());
+ } else if (CondBrInst *BI = dyn_cast<CondBrInst>(TI)) {
+ Cond = dyn_cast<Instruction>(BI->getCondition());
} else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(TI)) {
Cond = dyn_cast<Instruction>(IBI->getAddress());
}
@@ -870,8 +869,8 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(Instruction *TI) {
// predecessors unless there is only one predecessor.
if (!SI->getParent()->hasNPredecessorsOrMore(128 / SI->getNumSuccessors()))
CV = SI->getCondition();
- } else if (BranchInst *BI = dyn_cast<BranchInst>(TI))
- if (BI->isConditional() && BI->getCondition()->hasOneUse()) {
+ } else if (CondBrInst *BI = dyn_cast<CondBrInst>(TI))
+ if (BI->getCondition()->hasOneUse()) {
if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) {
if (ICI->isEquality() && getConstantInt(ICI->getOperand(1), DL))
CV = ICI->getOperand(0);
@@ -906,7 +905,7 @@ BasicBlock *SimplifyCFGOpt::getValueEqualityComparisonCases(
return SI->getDefaultDest();
}
- BranchInst *BI = cast<BranchInst>(TI);
+ CondBrInst *BI = cast<CondBrInst>(TI);
Value *Cond = BI->getCondition();
ICmpInst::Predicate Pred;
ConstantInt *C;
@@ -1004,7 +1003,7 @@ bool SimplifyCFGOpt::simplifyEqualityComparisonWithOnlyPredecessor(
if (!valuesOverlap(PredCases, ThisCases))
return false;
- if (isa<BranchInst>(TI)) {
+ if (isa<CondBrInst>(TI)) {
// Okay, one of the successors of this condbr is dead. Convert it to an
// uncond br.
assert(ThisCases.size() == 1 && "Branch can only have one case!");
@@ -1153,7 +1152,7 @@ static void getBranchWeights(Instruction *TI,
// If TI is a conditional eq, the default case is the false case,
// and the corresponding branch-weight data is at index 2. We swap the
// default weight to be the first entry.
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (CondBrInst *BI = dyn_cast<CondBrInst>(TI)) {
assert(Weights.size() == 2);
auto *ICI = dyn_cast<ICmpInst>(BI->getCondition());
if (!ICI)
@@ -1436,7 +1435,7 @@ bool SimplifyCFGOpt::performValueComparisonIntoPredecessorFolding(
// or it won't matter if it's hot. :)
InfLoopBlock =
BasicBlock::Create(BB->getContext(), "infloop", BB->getParent());
- BranchInst::Create(InfLoopBlock, InfLoopBlock);
+ UncondBrInst::Create(InfLoopBlock, InfLoopBlock);
if (DTU)
Updates.push_back(
{DominatorTree::Insert, InfLoopBlock, InfLoopBlock});
@@ -1738,12 +1737,12 @@ static bool areIdenticalUpToCommutativity(const Instruction *I1,
/// will be speculated.
/// \param Invert indicates whether FalseBB is speculated. Only used in triangle CFG.
static void hoistConditionalLoadsStores(
- BranchInst *BI,
+ CondBrInst *BI,
SmallVectorImpl<Instruction *> &SpeculatedConditionalLoadsStores,
std::optional<bool> Invert, Instruction *Sel) {
auto &Context = BI->getParent()->getContext();
auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1);
- auto *Cond = BI->getOperand(0);
+ auto *Cond = BI->getCondition();
// Construct the condition if needed.
BasicBlock *BB = BI->getParent();
Value *Mask = nullptr;
@@ -2053,7 +2052,7 @@ bool SimplifyCFGOpt::hoistSuccIdenticalTerminatorToSwitchOrIf(
SmallVectorImpl<Instruction *> &OtherSuccTIs,
ArrayRef<BasicBlock *> UniqueSuccessors) {
- auto *BI = dyn_cast<BranchInst>(TI);
+ auto *BI = dyn_cast<CondBrInst>(TI);
bool Changed = false;
BasicBlock *TIParent = TI->getParent();
@@ -2437,8 +2436,8 @@ static bool sinkCommonCodeFromPredecessors(BasicBlock *BB,
SmallVector<BasicBlock*,4> UnconditionalPreds;
bool HaveNonUnconditionalPredecessors = false;
for (auto *PredBB : predecessors(BB)) {
- auto *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
- if (PredBr && PredBr->isUnconditional())
+ auto *PredBr = dyn_cast<UncondBrInst>(PredBB->getTerminator());
+ if (PredBr)
UnconditionalPreds.push_back(PredBB);
else
HaveNonUnconditionalPredecessors = true;
@@ -2906,7 +2905,7 @@ static void mergeCompatibleInvokesImpl(ArrayRef<InvokeInst *> Invokes,
// to the block with the merged `invoke`.
for (BasicBlock *OrigSuccBB : successors(II->getParent()))
OrigSuccBB->removePredecessor(II->getParent());
- auto *BI = BranchInst::Create(MergedInvoke->getParent(), II->getParent());
+ auto *BI = UncondBrInst::Create(MergedInvoke->getParent(), II->getParent());
// The unconditional branch is part of the replacement for the original
// invoke, so should use its DebugLoc.
BI->setDebugLoc(II->getDebugLoc());
@@ -3145,7 +3144,7 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
return HaveRewritablePHIs;
}
-static bool isProfitableToSpeculate(const BranchInst *BI,
+static bool isProfitableToSpeculate(const CondBrInst *BI,
std::optional<bool> Invert,
const TargetTransformInfo &TTI) {
// If the branch is non-unpredictable, and is predicted to *not* branch to
@@ -3204,7 +3203,7 @@ static bool isProfitableToSpeculate(const BranchInst *BI,
/// \endcode
///
/// \returns true if the conditional block is removed.
-bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
+bool SimplifyCFGOpt::speculativelyExecuteBB(CondBrInst *BI,
BasicBlock *ThenBB) {
if (!Options.SpeculateBlocks)
return false;
@@ -3509,8 +3508,8 @@ static ConstantInt *getKnownValueOnEdge(Value *V, BasicBlock *From,
return nullptr;
// We know the value if the From block branches on it.
- auto *BI = dyn_cast<BranchInst>(From->getTerminator());
- if (BI && BI->isConditional() && BI->getCondition() == V &&
+ auto *BI = dyn_cast<CondBrInst>(From->getTerminator());
+ if (BI && BI->getCondition() == V &&
BI->getSuccessor(0) != BI->getSuccessor(1))
return BI->getSuccessor(0) == To ? ConstantInt::getTrue(BI->getContext())
: ConstantInt::getFalse(BI->getContext());
@@ -3522,7 +3521,7 @@ static ConstantInt *getKnownValueOnEdge(Value *V, BasicBlock *From,
/// value in predecessors (e.g. a phi node in the current block), thread edges
/// from the predecessor to their ultimate destination.
static std::optional<bool>
-foldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
+foldCondBranchOnValueKnownInPredecessorImpl(CondBrInst *BI, DomTreeUpdater *DTU,
const DataLayout &DL,
AssumptionCache *AC) {
SmallMapVector<ConstantInt *, SmallSetVector<BasicBlock *, 2>, 2> KnownValues;
@@ -3680,7 +3679,7 @@ foldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
InsertPt->cloneDebugInfoFrom(BI);
BB->removePredecessor(EdgeBB);
- BranchInst *EdgeBI = cast<BranchInst>(EdgeBB->getTerminator());
+ UncondBrInst *EdgeBI = cast<UncondBrInst>(EdgeBB->getTerminator());
EdgeBI->setSuccessor(0, RealDest);
EdgeBI->setDebugLoc(BI->getDebugLoc());
@@ -3704,7 +3703,7 @@ foldCondBranchOnValueKnownInPredecessorImpl(BranchInst *BI, DomTreeUpdater *DTU,
return false;
}
-bool SimplifyCFGOpt::foldCondBranchOnValueKnownInPredecessor(BranchInst *BI) {
+bool SimplifyCFGOpt::foldCondBranchOnValueKnownInPredecessor(CondBrInst *BI) {
// Note: If BB is a loop header then there is a risk that threading introduces
// a non-canonical loop by moving a back edge. So we avoid this optimization
// for loop headers if NeedCanonicalLoop is set.
@@ -3737,7 +3736,7 @@ static bool foldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
BasicBlock *BB = PN->getParent();
BasicBlock *IfTrue, *IfFalse;
- BranchInst *DomBI = GetIfCondition(BB, IfTrue, IfFalse);
+ CondBrInst *DomBI = GetIfCondition(BB, IfTrue, IfFalse);
if (!DomBI)
return false;
Value *IfCond = DomBI->getCondition();
@@ -3747,10 +3746,10 @@ static bool foldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
BasicBlock *DomBlock = DomBI->getParent();
SmallVector<BasicBlock *, 2> IfBlocks;
- llvm::copy_if(
- PN->blocks(), std::back_inserter(IfBlocks), [](BasicBlock *IfBlock) {
- return cast<BranchInst>(IfBlock->getTerminator())->isUnconditional();
- });
+ llvm::copy_if(PN->blocks(), std::back_inserter(IfBlocks),
+ [](BasicBlock *IfBlock) {
+ return isa<UncondBrInst>(IfBlock->getTerminator());
+ });
assert((IfBlocks.size() == 1 || IfBlocks.size() == 2) &&
"Will have either one or two blocks to speculate.");
@@ -3832,28 +3831,24 @@ static bool foldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
if (!PN)
return true;
- // Return true if at least one of these is a 'not', and another is either
- // a 'not' too, or a constant.
- auto CanHoistNotFromBothValues = [](Value *V0, Value *V1) {
- if (!match(V0, m_Not(m_Value())))
- std::swap(V0, V1);
- auto Invertible = m_CombineOr(m_Not(m_Value()), m_AnyIntegralConstant());
- return match(V0, m_Not(m_Value())) && match(V1, Invertible);
- };
-
// Don't fold i1 branches on PHIs which contain binary operators or
- // (possibly inverted) select form of or/ands, unless one of
- // the incoming values is an 'not' and another one is freely invertible.
- // These can often be turned into switches and other things.
- auto IsBinOpOrAnd = [](Value *V) {
- return match(
- V, m_CombineOr(m_BinOp(), m_c_Select(m_ImmConstant(), m_Value())));
+ // (possibly inverted) select form of or/ands when they take an equality
+ // comparison as an operand.
+ auto IsBinOpOrAndEq = [](Value *V) {
+ CmpPredicate Pred;
+ if (match(V, m_CombineOr(
+ m_CombineOr(
+ m_BinOp(m_Cmp(Pred, m_Value(), m_Value()), m_Value()),
+ m_BinOp(m_Value(), m_Cmp(Pred, m_Value(), m_Value()))),
+ m_c_Select(m_ImmConstant(),
+ m_Cmp(Pred, m_Value(), m_Value()))))) {
+ return CmpInst::isEquality(Pred);
+ }
+ return false;
};
if (PN->getType()->isIntegerTy(1) &&
- (IsBinOpOrAnd(PN->getIncomingValue(0)) ||
- IsBinOpOrAnd(PN->getIncomingValue(1)) || IsBinOpOrAnd(IfCond)) &&
- !CanHoistNotFromBothValues(PN->getIncomingValue(0),
- PN->getIncomingValue(1)))
+ (IsBinOpOrAndEq(PN->getIncomingValue(0)) ||
+ IsBinOpOrAndEq(PN->getIncomingValue(1)) || IsBinOpOrAndEq(IfCond)))
return Changed;
// If all PHI nodes are promotable, check to make sure that all instructions
@@ -3937,7 +3932,7 @@ static Value *createLogicalOp(IRBuilderBase &Builder,
/// Return true if either PBI or BI has branch weight available, and store
/// the weights in {Pred|Succ}{True|False}Weight. If one of PBI and BI does
/// not have branch weight, use 1:1 as its weight.
-static bool extractPredSuccWeights(BranchInst *PBI, BranchInst *BI,
+static bool extractPredSuccWeights(CondBrInst *PBI, CondBrInst *BI,
uint64_t &PredTrueWeight,
uint64_t &PredFalseWeight,
uint64_t &SuccTrueWeight,
@@ -3961,10 +3956,9 @@ static bool extractPredSuccWeights(BranchInst *PBI, BranchInst *BI,
/// that joins the branches' conditions to arrive at the common destination if
/// that would be profitable.
static std::optional<std::tuple<BasicBlock *, Instruction::BinaryOps, bool>>
-shouldFoldCondBranchesToCommonDestination(BranchInst *BI, BranchInst *PBI,
+shouldFoldCondBranchesToCommonDestination(CondBrInst *BI, CondBrInst *PBI,
const TargetTransformInfo *TTI) {
- assert(BI && PBI && BI->isConditional() && PBI->isConditional() &&
- "Both blocks must end with a conditional branches.");
+ assert(BI && PBI && "Both blocks must end with conditional branches.");
assert(is_contained(predecessors(BI->getParent()), PBI->getParent()) &&
"PredBB must be a predecessor of BB.");
@@ -4000,7 +3994,7 @@ shouldFoldCondBranchesToCommonDestination(BranchInst *BI, BranchInst *PBI,
return std::nullopt;
}
-static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI,
+static bool performBranchToCommonDestFolding(CondBrInst *BI, CondBrInst *PBI,
DomTreeUpdater *DTU,
MemorySSAUpdater *MSSAU,
const TargetTransformInfo *TTI) {
@@ -4049,7 +4043,7 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI,
MDWeights.push_back(PredTrueWeight * SuccTrueWeight);
// FalseWeight is FalseWeight for PBI * TotalWeight for BI +
// TrueWeight for PBI * FalseWeight for BI.
- // We assume that total weights of a BranchInst can fit into 32 bits.
+ // We assume that total weights of a CondBrInst can fit into 32 bits.
// Therefore, we will not have overflow using 64-bit arithmetic.
MDWeights.push_back(PredFalseWeight * (SuccFalseWeight + SuccTrueWeight) +
PredTrueWeight * SuccFalseWeight);
@@ -4124,15 +4118,10 @@ static bool isVectorOp(Instruction &I) {
/// If this basic block is simple enough, and if a predecessor branches to us
/// and one of our successors, fold the block into the predecessor and use
/// logical operations to pick the right destination.
-bool llvm::foldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
+bool llvm::foldBranchToCommonDest(CondBrInst *BI, DomTreeUpdater *DTU,
MemorySSAUpdater *MSSAU,
const TargetTransformInfo *TTI,
unsigned BonusInstThreshold) {
- // If this block ends with an unconditional branch,
- // let speculativelyExecuteBB() deal with it.
- if (!BI->isConditional())
- return false;
-
BasicBlock *BB = BI->getParent();
TargetTransformInfo::TargetCostKind CostKind =
BB->getParent()->hasMinSize() ? TargetTransformInfo::TCK_CodeSize
@@ -4151,12 +4140,12 @@ bool llvm::foldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
// With which predecessors will we want to deal with?
SmallVector<BasicBlock *, 8> Preds;
for (BasicBlock *PredBlock : predecessors(BB)) {
- BranchInst *PBI = dyn_cast<BranchInst>(PredBlock->getTerminator());
+ CondBrInst *PBI = dyn_cast<CondBrInst>(PredBlock->getTerminator());
// Check that we have two conditional branches. If there is a PHI node in
// the common successor, verify that the same value flows in from both
// blocks.
- if (!PBI || PBI->isUnconditional() || !safeToMergeTerminators(BI, PBI))
+ if (!PBI || !safeToMergeTerminators(BI, PBI))
continue;
// Determine if the two branches share a common destination.
@@ -4204,7 +4193,7 @@ bool llvm::foldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
if (&I == Cond)
continue;
// Ignore the terminator.
- if (isa<BranchInst>(I))
+ if (isa<UncondBrInst, CondBrInst>(I))
continue;
// I must be safe to execute unconditionally.
if (!isSafeToSpeculativelyExecute(&I))
@@ -4241,7 +4230,7 @@ bool llvm::foldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
// Ok, we have the budget. Perform the transformation.
for (BasicBlock *PredBlock : Preds) {
- auto *PBI = cast<BranchInst>(PredBlock->getTerminator());
+ auto *PBI = cast<CondBrInst>(PredBlock->getTerminator());
return performBranchToCommonDestFolding(BI, PBI, DTU, MSSAU, TTI);
}
return false;
@@ -4421,10 +4410,10 @@ static bool mergeConditionalStoreToAddress(
// OK, we're going to sink the stores to PostBB. The store has to be
// conditional though, so first create the predicate.
- BranchInst *PBranch =
- cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator());
- BranchInst *QBranch =
- cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator());
+ CondBrInst *PBranch =
+ cast<CondBrInst>(PFB->getSinglePredecessor()->getTerminator());
+ CondBrInst *QBranch =
+ cast<CondBrInst>(QFB->getSinglePredecessor()->getTerminator());
Value *PCond = PBranch->getCondition();
Value *QCond = QBranch->getCondition();
@@ -4478,7 +4467,7 @@ static bool mergeConditionalStoreToAddress(
return true;
}
-static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI,
+static bool mergeConditionalStores(CondBrInst *PBI, CondBrInst *QBI,
DomTreeUpdater *DTU, const DataLayout &DL,
const TargetTransformInfo &TTI) {
// The intention here is to find diamonds or triangles (see below) where each
@@ -4590,7 +4579,7 @@ static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI,
/// If the previous block ended with a widenable branch, determine if reusing
/// the target block is profitable and legal. This will have the effect of
/// "widening" PBI, but doesn't require us to reason about hosting safety.
-static bool tryWidenCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
+static bool tryWidenCondBranchToCondBranch(CondBrInst *PBI, CondBrInst *BI,
DomTreeUpdater *DTU) {
// TODO: This can be generalized in two important ways:
// 1) We can allow phi nodes in IfFalseBB and simply reuse all the input
@@ -4647,11 +4636,10 @@ static bool tryWidenCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
/// this function tries to simplify it. We know
/// that PBI and BI are both conditional branches, and BI is in one of the
/// successor blocks of PBI - PBI branches to BI.
-static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
+static bool SimplifyCondBranchToCondBranch(CondBrInst *PBI, CondBrInst *BI,
DomTreeUpdater *DTU,
const DataLayout &DL,
const TargetTransformInfo &TTI) {
- assert(PBI->isConditional() && BI->isConditional());
BasicBlock *BB = BI->getParent();
// If this block ends with a branch instruction, and if there is a
@@ -4762,7 +4750,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
// or it won't matter if it's hot. :)
BasicBlock *InfLoopBlock =
BasicBlock::Create(BB->getContext(), "infloop", BB->getParent());
- BranchInst::Create(InfLoopBlock, InfLoopBlock);
+ UncondBrInst::Create(InfLoopBlock, InfLoopBlock);
if (DTU)
Updates.push_back({DominatorTree::Insert, InfLoopBlock, InfLoopBlock});
OtherDest = InfLoopBlock;
@@ -4917,7 +4905,7 @@ bool SimplifyCFGOpt::simplifyTerminatorOnSelect(Instruction *OldTerm,
} else {
// We found both of the successors we were looking for.
// Create a conditional branch sharing the condition of the select.
- BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB);
+ CondBrInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB);
setBranchWeights(*NewBI, {TrueWeight, FalseWeight},
/*IsExpected=*/false, /*ElideAllZero=*/true);
}
@@ -5221,10 +5209,9 @@ bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpSelectInIt(
return true;
}
-/// The specified branch is a conditional branch.
/// Check to see if it is branching on an or/and chain of icmp instructions, and
/// fold it into a switch instruction if so.
-bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
+bool SimplifyCFGOpt::simplifyBranchOnICmpChain(CondBrInst *BI,
IRBuilder<> &Builder,
const DataLayout &DL) {
Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
@@ -5346,7 +5333,7 @@ bool SimplifyCFGOpt::simplifyBranchOnICmpChain(BranchInst *BI,
X = Builder.CreateAdd(X, ConstantInt::get(CompVal->getType(), Offset));
Value *Cond =
Builder.CreateICmp(Pred, X, ConstantInt::get(CompVal->getType(), RHS));
- BranchInst *NewBI = Builder.CreateCondBr(Cond, EdgeBB, DefaultBB);
+ CondBrInst *NewBI = Builder.CreateCondBr(Cond, EdgeBB, DefaultBB);
if (HasProfile)
setBranchWeights(*NewBI, BranchWeights, /*IsExpected=*/false);
// We don't need to update PHI nodes since we don't add any new edges.
@@ -5657,7 +5644,7 @@ static bool mergeCleanupPad(CleanupReturnInst *RI) {
SuccessorCleanupPad->eraseFromParent();
// Now, we simply replace the cleanupret with a branch to the unwind
// destination.
- BranchInst::Create(UnwindDest, RI->getParent());
+ UncondBrInst::Create(UnwindDest, RI->getParent());
RI->eraseFromParent();
return true;
@@ -5731,16 +5718,20 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
for (BasicBlock *Predecessor : Preds) {
Instruction *TI = Predecessor->getTerminator();
IRBuilder<> Builder(TI);
- if (auto *BI = dyn_cast<BranchInst>(TI)) {
+ if (isa<UncondBrInst>(TI)) {
+ new UnreachableInst(TI->getContext(), TI->getIterator());
+ TI->eraseFromParent();
+ Changed = true;
+ if (DTU)
+ Updates.push_back({DominatorTree::Delete, Predecessor, BB});
+ } else if (auto *BI = dyn_cast<CondBrInst>(TI)) {
// We could either have a proper unconditional branch,
// or a degenerate conditional branch with matching destinations.
- if (all_of(BI->successors(),
- [BB](auto *Successor) { return Successor == BB; })) {
+ if (BI->getSuccessor(0) == BI->getSuccessor(1)) {
new UnreachableInst(TI->getContext(), TI->getIterator());
TI->eraseFromParent();
Changed = true;
} else {
- assert(BI->isConditional() && "Can't get here with an uncond branch.");
Value* Cond = BI->getCondition();
assert(BI->getSuccessor(0) != BI->getSuccessor(1) &&
"The destinations are guaranteed to be different here.");
@@ -6032,7 +6023,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
Constant *Offset = ConstantExpr::getNeg(Min);
Constant *NumCases = ConstantInt::get(Offset->getType(),
Max->getValue() - Min->getValue() + 1);
- BranchInst *NewBI;
+ Instruction *NewBI;
if (NumCases->isOneValue()) {
assert(Max->getValue() == Min->getValue());
Value *Cmp = Builder.CreateICmpEQ(SI->getCondition(), Min);
@@ -6050,7 +6041,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
}
// Update weight for the newly-created conditional branch.
- if (hasBranchWeightMD(*SI) && NewBI->isConditional()) {
+ if (hasBranchWeightMD(*SI) && isa<CondBrInst>(NewBI)) {
SmallVector<uint64_t, 8> Weights;
getBranchWeights(SI, Weights);
if (Weights.size() == 1 + SI->getNumCases()) {
@@ -6085,7 +6076,7 @@ bool SimplifyCFGOpt::turnSwitchRangeIntoICmp(SwitchInst *SI,
++PreviousEdges;
unsigned E = PreviousEdges - 1;
// Remove all incoming values from OtherDest if OtherDest is unreachable.
- if (NewBI->isUnconditional())
+ if (isa<UncondBrInst>(NewBI))
++E;
for (unsigned I = 0; I != E; ++I)
PHI.removeIncomingValue(SI->getParent());
@@ -6231,11 +6222,11 @@ static PHINode *findPHIForConditionForwarding(ConstantInt *CaseValue,
if (!BB->getSinglePredecessor())
return nullptr; // BB must be dominated by the switch.
- BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
- if (!Branch || !Branch->isUnconditional())
+ UncondBrInst *Branch = dyn_cast<UncondBrInst>(BB->getTerminator());
+ if (!Branch)
return nullptr; // Terminator must be unconditional branch.
- BasicBlock *Succ = Branch->getSuccessor(0);
+ BasicBlock *Succ = Branch->getSuccessor();
for (PHINode &PHI : Succ->phis()) {
int Idx = PHI.getBasicBlockIndex(BB);
@@ -7180,7 +7171,7 @@ static bool shouldUseSwitchConditionAsTableIndex(
/// \endcode
/// Jump threading will then eliminate the second if(cond).
static void reuseTableCompare(
- User *PhiUser, BasicBlock *PhiBlock, BranchInst *RangeCheckBranch,
+ User *PhiUser, BasicBlock *PhiBlock, CondBrInst *RangeCheckBranch,
Constant *DefaultValue,
const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values) {
ICmpInst *CmpInst = dyn_cast<ICmpInst>(PhiUser);
@@ -7464,8 +7455,8 @@ static bool simplifySwitchLookup(SwitchInst *SI, IRBuilder<> &Builder,
BasicBlock *LookupBB = BasicBlock::Create(
Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest);
- BranchInst *RangeCheckBranch = nullptr;
- BranchInst *CondBranch = nullptr;
+ CondBrInst *RangeCheckBranch = nullptr;
+ CondBrInst *CondBranch = nullptr;
Builder.SetInsertPoint(SI);
const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize);
@@ -7821,7 +7812,7 @@ static bool simplifySwitchOfPowersOfTwo(SwitchInst *SI, IRBuilder<> &Builder,
SmallVector<uint32_t> Weights;
auto HasWeights =
!ProfcheckDisableMetadataFixes && extractBranchWeights(*SI, Weights);
- auto *BI = BranchInst::Create(SplitBB, DefaultCaseBB, IsPow2, It);
+ auto *BI = CondBrInst::Create(IsPow2, SplitBB, DefaultCaseBB, It);
if (HasWeights && any_of(Weights, not_equal_to(0))) {
// IsPow2 covers a subset of the cases in which we'd go to the default
// label. The other is those powers of 2 that don't appear in the case
@@ -8017,8 +8008,8 @@ struct EqualBBWrapper {
// FIXME: Relax that the terminator is a BranchInst by checking for equality
// on other kinds of terminators. We decide to only support unconditional
// branches for now for compile time reasons.
- auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BI || !BI->isUnconditional())
+ auto *BI = dyn_cast<UncondBrInst>(BB->getTerminator());
+ if (!BI)
return false;
// Avoid blocks that are "address-taken" (blockaddress) or have unusual
@@ -8049,21 +8040,17 @@ template <> struct llvm::DenseMapInfo<const EqualBBWrapper *> {
}
static unsigned getHashValue(const EqualBBWrapper *EBW) {
BasicBlock *BB = EBW->BB;
- BranchInst *BI = cast<BranchInst>(BB->getTerminator());
- assert(BI->isUnconditional() &&
- "Only supporting unconditional branches for now");
- assert(BI->getNumSuccessors() == 1 &&
- "Expected unconditional branches to have one successor");
+ UncondBrInst *BI = cast<UncondBrInst>(BB->getTerminator());
assert(BB->size() == 1 && "Expected just a single branch in the BB");
- // Since we assume the BB is just a single BranchInst with a single
+ // Since we assume the BB is just a single UncondBrInst with a single
// successor, we hash as the BB and the incoming Values of its successor
// PHIs. Initially, we tried to just use the successor BB as the hash, but
// including the incoming PHI values leads to better performance.
// We also tried to build a map from BB -> Succs.IncomingValues ahead of
// time and passing it in EqualBBWrapper, but this slowed down the average
// compile time without having any impact on the worst case compile time.
- BasicBlock *Succ = BI->getSuccessor(0);
+ BasicBlock *Succ = BI->getSuccessor();
auto PhiValsForBB = map_range(Succ->phis(), [&](PHINode &Phi) {
return (*EBW->PhiPredIVs)[&Phi][BB];
});
@@ -8085,15 +8072,13 @@ template <> struct llvm::DenseMapInfo<const EqualBBWrapper *> {
// B.size() here, and we need to check more than just the BranchInsts
// for equality.
- BranchInst *ABI = cast<BranchInst>(A->getTerminator());
- BranchInst *BBI = cast<BranchInst>(B->getTerminator());
- assert(ABI->isUnconditional() && BBI->isUnconditional() &&
- "Only supporting unconditional branches for now");
- if (ABI->getSuccessor(0) != BBI->getSuccessor(0))
+ UncondBrInst *ABI = cast<UncondBrInst>(A->getTerminator());
+ UncondBrInst *BBI = cast<UncondBrInst>(B->getTerminator());
+ if (ABI->getSuccessor() != BBI->getSuccessor())
return false;
// Need to check that PHIs in successor have matching values.
- BasicBlock *Succ = ABI->getSuccessor(0);
+ BasicBlock *Succ = ABI->getSuccessor();
auto IfPhiIVMatch = [&](PHINode &Phi) {
// Replace O(|Pred|) Phi.getIncomingValueForBlock with this O(1) hashmap
// query.
@@ -8359,7 +8344,7 @@ bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
if (IBI->getNumDestinations() == 1) {
// If the indirectbr has one successor, change it to a direct branch.
- BranchInst::Create(IBI->getDestination(0), IBI->getIterator());
+ UncondBrInst::Create(IBI->getDestination(0), IBI->getIterator());
eraseTerminatorAndDCECond(IBI);
return true;
}
@@ -8397,7 +8382,7 @@ bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
/// TODO - This transformation could remove entries from a phi in the target
/// block when the inputs in the phi are the same for the two blocks being
/// merged. In some cases, this could result in removal of the PHI entirely.
-static bool tryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
+static bool tryToMergeLandingPad(LandingPadInst *LPad, UncondBrInst *BI,
BasicBlock *BB, DomTreeUpdater *DTU) {
auto Succ = BB->getUniqueSuccessor();
assert(Succ);
@@ -8414,7 +8399,7 @@ static bool tryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
if (!LPad2 || !LPad2->isIdenticalTo(LPad))
continue;
++I;
- BranchInst *BI2 = dyn_cast<BranchInst>(I);
+ UncondBrInst *BI2 = dyn_cast<UncondBrInst>(I);
if (!BI2 || !BI2->isIdenticalTo(BI))
continue;
@@ -8495,14 +8480,6 @@ bool SimplifyCFGOpt::simplifyUncondBranch(UncondBrInst *BI,
return true;
}
- // If this basic block is ONLY a compare and a branch, and if a predecessor
- // branches to us and our successor, fold the comparison into the
- // predecessor and use logical operations to update the incoming value
- // for PHI nodes in common successor.
- if (Options.SpeculateBlocks &&
- foldBranchToCommonDest(BI, DTU, /*MSSAU=*/nullptr, &TTI,
- Options.BonusInstThreshold))
- return requestResimplify();
return false;
}
@@ -8537,24 +8514,24 @@ static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) {
/// bb4:
/// ...
/// NOTE: %cond2 always dominates the terminator of bb0.
-static bool mergeNestedCondBranch(BranchInst *BI, DomTreeUpdater *DTU) {
+static bool mergeNestedCondBranch(CondBrInst *BI, DomTreeUpdater *DTU) {
BasicBlock *BB = BI->getParent();
BasicBlock *BB1 = BI->getSuccessor(0);
BasicBlock *BB2 = BI->getSuccessor(1);
- auto IsSimpleSuccessor = [BB](BasicBlock *Succ, BranchInst *&SuccBI) {
+ auto IsSimpleSuccessor = [BB](BasicBlock *Succ, CondBrInst *&SuccBI) {
if (Succ == BB)
return false;
if (&Succ->front() != Succ->getTerminator())
return false;
- SuccBI = dyn_cast<BranchInst>(Succ->getTerminator());
- if (!SuccBI || !SuccBI->isConditional())
+ SuccBI = dyn_cast<CondBrInst>(Succ->getTerminator());
+ if (!SuccBI)
return false;
BasicBlock *Succ1 = SuccBI->getSuccessor(0);
BasicBlock *Succ2 = SuccBI->getSuccessor(1);
return Succ1 != Succ && Succ2 != Succ && Succ1 != BB && Succ2 != BB &&
!isa<PHINode>(Succ1->front()) && !isa<PHINode>(Succ2->front());
};
- BranchInst *BB1BI, *BB2BI;
+ CondBrInst *BB1BI, *BB2BI;
if (!IsSimpleSuccessor(BB1, BB1BI) || !IsSimpleSuccessor(BB2, BB2BI))
return false;
@@ -8728,16 +8705,16 @@ bool SimplifyCFGOpt::simplifyCondBranch(CondBrInst *BI, IRBuilder<> &Builder) {
// Scan predecessor blocks for conditional branches.
for (BasicBlock *Pred : predecessors(BB))
- if (BranchInst *PBI = dyn_cast<BranchInst>(Pred->getTerminator()))
- if (PBI != BI && PBI->isConditional())
+ if (CondBrInst *PBI = dyn_cast<CondBrInst>(Pred->getTerminator()))
+ if (PBI != BI)
if (SimplifyCondBranchToCondBranch(PBI, BI, DTU, DL, TTI))
return requestResimplify();
// Look for diamond patterns.
if (MergeCondStores)
if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB))
- if (BranchInst *PBI = dyn_cast<BranchInst>(PrevBB->getTerminator()))
- if (PBI != BI && PBI->isConditional())
+ if (CondBrInst *PBI = dyn_cast<CondBrInst>(PrevBB->getTerminator()))
+ if (PBI != BI)
if (mergeConditionalStores(PBI, BI, DTU, DL, TTI))
return requestResimplify();
@@ -8896,26 +8873,28 @@ static bool removeUndefIntroducingPredecessor(BasicBlock *BB,
BasicBlock *Predecessor = PHI.getIncomingBlock(i);
Instruction *T = Predecessor->getTerminator();
IRBuilder<> Builder(T);
- if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
+ if (isa<UncondBrInst>(T)) {
BB->removePredecessor(Predecessor);
- // Turn unconditional branches into unreachables and remove the dead
- // destination from conditional branches.
- if (BI->isUnconditional())
- Builder.CreateUnreachable();
- else {
- // Preserve guarding condition in assume, because it might not be
- // inferrable from any dominating condition.
- Value *Cond = BI->getCondition();
- CallInst *Assumption;
- if (BI->getSuccessor(0) == BB)
- Assumption = Builder.CreateAssumption(Builder.CreateNot(Cond));
- else
- Assumption = Builder.CreateAssumption(Cond);
- if (AC)
- AC->registerAssumption(cast<AssumeInst>(Assumption));
- Builder.CreateBr(BI->getSuccessor(0) == BB ? BI->getSuccessor(1)
- : BI->getSuccessor(0));
- }
+ // Turn unconditional branches into unreachables.
+ Builder.CreateUnreachable();
+ T->eraseFromParent();
+ if (DTU)
+ DTU->applyUpdates({{DominatorTree::Delete, Predecessor, BB}});
+ return true;
+ } else if (CondBrInst *BI = dyn_cast<CondBrInst>(T)) {
+ BB->removePredecessor(Predecessor);
+ // Preserve guarding condition in assume, because it might not be
+ // inferrable from any dominating condition.
+ Value *Cond = BI->getCondition();
+ CallInst *Assumption;
+ if (BI->getSuccessor(0) == BB)
+ Assumption = Builder.CreateAssumption(Builder.CreateNot(Cond));
+ else
+ Assumption = Builder.CreateAssumption(Cond);
+ if (AC)
+ AC->registerAssumption(cast<AssumeInst>(Assumption));
+ Builder.CreateBr(BI->getSuccessor(0) == BB ? BI->getSuccessor(1)
+ : BI->getSuccessor(0));
BI->eraseFromParent();
if (DTU)
DTU->applyUpdates({{DominatorTree::Delete, Predecessor, BB}});
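
(Illustrative reduction of the conditional-predecessor path above — a sketch only, assuming the proposed CondBrInst class; the helper name is hypothetical. The point is that the guarding condition is preserved as an llvm.assume before the edge to the dead block is dropped:)

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Builder is assumed to be positioned at BI. The real code additionally calls
// BB->removePredecessor() and updates the DomTree via DTU.
static void redirectAwayFromDeadSucc(CondBrInst *BI, BasicBlock *DeadBB,
                                     IRBuilder<> &Builder) {
  Value *Cond = BI->getCondition();
  // Record the fact implied by never reaching DeadBB via this branch.
  if (BI->getSuccessor(0) == DeadBB)
    Builder.CreateAssumption(Builder.CreateNot(Cond));
  else
    Builder.CreateAssumption(Cond);
  // Fall through unconditionally to the surviving successor.
  Builder.CreateBr(BI->getSuccessor(0) == DeadBB ? BI->getSuccessor(1)
                                                 : BI->getSuccessor(0));
  BI->eraseFromParent();
}
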
diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index 792c1ac31c2ba..9e0bbe9cf3b62 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -2231,22 +2231,17 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
auto *TI = BB->getTerminator();
UpdateRangeFromGuards(TI);
- auto *BI = dyn_cast<BranchInst>(TI);
- if (!BI || !BI->isConditional())
+ auto *BI = dyn_cast<CondBrInst>(TI);
+ if (!BI)
continue;
auto *TrueSuccessor = BI->getSuccessor(0);
auto *FalseSuccessor = BI->getSuccessor(1);
- auto DominatesNarrowUser = [this, NarrowUser] (BasicBlockEdge BBE) {
- return BBE.isSingleEdge() &&
- DT->dominates(BBE, NarrowUser->getParent());
- };
-
- if (DominatesNarrowUser(BasicBlockEdge(BB, TrueSuccessor)))
+ if (DT->dominates(BasicBlockEdge(BB, TrueSuccessor), NarrowUserBB))
UpdateRangeFromCondition(BI->getCondition(), /*TrueDest=*/true);
- if (DominatesNarrowUser(BasicBlockEdge(BB, FalseSuccessor)))
+ if (DT->dominates(BasicBlockEdge(BB, FalseSuccessor), NarrowUserBB))
UpdateRangeFromCondition(BI->getCondition(), /*TrueDest=*/false);
}
}
diff --git a/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
index 68d0684a64d68..17fa30e436c2f 100644
--- a/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
+++ b/llvm/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
@@ -36,7 +36,7 @@ bool unifyUnreachableBlocks(Function &F) {
for (BasicBlock *BB : UnreachableBlocks) {
BB->back().eraseFromParent(); // Remove the unreachable inst.
- BranchInst::Create(UnreachableBlock, BB);
+ UncondBrInst::Create(UnreachableBlock, BB);
}
return true;
@@ -78,7 +78,7 @@ bool unifyReturnBlocks(Function &F) {
PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
BB->back().eraseFromParent(); // Remove the return insn
- BranchInst::Create(NewRetBlock, BB);
+ UncondBrInst::Create(NewRetBlock, BB);
}
return true;
diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
index 21baf4ec1d032..6947cb4f92723 100644
--- a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
+++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
@@ -167,12 +167,18 @@ static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
for (unsigned I = 0; I < ExitingBlocks.size(); ++I) {
BasicBlock *BB = ExitingBlocks[I];
- if (BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator())) {
+ if (UncondBrInst *Branch = dyn_cast<UncondBrInst>(BB->getTerminator())) {
BasicBlock *Succ0 = Branch->getSuccessor(0);
Succ0 = L->contains(Succ0) ? nullptr : Succ0;
+ CHub.addBranch(BB, Succ0);
- BasicBlock *Succ1 =
- Branch->isUnconditional() ? nullptr : Branch->getSuccessor(1);
+      LLVM_DEBUG(dbgs() << "Added exiting branch: " << printBasicBlock(BB)
+ << " -> " << printBasicBlock(Succ0) << '\n');
+ } else if (CondBrInst *Branch = dyn_cast<CondBrInst>(BB->getTerminator())) {
+ BasicBlock *Succ0 = Branch->getSuccessor(0);
+ Succ0 = L->contains(Succ0) ? nullptr : Succ0;
+
+ BasicBlock *Succ1 = Branch->getSuccessor(1);
Succ1 = L->contains(Succ1) ? nullptr : Succ1;
CHub.addBranch(BB, Succ0, Succ1);
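
(A sketch of the exit-collection convention used above — illustrative only; it assumes the proposed UncondBrInst/CondBrInst classes and the existing ControlFlowHub. Successors that stay inside the loop are passed as nullptr so the hub only reroutes true exit edges:)

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/ControlFlowUtils.h"
using namespace llvm;

static void collectExitingBranch(ControlFlowHub &CHub, Loop *L,
                                 BasicBlock *BB) {
  // Map in-loop successors to nullptr; only exit edges are registered.
  auto MaskInLoop = [L](BasicBlock *Succ) -> BasicBlock * {
    return L->contains(Succ) ? nullptr : Succ;
  };
  Instruction *T = BB->getTerminator();
  if (auto *UBI = dyn_cast<UncondBrInst>(T))
    CHub.addBranch(BB, MaskInLoop(UBI->getSuccessor(0)));
  else if (auto *CBI = dyn_cast<CondBrInst>(T))
    CHub.addBranch(BB, MaskInLoop(CBI->getSuccessor(0)),
                   MaskInLoop(CBI->getSuccessor(1)));
}
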
diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
index 9bc82bf3dbb21..4ed6c24961932 100644
--- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp
@@ -499,8 +499,7 @@ Value *LoopIdiomVectorize::createMaskedFindMismatch(
Value *PFalse = Builder.CreateVectorSplat(PredVTy->getElementCount(),
Builder.getInt1(false));
- BranchInst *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
- Builder.Insert(JumpToVectorLoop);
+ Builder.CreateBr(VectorLoopStartBlock);
DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock,
VectorLoopStartBlock}});
@@ -529,9 +528,8 @@ Value *LoopIdiomVectorize::createMaskedFindMismatch(
Value *VectorMatchCmp = Builder.CreateICmpNE(VectorLhsLoad, VectorRhsLoad);
VectorMatchCmp = Builder.CreateSelect(LoopPred, VectorMatchCmp, PFalse);
Value *VectorMatchHasActiveLanes = Builder.CreateOrReduce(VectorMatchCmp);
- BranchInst *VectorEarlyExit = BranchInst::Create(
- VectorLoopMismatchBlock, VectorLoopIncBlock, VectorMatchHasActiveLanes);
- Builder.Insert(VectorEarlyExit);
+ Builder.CreateCondBr(VectorMatchHasActiveLanes, VectorLoopMismatchBlock,
+ VectorLoopIncBlock);
DTU.applyUpdates(
{{DominatorTree::Insert, VectorLoopStartBlock, VectorLoopMismatchBlock},
@@ -552,9 +550,7 @@ Value *LoopIdiomVectorize::createMaskedFindMismatch(
Value *PredHasActiveLanes =
Builder.CreateExtractElement(NewPred, uint64_t(0));
- BranchInst *VectorLoopBranchBack =
- BranchInst::Create(VectorLoopStartBlock, EndBlock, PredHasActiveLanes);
- Builder.Insert(VectorLoopBranchBack);
+ Builder.CreateCondBr(PredHasActiveLanes, VectorLoopStartBlock, EndBlock);
DTU.applyUpdates(
{{DominatorTree::Insert, VectorLoopIncBlock, VectorLoopStartBlock},
@@ -590,7 +586,7 @@ Value *LoopIdiomVectorize::createPredicatedFindMismatch(
Value *PtrA = GEPA->getPointerOperand();
Value *PtrB = GEPB->getPointerOperand();
- auto *JumpToVectorLoop = BranchInst::Create(VectorLoopStartBlock);
+ auto *JumpToVectorLoop = UncondBrInst::Create(VectorLoopStartBlock);
Builder.Insert(JumpToVectorLoop);
DTU.applyUpdates({{DominatorTree::Insert, VectorLoopPreheaderBlock,
@@ -635,8 +631,8 @@ Value *LoopIdiomVectorize::createPredicatedFindMismatch(
{VectorMatchCmp, /*ZeroIsPoison=*/Builder.getInt1(false), AllTrueMask,
VL});
Value *MismatchFound = Builder.CreateICmpNE(CTZ, VL);
- auto *VectorEarlyExit = BranchInst::Create(VectorLoopMismatchBlock,
- VectorLoopIncBlock, MismatchFound);
+ auto *VectorEarlyExit = CondBrInst::Create(
+ MismatchFound, VectorLoopMismatchBlock, VectorLoopIncBlock);
Builder.Insert(VectorEarlyExit);
DTU.applyUpdates(
@@ -654,7 +650,7 @@ Value *LoopIdiomVectorize::createPredicatedFindMismatch(
VectorIndexPhi->addIncoming(NewVectorIndexPhi, VectorLoopIncBlock);
Value *ExitCond = Builder.CreateICmpNE(NewVectorIndexPhi, ExtEnd);
auto *VectorLoopBranchBack =
- BranchInst::Create(VectorLoopStartBlock, EndBlock, ExitCond);
+ CondBrInst::Create(ExitCond, VectorLoopStartBlock, EndBlock);
Builder.Insert(VectorLoopBranchBack);
DTU.applyUpdates(
@@ -686,7 +682,7 @@ Value *LoopIdiomVectorize::expandFindMismatch(
// Get the arguments and types for the intrinsic.
BasicBlock *Preheader = CurLoop->getLoopPreheader();
- BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
+ Instruction *PHBranch = Preheader->getTerminator();
LLVMContext &Ctx = PHBranch->getContext();
Type *LoadType = Type::getInt8Ty(Ctx);
Type *ResType = Builder.getInt32Ty();
@@ -775,8 +771,8 @@ Value *LoopIdiomVectorize::expandFindMismatch(
// This check doesn't really cost us very much.
Value *LimitCheck = Builder.CreateICmpULE(Start, MaxLen);
- BranchInst *MinItCheckBr =
- BranchInst::Create(MemCheckBlock, LoopPreHeaderBlock, LimitCheck);
+ CondBrInst *MinItCheckBr =
+ CondBrInst::Create(LimitCheck, MemCheckBlock, LoopPreHeaderBlock);
MinItCheckBr->setMetadata(
LLVMContext::MD_prof,
MDBuilder(MinItCheckBr->getContext()).createBranchWeights(99, 1));
@@ -821,8 +817,8 @@ Value *LoopIdiomVectorize::expandFindMismatch(
Value *RhsPageCmp = Builder.CreateICmpNE(RhsStartPage, RhsEndPage);
Value *CombinedPageCmp = Builder.CreateOr(LhsPageCmp, RhsPageCmp);
- BranchInst *CombinedPageCmpCmpBr = BranchInst::Create(
- LoopPreHeaderBlock, VectorLoopPreheaderBlock, CombinedPageCmp);
+ CondBrInst *CombinedPageCmpCmpBr = CondBrInst::Create(
+ CombinedPageCmp, LoopPreHeaderBlock, VectorLoopPreheaderBlock);
CombinedPageCmpCmpBr->setMetadata(
LLVMContext::MD_prof, MDBuilder(CombinedPageCmpCmpBr->getContext())
.createBranchWeights(10, 90));
@@ -854,14 +850,14 @@ Value *LoopIdiomVectorize::expandFindMismatch(
break;
}
- Builder.Insert(BranchInst::Create(EndBlock));
+ Builder.CreateBr(EndBlock);
DTU.applyUpdates(
{{DominatorTree::Insert, VectorLoopMismatchBlock, EndBlock}});
// Generate code for scalar loop.
Builder.SetInsertPoint(LoopPreHeaderBlock);
- Builder.Insert(BranchInst::Create(LoopStartBlock));
+ Builder.CreateBr(LoopStartBlock);
DTU.applyUpdates(
{{DominatorTree::Insert, LoopPreHeaderBlock, LoopStartBlock}});
@@ -884,8 +880,7 @@ Value *LoopIdiomVectorize::expandFindMismatch(
Value *MatchCmp = Builder.CreateICmpEQ(LhsLoad, RhsLoad);
// If we have a mismatch then exit the loop ...
- BranchInst *MatchCmpBr = BranchInst::Create(LoopIncBlock, EndBlock, MatchCmp);
- Builder.Insert(MatchCmpBr);
+ Builder.CreateCondBr(MatchCmp, LoopIncBlock, EndBlock);
DTU.applyUpdates({{DominatorTree::Insert, LoopStartBlock, LoopIncBlock},
{DominatorTree::Insert, LoopStartBlock, EndBlock}});
@@ -897,8 +892,7 @@ Value *LoopIdiomVectorize::expandFindMismatch(
/*HasNSW=*/Index->hasNoSignedWrap());
IndexPhi->addIncoming(PhiInc, LoopIncBlock);
Value *IVCmp = Builder.CreateICmpEQ(PhiInc, MaxLen);
- BranchInst *IVCmpBr = BranchInst::Create(EndBlock, LoopStartBlock, IVCmp);
- Builder.Insert(IVCmpBr);
+ Builder.CreateCondBr(IVCmp, EndBlock, LoopStartBlock);
DTU.applyUpdates({{DominatorTree::Insert, LoopIncBlock, EndBlock},
{DominatorTree::Insert, LoopIncBlock, LoopStartBlock}});
@@ -941,7 +935,7 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
// Insert the byte compare code at the end of the preheader block
BasicBlock *Preheader = CurLoop->getLoopPreheader();
BasicBlock *Header = CurLoop->getHeader();
- BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
+ UncondBrInst *PHBranch = cast<UncondBrInst>(Preheader->getTerminator());
IRBuilder<> Builder(PHBranch);
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
@@ -958,9 +952,6 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA,
assert(IndPhi->hasOneUse() && "Index phi node has more than one use!");
Index->replaceAllUsesWith(ByteCmpRes);
- assert(PHBranch->isUnconditional() &&
- "Expected preheader to terminate with an unconditional branch.");
-
// If no mismatch was found, we can jump to the end block. Create a
// new basic block for the compare instruction.
auto *CmpBB = BasicBlock::Create(Preheader->getContext(), "byte.compare",
@@ -1329,7 +1320,7 @@ Value *LoopIdiomVectorize::expandFindFirstByte(
Value *CombinedPageCmp =
Builder.CreateOr(SearchPageCmp, NeedlePageCmp, "combined_page_cmp");
- BranchInst *CombinedPageBr = Builder.CreateCondBr(CombinedPageCmp, SPH, BB1);
+ CondBrInst *CombinedPageBr = Builder.CreateCondBr(CombinedPageCmp, SPH, BB1);
CombinedPageBr->setMetadata(LLVMContext::MD_prof,
MDBuilder(Ctx).createBranchWeights(10, 90));
DTU.applyUpdates(
@@ -1450,7 +1441,7 @@ void LoopIdiomVectorize::transformFindFirstByte(
Value *NeedleStart, Value *NeedleEnd) {
// Insert the find first byte code at the end of the preheader block.
BasicBlock *Preheader = CurLoop->getLoopPreheader();
- BranchInst *PHBranch = cast<BranchInst>(Preheader->getTerminator());
+ UncondBrInst *PHBranch = cast<UncondBrInst>(Preheader->getTerminator());
IRBuilder<> Builder(PHBranch);
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc());
@@ -1458,9 +1449,6 @@ void LoopIdiomVectorize::transformFindFirstByte(
expandFindFirstByte(Builder, DTU, VF, CharTy, IndPhi, ExitSucc, ExitFail,
SearchStart, SearchEnd, NeedleStart, NeedleEnd);
- assert(PHBranch->isUnconditional() &&
- "Expected preheader to terminate with an unconditional branch.");
-
if (VerifyLoops && CurLoop->getParentLoop()) {
CurLoop->getParentLoop()->verifyLoop();
if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI))
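
(Worth calling out from the hunks above: the factory argument order changes. BranchInst::Create(IfTrue, IfFalse, Cond) becomes CondBrInst::Create(Cond, IfTrue, IfFalse), and unconditional creation moves to UncondBrInst::Create(Dest, InsertAtEnd) or simply IRBuilder::CreateBr. A minimal sketch assuming the proposed classes; block and value names are placeholders:)

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static void emitGuardedBlock(IRBuilder<> &B, Value *Cond, BasicBlock *Then,
                             BasicBlock *Else, BasicBlock *Merge) {
  // Conditional branch: condition first, then the taken/not-taken blocks.
  B.Insert(CondBrInst::Create(Cond, Then, Else));
  // Unconditional branches are a separate class; appending at the end of a
  // block still takes the destination first.
  UncondBrInst::Create(Merge, Then);
  B.SetInsertPoint(Else);
  B.CreateBr(Merge);
}
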
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index ea18cddd3df73..be0288b699d59 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -375,8 +375,8 @@ static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
// 2.
BasicBlock *Latch = Lp->getLoopLatch();
- auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
- if (!LatchBr || LatchBr->isUnconditional()) {
+ auto *LatchBr = dyn_cast<CondBrInst>(Latch->getTerminator());
+ if (!LatchBr) {
LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
return false;
}
@@ -638,10 +638,10 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
for (BasicBlock *BB : TheLoop->blocks()) {
- // Check whether the BB terminator is a BranchInst. Any other terminator is
+ // Check whether the BB terminator is a branch. Any other terminator is
// not supported yet.
- auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
- if (!Br) {
+ Instruction *Term = BB->getTerminator();
+ if (!isa<UncondBrInst, CondBrInst>(Term)) {
reportVectorizationFailure("Unsupported basic block terminator",
"loop control flow is not understood by vectorizer",
"CFGNotUnderstood", ORE, TheLoop);
@@ -651,14 +651,14 @@ bool LoopVectorizationLegality::canVectorizeOuterLoop() {
return false;
}
- // Check whether the BranchInst is a supported one. Only unconditional
+ // Check whether the branch is a supported one. Only unconditional
// branches, conditional branches with an outer loop invariant condition or
// backedges are supported.
// FIXME: We skip these checks when VPlan predication is enabled as we
// want to allow divergent branches. This whole check will be removed
// once VPlan predication is on by default.
- if (Br && Br->isConditional() &&
- !TheLoop->isLoopInvariant(Br->getCondition()) &&
+ auto *Br = dyn_cast<CondBrInst>(Term);
+ if (Br && !TheLoop->isLoopInvariant(Br->getCondition()) &&
!LI->isLoopHeader(Br->getSuccessor(0)) &&
!LI->isLoopHeader(Br->getSuccessor(1))) {
reportVectorizationFailure("Unsupported conditional branch",
@@ -1594,7 +1594,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
TheLoop, BB->getTerminator());
return false;
}
- } else if (!isa<BranchInst>(BB->getTerminator())) {
+ } else if (!isa<UncondBrInst, CondBrInst>(BB->getTerminator())) {
reportVectorizationFailure("Loop contains an unsupported terminator",
"LoopContainsUnsupportedTerminator", ORE,
TheLoop, BB->getTerminator());
@@ -1654,11 +1654,11 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
return false;
}
- // The latch must be terminated by a BranchInst.
+ // The latch must be terminated by a branch.
BasicBlock *Latch = Lp->getLoopLatch();
- if (Latch && !isa<BranchInst>(Latch->getTerminator())) {
+ if (Latch && !isa<UncondBrInst, CondBrInst>(Latch->getTerminator())) {
reportVectorizationFailure(
- "The loop latch terminator is not a BranchInst",
+ "The loop latch terminator is not a UncondBrInst/CondBrInst",
"loop control flow is not understood by vectorizer", "CFGNotUnderstood",
ORE, TheLoop);
if (DoExtraAnalysis)
@@ -1860,7 +1860,7 @@ bool LoopVectorizationLegality::canUncountableExitConditionLoadBeMoved(
// FIXME: We're insisting on a single use for now, because otherwise we will
// need to make PHI nodes for other users. That can be done once the initial
// transform code lands.
- auto *Br = cast<BranchInst>(ExitingBlock->getTerminator());
+ auto *Br = cast<CondBrInst>(ExitingBlock->getTerminator());
using namespace llvm::PatternMatch;
Instruction *L = nullptr;
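
(Also visible above: where the old code only needed to know "is this a branch at all", the checks now spell out both classes via the variadic form of isa<>. Illustrative sketch, assuming the proposed classes:)

static bool isSupportedTerminator(const Instruction *T) {
  // Equivalent to the pre-patch isa<BranchInst>(T).
  return isa<UncondBrInst, CondBrInst>(T);
}
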
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6b8f12891cd5e..ac9b790c739bf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2423,20 +2423,6 @@ BasicBlock *InnerLoopVectorizer::createScalarPreheader(StringRef Prefix) {
Twine(Prefix) + "scalar.ph");
}
-/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
-/// expansion results.
-static Value *getExpandedStep(const InductionDescriptor &ID,
- const SCEV2ValueTy &ExpandedSCEVs) {
- const SCEV *Step = ID.getStep();
- if (auto *C = dyn_cast<SCEVConstant>(Step))
- return C->getValue();
- if (auto *U = dyn_cast<SCEVUnknown>(Step))
- return U->getValue();
- Value *V = ExpandedSCEVs.lookup(Step);
- assert(V && "SCEV must be expanded at this point");
- return V;
-}
-
/// Knowing that loop \p L executes a single vector iteration, add instructions
/// that will get simplified and thus should not have any cost to \p
/// InstsToIgnore.
@@ -2859,7 +2845,7 @@ bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
// context sensitive reasoning for isSafeToSpeculativelyExecute.
if (isSafeToSpeculativelyExecute(I) ||
(isa<LoadInst, StoreInst, CallInst>(I) && !Legal->isMaskRequired(I)) ||
- isa<BranchInst, SwitchInst, PHINode, AllocaInst>(I))
+ isa<UncondBrInst, CondBrInst, SwitchInst, PHINode, AllocaInst>(I))
return false;
// If the instruction was executed conditionally in the original scalar loop,
@@ -6171,8 +6157,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// a single branch controlling the loop, so there is no extra overhead from
// scalarization.
bool ScalarPredicatedBB = false;
- BranchInst *BI = cast<BranchInst>(I);
- if (VF.isVector() && BI->isConditional() &&
+ CondBrInst *BI = dyn_cast<CondBrInst>(I);
+ if (VF.isVector() && BI &&
(PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))) &&
BI->getParent() != TheLoop->getLoopLatch())
@@ -6577,10 +6563,8 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// Queue branches for analysis. They are dead, if their successors only
// contain dead instructions.
- if (auto *Br = dyn_cast<BranchInst>(&I)) {
- if (Br->isConditional())
- DeadOps.push_back(&I);
- }
+ if (isa<CondBrInst>(&I))
+ DeadOps.push_back(&I);
}
// Mark ops feeding interleave group members as free, if they are only used
@@ -6608,14 +6592,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
auto IsEmptyBlock = [this](BasicBlock *BB) {
return all_of(*BB, [this](Instruction &I) {
return ValuesToIgnore.contains(&I) || VecValuesToIgnore.contains(&I) ||
- (isa<BranchInst>(&I) && !cast<BranchInst>(&I)->isConditional());
+ isa<UncondBrInst>(&I);
});
};
for (unsigned I = 0; I != DeadOps.size(); ++I) {
auto *Op = dyn_cast<Instruction>(DeadOps[I]);
// Check if the branch should be considered dead.
- if (auto *Br = dyn_cast_or_null<BranchInst>(Op)) {
+ if (auto *Br = dyn_cast_or_null<CondBrInst>(Op)) {
BasicBlock *ThenBB = Br->getSuccessor(0);
BasicBlock *ElseBB = Br->getSuccessor(1);
// Don't considers branches leaving the loop for simplification.
@@ -6961,7 +6945,7 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
SetVector<Instruction *> ExitInstrs;
// Collect all exit conditions.
for (BasicBlock *EB : Exiting) {
- auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
+ auto *Term = dyn_cast<CondBrInst>(EB->getTerminator());
if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
continue;
if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
@@ -7342,73 +7326,6 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
return BestFactor;
}
-// If \p EpiResumePhiR is resume VPPhi for a reduction when vectorizing the
-// epilog loop, fix the reduction's scalar PHI node by adding the incoming value
-// from the main vector loop.
-static void fixReductionScalarResumeWhenVectorizingEpilog(
- VPPhi *EpiResumePhiR, PHINode &EpiResumePhi, BasicBlock *BypassBlock) {
- using namespace VPlanPatternMatch;
- // Get the VPInstruction computing the reduction result in the middle block.
- // The first operand may not be from the middle block if it is not connected
- // to the scalar preheader. In that case, there's nothing to fix.
- VPValue *Incoming = EpiResumePhiR->getOperand(0);
- match(Incoming, VPlanPatternMatch::m_ZExtOrSExt(
- VPlanPatternMatch::m_VPValue(Incoming)));
- auto *EpiRedResult = dyn_cast<VPInstruction>(Incoming);
- if (!EpiRedResult)
- return;
-
- VPValue *BackedgeVal;
- bool IsFindIV = false;
- if (EpiRedResult->getOpcode() == VPInstruction::ComputeAnyOfResult ||
- EpiRedResult->getOpcode() == VPInstruction::ComputeReductionResult)
- BackedgeVal = EpiRedResult->getOperand(EpiRedResult->getNumOperands() - 1);
- else if (matchFindIVResult(EpiRedResult, m_VPValue(BackedgeVal), m_VPValue()))
- IsFindIV = true;
- else
- return;
-
- auto *EpiRedHeaderPhi = cast_if_present<VPReductionPHIRecipe>(
- vputils::findRecipe(BackedgeVal, IsaPred<VPReductionPHIRecipe>));
- if (!EpiRedHeaderPhi) {
- match(BackedgeVal,
- VPlanPatternMatch::m_Select(VPlanPatternMatch::m_VPValue(),
- VPlanPatternMatch::m_VPValue(BackedgeVal),
- VPlanPatternMatch::m_VPValue()));
- EpiRedHeaderPhi = cast<VPReductionPHIRecipe>(
- vputils::findRecipe(BackedgeVal, IsaPred<VPReductionPHIRecipe>));
- }
-
- Value *MainResumeValue;
- if (auto *VPI = dyn_cast<VPInstruction>(EpiRedHeaderPhi->getStartValue())) {
- assert((VPI->getOpcode() == VPInstruction::Broadcast ||
- VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
- "unexpected start recipe");
- MainResumeValue = VPI->getOperand(0)->getUnderlyingValue();
- } else
- MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
- if (EpiRedResult->getOpcode() == VPInstruction::ComputeAnyOfResult) {
- [[maybe_unused]] Value *StartV =
- EpiRedResult->getOperand(0)->getLiveInIRValue();
- auto *Cmp = cast<ICmpInst>(MainResumeValue);
- assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
- "AnyOf expected to start with ICMP_NE");
- assert(Cmp->getOperand(1) == StartV &&
- "AnyOf expected to start by comparing main resume value to original "
- "start value");
- MainResumeValue = Cmp->getOperand(0);
- } else if (IsFindIV) {
- MainResumeValue = cast<SelectInst>(MainResumeValue)->getFalseValue();
- }
- PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
-
- // When fixing reductions in the epilogue loop we should already have
- // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
- // over the incoming values correctly.
- EpiResumePhi.setIncomingValueForBlock(
- BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
-}
-
DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
@@ -7580,7 +7497,7 @@ BasicBlock *EpilogueVectorizerMainLoop::createVectorizedLoopSkeleton() {
emitIterationCountCheck(VectorPH, ScalarPH, true);
EPI.EpilogueIterationCountCheck->setName("iter.check");
- VectorPH = cast<BranchInst>(EPI.EpilogueIterationCountCheck->getTerminator())
+ VectorPH = cast<CondBrInst>(EPI.EpilogueIterationCountCheck->getTerminator())
->getSuccessor(1);
// Generate the iteration count check for the main loop, *after* the check
// for the epilogue loop, so that the path-length is shorter for the case
@@ -7591,7 +7508,7 @@ BasicBlock *EpilogueVectorizerMainLoop::createVectorizedLoopSkeleton() {
EPI.MainLoopIterationCountCheck =
emitIterationCountCheck(VectorPH, ScalarPH, false);
- return cast<BranchInst>(EPI.MainLoopIterationCountCheck->getTerminator())
+ return cast<CondBrInst>(EPI.MainLoopIterationCountCheck->getTerminator())
->getSuccessor(1);
}
@@ -7638,7 +7555,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitIterationCountCheck(
VectorPHVPBB = replaceVPBBWithIRVPBB(VectorPHVPBB, VectorPH);
}
- BranchInst &BI = *BranchInst::Create(Bypass, VectorPH, CheckMinIters);
+ CondBrInst &BI = *CondBrInst::Create(CheckMinIters, Bypass, VectorPH);
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
@@ -7904,8 +7821,8 @@ VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(VPInstruction *VPI,
}
bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
- assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
- !isa<StoreInst>(I) && "Instruction should have been handled earlier");
+ assert((!isa<UncondBrInst, CondBrInst, PHINode, LoadInst, StoreInst>(I)) &&
+ "Instruction should have been handled earlier");
// Instruction should be widened, unless it is scalar after vectorization,
// scalarization is profitable or it is predicated.
auto WillScalarize = [this, I](ElementCount VF) -> bool {
@@ -8128,13 +8045,21 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
OrigLoop, *LI, Legal->getWidestInductionType(),
getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE, &LVer);
+ VPlanTransforms::simplifyRecipes(*VPlan0);
+ VPlanTransforms::handleEarlyExits(*VPlan0, Legal->hasUncountableEarlyExit());
+ VPlanTransforms::addMiddleCheck(*VPlan0, CM.foldTailByMasking());
+ RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::createLoopRegions, *VPlan0);
+
// Create recipes for header phis.
VPlanTransforms::createHeaderPhiRecipes(
*VPlan0, PSE, *OrigLoop, Legal->getInductionVars(),
Legal->getReductionVars(), Legal->getFixedOrderRecurrences(),
CM.getInLoopReductions(), Hints.allowReordering());
- VPlanTransforms::simplifyRecipes(*VPlan0);
+ if (CM.foldTailByMasking())
+ RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::foldTailByMasking, *VPlan0);
+ RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::introduceMasksAndLinearize,
+ *VPlan0);
auto MaxVFTimes2 = MaxVF * 2;
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
@@ -8181,13 +8106,14 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
return !CM.requiresScalarEpilogue(VF.isVector());
},
Range);
- VPlanTransforms::handleEarlyExits(*Plan, Legal->hasUncountableEarlyExit());
- VPlanTransforms::addMiddleCheck(*Plan, RequiresScalarEpilogueCheck,
- CM.foldTailByMasking());
-
- RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::createLoopRegions, *Plan);
- if (CM.foldTailByMasking())
- RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::foldTailByMasking, *Plan);
+ // Update the branch in the middle block if a scalar epilogue is required.
+ VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
+ if (!RequiresScalarEpilogueCheck && MiddleVPBB->getNumSuccessors() == 2) {
+ auto *BranchOnCond = cast<VPInstruction>(MiddleVPBB->getTerminator());
+ assert(MiddleVPBB->getSuccessors()[1] == Plan->getScalarPreheader() &&
+ "second successor must be scalar preheader");
+ BranchOnCond->setOperand(0, Plan->getFalse());
+ }
// Don't use getDecisionAndClampRange here, because we don't know the UF
// so this function is better to be conservative, rather than to split
@@ -8239,11 +8165,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
InterleaveGroups.insert(IG);
}
- // ---------------------------------------------------------------------------
- // Predicate and linearize the top-level loop region.
- // ---------------------------------------------------------------------------
- RUN_VPLAN_PASS_NO_VERIFY(VPlanTransforms::introduceMasksAndLinearize, *Plan);
-
// ---------------------------------------------------------------------------
// Construct wide recipes and apply predication for original scalar
// VPInstructions in the loop.
@@ -8256,7 +8177,6 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
HeaderVPBB);
- auto *MiddleVPBB = Plan->getMiddleBlock();
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
// Collect blocks that need predication for in-loop reduction recipes.
@@ -8426,17 +8346,17 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
OrigLoop, *LI, Legal->getWidestInductionType(),
getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()), PSE);
+ VPlanTransforms::handleEarlyExits(*Plan,
+ /*HasUncountableExit*/ false);
+ VPlanTransforms::addMiddleCheck(*Plan, /*TailFolded*/ false);
+
+ VPlanTransforms::createLoopRegions(*Plan);
+
VPlanTransforms::createHeaderPhiRecipes(
*Plan, PSE, *OrigLoop, Legal->getInductionVars(),
MapVector<PHINode *, RecurrenceDescriptor>(),
SmallPtrSet<const PHINode *, 1>(), SmallPtrSet<PHINode *, 1>(),
/*AllowReordering=*/false);
- VPlanTransforms::handleEarlyExits(*Plan,
- /*HasUncountableExit*/ false);
- VPlanTransforms::addMiddleCheck(*Plan, /*RequiresScalarEpilogue*/ true,
- /*TailFolded*/ false);
-
- VPlanTransforms::createLoopRegions(*Plan);
for (ElementCount VF : Range)
Plan->addVF(VF);
@@ -8992,35 +8912,9 @@ LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
!EnableLoopVectorization) {}
/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
-/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
-/// don't have a corresponding wide induction in \p EpiPlan.
-static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
- // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
- // will need their resume-values computed in the main vector loop. Others
- // can be removed from the main VPlan.
- SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
- for (VPRecipeBase &R :
- EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
- if (isa<VPCanonicalIVPHIRecipe>(&R))
- continue;
- EpiWidenedPhis.insert(
- cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));
- }
- for (VPRecipeBase &R :
- make_early_inc_range(MainPlan.getScalarHeader()->phis())) {
- auto *VPIRInst = cast<VPIRPhi>(&R);
- if (EpiWidenedPhis.contains(&VPIRInst->getIRPhi()))
- continue;
- // There is no corresponding wide induction in the epilogue plan that would
- // need a resume value. Remove the VPIRInst wrapping the scalar header phi
- // together with the corresponding ResumePhi. The resume values for the
- // scalar loop will be created during execution of EpiPlan.
- VPRecipeBase *ResumePhi = VPIRInst->getOperand(0)->getDefiningRecipe();
- VPIRInst->eraseFromParent();
- ResumePhi->eraseFromParent();
- }
- RUN_VPLAN_PASS(VPlanTransforms::removeDeadRecipes, MainPlan);
-
+/// vectorization.
+static SmallVector<VPInstruction *>
+preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
using namespace VPlanPatternMatch;
// When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
// introduce multiple uses of undef/poison. If the reduction start value may
@@ -9070,21 +8964,37 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
});
VPPhi *ResumePhi = nullptr;
if (ResumePhiIter == MainScalarPH->phis().end()) {
+ using namespace llvm::VPlanPatternMatch;
+ assert(
+ match(MainPlan.getVectorLoopRegion()->getCanonicalIV()->getStartValue(),
+ m_ZeroInt()) &&
+ "canonical IV must start at 0");
+ Type *Ty = VPTypeAnalysis(MainPlan).inferScalarType(VectorTC);
VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
ResumePhi = ScalarPHBuilder.createScalarPhi(
- {VectorTC,
- MainPlan.getVectorLoopRegion()->getCanonicalIV()->getStartValue()},
- {}, "vec.epilog.resume.val");
+ {VectorTC, MainPlan.getZero(Ty)}, {}, "vec.epilog.resume.val");
} else {
ResumePhi = cast<VPPhi>(&*ResumePhiIter);
- if (MainScalarPH->begin() == MainScalarPH->end())
- ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->end());
- else if (&*MainScalarPH->begin() != ResumePhi)
+ ResumePhi->setName("vec.epilog.resume.val");
+ if (&MainScalarPH->front() != ResumePhi)
ResumePhi->moveBefore(*MainScalarPH, MainScalarPH->begin());
}
- // Add a user to to make sure the resume phi won't get removed.
- VPBuilder(MainScalarPH)
- .createNaryOp(VPInstruction::ResumeForEpilogue, ResumePhi);
+
+ // Create a ResumeForEpilogue for the canonical IV resume as the
+ // first non-phi, to keep it alive for the epilogue.
+ VPBuilder ResumeBuilder(MainScalarPH);
+ ResumeBuilder.createNaryOp(VPInstruction::ResumeForEpilogue, ResumePhi);
+
+ // Create ResumeForEpilogue instructions for the resume phis of the
+ // VPIRPhis in the scalar header of the main plan and return them so they can
+ // be used as resume values when vectorizing the epilogue.
+ return to_vector(
+ map_range(MainPlan.getScalarHeader()->phis(), [&](VPRecipeBase &R) {
+ assert(isa<VPIRPhi>(R) &&
+ "only VPIRPhis expected in the scalar header");
+ return ResumeBuilder.createNaryOp(VPInstruction::ResumeForEpilogue,
+ R.getOperand(0));
+ }));
}
/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
@@ -9292,39 +9202,11 @@ static SmallVector<Instruction *> preparePlanForEpilogueVectorLoop(
return InstsToMove;
}
-// Generate bypass values from the additional bypass block. Note that when the
-// vectorized epilogue is skipped due to iteration count check, then the
-// resume value for the induction variable comes from the trip count of the
-// main vector loop, passed as the second argument.
-static Value *createInductionAdditionalBypassValues(
- PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder,
- const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
- Instruction *OldInduction) {
- Value *Step = getExpandedStep(II, ExpandedSCEVs);
- // For the primary induction the additional bypass end value is known.
- // Otherwise it is computed.
- Value *EndValueFromAdditionalBypass = MainVectorTripCount;
- if (OrigPhi != OldInduction) {
- auto *BinOp = II.getInductionBinOp();
- // Fast-math-flags propagate from the original induction instruction.
- if (isa_and_nonnull<FPMathOperator>(BinOp))
- BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
-
- // Compute the end value for the additional bypass.
- EndValueFromAdditionalBypass =
- emitTransformedIndex(BypassBuilder, MainVectorTripCount,
- II.getStartValue(), Step, II.getKind(), BinOp);
- EndValueFromAdditionalBypass->setName("ind.end");
- }
- return EndValueFromAdditionalBypass;
-}
-
-static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
- VPlan &BestEpiPlan,
- LoopVectorizationLegality &LVL,
- const SCEV2ValueTy &ExpandedSCEVs,
- Value *MainVectorTripCount) {
- // Fix reduction resume values from the additional bypass block.
+static void
+fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
+ VPlan &BestEpiPlan,
+ ArrayRef<VPInstruction *> ResumeValues) {
+ // Fix resume values from the additional bypass block.
BasicBlock *PH = L->getLoopPreheader();
for (auto *Pred : predecessors(PH)) {
for (PHINode &Phi : PH->phis()) {
@@ -9335,59 +9217,41 @@ static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
}
auto *ScalarPH = cast<VPIRBasicBlock>(BestEpiPlan.getScalarPreheader());
if (ScalarPH->hasPredecessors()) {
- // If ScalarPH has predecessors, we may need to update its reduction
- // resume values.
- for (const auto &[R, IRPhi] :
- zip(ScalarPH->phis(), ScalarPH->getIRBasicBlock()->phis())) {
- fixReductionScalarResumeWhenVectorizingEpilog(cast<VPPhi>(&R), IRPhi,
- BypassBlock);
- }
- }
-
- // Fix induction resume values from the additional bypass block.
- IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt());
- for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
- Value *V = createInductionAdditionalBypassValues(
- IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount,
- LVL.getPrimaryInduction());
- // TODO: Directly add as extra operand to the VPResumePHI recipe.
- if (auto *Inc = dyn_cast<PHINode>(IVPhi->getIncomingValueForBlock(PH))) {
- if (Inc->getBasicBlockIndex(BypassBlock) != -1)
- Inc->setIncomingValueForBlock(BypassBlock, V);
- } else {
- // If the resume value in the scalar preheader was simplified (e.g., when
- // narrowInterleaveGroups optimized away the resume PHIs), create a new
- // PHI to merge the bypass value with the original value.
- Value *OrigVal = IVPhi->getIncomingValueForBlock(PH);
- PHINode *NewPhi =
- PHINode::Create(IVPhi->getType(), pred_size(PH), "bc.resume.val",
- PH->getFirstNonPHIIt());
- for (auto *Pred : predecessors(PH)) {
- if (Pred == BypassBlock)
- NewPhi->addIncoming(V, Pred);
- else
- NewPhi->addIncoming(OrigVal, Pred);
- }
- IVPhi->setIncomingValueForBlock(PH, NewPhi);
+ // Fix resume values for inductions and reductions from the additional
+ // bypass block using the incoming values from the main loop's resume phis.
+ // ResumeValues correspond 1:1 with the scalar loop header phis.
+ for (auto [ResumeV, HeaderPhi] :
+ zip(ResumeValues, BestEpiPlan.getScalarHeader()->phis())) {
+ auto *HeaderPhiR = cast<VPIRPhi>(&HeaderPhi);
+ if (isa<VPIRValue>(HeaderPhiR->getIncomingValueForBlock(ScalarPH)))
+ continue;
+ auto *EpiResumePhi =
+ cast<PHINode>(HeaderPhiR->getIRPhi().getIncomingValueForBlock(PH));
+ if (EpiResumePhi->getBasicBlockIndex(BypassBlock) == -1)
+ continue;
+ auto *MainResumePhi = cast<PHINode>(ResumeV->getUnderlyingValue());
+ EpiResumePhi->setIncomingValueForBlock(
+ BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock));
}
}
}
/// Connect the epilogue vector loop generated for \p EpiPlan to the main vector
-// loop, after both plans have executed, updating branches from the iteration
-// and runtime checks of the main loop, as well as updating various phis. \p
-// InstsToMove contains instructions that need to be moved to the preheader of
-// the epilogue vector loop.
-static void connectEpilogueVectorLoop(
- VPlan &EpiPlan, Loop *L, EpilogueLoopVectorizationInfo &EPI,
- DominatorTree *DT, LoopVectorizationLegality &LVL,
- DenseMap<const SCEV *, Value *> &ExpandedSCEVs, GeneratedRTChecks &Checks,
- ArrayRef<Instruction *> InstsToMove) {
+/// loop, after both plans have executed, updating branches from the iteration
+/// and runtime checks of the main loop, as well as updating various phis. \p
+/// InstsToMove contains instructions that need to be moved to the preheader of
+/// the epilogue vector loop.
+static void connectEpilogueVectorLoop(VPlan &EpiPlan, Loop *L,
+ EpilogueLoopVectorizationInfo &EPI,
+ DominatorTree *DT,
+ GeneratedRTChecks &Checks,
+ ArrayRef<Instruction *> InstsToMove,
+ ArrayRef<VPInstruction *> ResumeValues) {
BasicBlock *VecEpilogueIterationCountCheck =
cast<VPIRBasicBlock>(EpiPlan.getEntry())->getIRBasicBlock();
BasicBlock *VecEpiloguePreHeader =
- cast<BranchInst>(VecEpilogueIterationCountCheck->getTerminator())
+ cast<CondBrInst>(VecEpilogueIterationCountCheck->getTerminator())
->getSuccessor(1);
// Adjust the control flow taking the state info from the main loop
// vectorization into account.
@@ -9465,7 +9329,13 @@ static void connectEpilogueVectorLoop(
// after executing the main loop. We need to update the resume values of
// inductions and reductions during epilogue vectorization.
fixScalarResumeValuesFromBypass(VecEpilogueIterationCountCheck, L, EpiPlan,
- LVL, ExpandedSCEVs, EPI.VectorTripCount);
+ ResumeValues);
+
+ // Remove dead phis that were moved to the epilogue preheader but are unused
+ // (e.g., resume phis for inductions not widened in the epilogue vector loop).
+ for (PHINode &Phi : make_early_inc_range(VecEpiloguePreHeader->phis()))
+ if (Phi.use_empty())
+ Phi.eraseFromParent();
}
bool LoopVectorizePass::processLoop(Loop *L) {
@@ -9852,7 +9722,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
VPlan &BestEpiPlan = LVP.getPlanFor(EpilogueVF.Width);
BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
BestEpiPlan.getVectorPreheader()->setName("vec.epilog.ph");
- preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
+ SmallVector<VPInstruction *> ResumeValues =
+ preparePlanForMainVectorLoop(*BestMainPlan, BestEpiPlan);
EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
BestEpiPlan);
EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TTI, AC, EPI, &CM,
@@ -9869,8 +9740,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
BestEpiPlan, L, ExpandedSCEVs, EPI, CM, *PSE.getSE());
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
true);
- connectEpilogueVectorLoop(BestEpiPlan, L, EPI, DT, LVL, ExpandedSCEVs,
- Checks, InstsToMove);
+ connectEpilogueVectorLoop(BestEpiPlan, L, EPI, DT, Checks, InstsToMove,
+ ResumeValues);
++LoopsEpilogueVectorized;
} else {
InnerLoopVectorizer LB(L, PSE, LI, DT, TTI, AC, VF.Width, IC, &CM, Checks,
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 97ba8eee6742c..97866783f62af 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -13267,8 +13267,6 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
if (ScalarTy->isVectorTy())
return false;
const unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
- if (!isPowerOf2_64(Sz))
- return false;
const TreeEntry *LhsTE = getOperandEntry(&TE, /*Idx=*/0);
const TreeEntry *RhsTE = getOperandEntry(&TE, /*Idx=*/1);
// Lhs should be zext i<stride> to I<sz>.
@@ -13280,7 +13278,8 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
return false;
Type *SrcScalarTy = cast<ZExtInst>(LhsTE->getMainOp())->getSrcTy();
unsigned Stride = DL->getTypeSizeInBits(SrcScalarTy);
- if (!isPowerOf2_64(Stride) || Stride >= Sz)
+ if (!isPowerOf2_64(Stride) || Stride >= Sz || Sz % Stride != 0 ||
+ !isPowerOf2_64(LhsTE->getVectorFactor()))
return false;
if (!(RhsTE->isGather() && RhsTE->ReorderIndices.empty() &&
RhsTE->ReuseShuffleIndices.empty() && !MinBWs.contains(RhsTE)))
@@ -13332,6 +13331,8 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
return false;
}
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ auto *SrcType = IntegerType::getIntNTy(ScalarTy->getContext(),
+ Stride * LhsTE->getVectorFactor());
FastMathFlags FMF;
SmallPtrSet<Value *, 4> CheckedExtracts;
auto *VecTy = getWidenedType(ScalarTy, TE.getVectorFactor());
@@ -13347,7 +13348,7 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
getWidenedType(SrcScalarTy, LhsTE->getVectorFactor()), CastCtx,
CostKind);
InstructionCost BitcastCost = TTI->getCastInstrCost(
- Instruction::BitCast, ScalarTy, SrcVecTy, CastCtx, CostKind);
+ Instruction::BitCast, SrcType, SrcVecTy, CastCtx, CostKind);
if (!Order.empty()) {
fixupOrderingIndices(Order);
SmallVector<int> Mask;
@@ -13359,9 +13360,9 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
constexpr unsigned ByteSize = 8;
if (!Order.empty() && isReverseOrder(Order) &&
DL->getTypeSizeInBits(SrcScalarTy) == ByteSize) {
- IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, ScalarTy, {ScalarTy});
+ IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
InstructionCost BSwapCost =
- TTI->getCastInstrCost(Instruction::BitCast, ScalarTy, SrcVecTy, CastCtx,
+ TTI->getCastInstrCost(Instruction::BitCast, SrcType, SrcVecTy, CastCtx,
CostKind) +
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
if (BSwapCost <= BitcastCost) {
@@ -13375,10 +13376,9 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
SrcTE->getOpcode() == Instruction::Load && !SrcTE->isAltShuffle() &&
all_of(SrcTE->Scalars, [](Value *V) { return V->hasOneUse(); })) {
auto *LI = cast<LoadInst>(SrcTE->getMainOp());
- IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, ScalarTy,
- {ScalarTy});
+ IntrinsicCostAttributes CostAttrs(Intrinsic::bswap, SrcType, {SrcType});
InstructionCost BSwapCost =
- TTI->getMemoryOpCost(Instruction::Load, ScalarTy, LI->getAlign(),
+ TTI->getMemoryOpCost(Instruction::Load, SrcType, LI->getAlign(),
LI->getPointerAddressSpace(), CostKind) +
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
if (BSwapCost <= BitcastCost) {
@@ -13399,7 +13399,7 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
all_of(SrcTE->Scalars, [](Value *V) { return V->hasOneUse(); })) {
auto *LI = cast<LoadInst>(SrcTE->getMainOp());
BitcastCost =
- TTI->getMemoryOpCost(Instruction::Load, ScalarTy, LI->getAlign(),
+ TTI->getMemoryOpCost(Instruction::Load, SrcType, LI->getAlign(),
LI->getPointerAddressSpace(), CostKind);
VecCost +=
TTI->getMemoryOpCost(Instruction::Load, SrcVecTy, LI->getAlign(),
@@ -13407,6 +13407,10 @@ bool BoUpSLP::matchesShlZExt(const TreeEntry &TE, OrdersType &Order,
ForLoads = true;
}
}
+ if (SrcType != ScalarTy) {
+ BitcastCost += TTI->getCastInstrCost(Instruction::ZExt, ScalarTy, SrcType,
+ TTI::CastContextHint::None, CostKind);
+ }
return BitcastCost < VecCost;
}
@@ -22325,11 +22329,17 @@ Value *BoUpSLP::vectorizeTree(
IRBuilder<>::InsertPointGuard Guard(Builder);
Builder.SetInsertPoint(ReductionRoot->getParent(),
ReductionRoot->getIterator());
- Vec = Builder.CreateIntCast(
- Vec,
- VectorType::get(Builder.getIntNTy(ReductionBitWidth),
- cast<VectorType>(Vec->getType())->getElementCount()),
- It->second.second);
+ if (isReducedBitcastRoot() || isReducedCmpBitcastRoot()) {
+ Vec = Builder.CreateIntCast(Vec, Builder.getIntNTy(ReductionBitWidth),
+ It->second.second);
+
+ } else {
+ Vec = Builder.CreateIntCast(
+ Vec,
+ VectorType::get(Builder.getIntNTy(ReductionBitWidth),
+ cast<VectorType>(Vec->getType())->getElementCount()),
+ It->second.second);
+ }
}
return Vec;
}
@@ -28333,6 +28343,8 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
auto *GEPJ = GEPList[J];
+ if (!Candidates.count(GEPJ))
+ continue;
const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
Candidates.remove(GEPI);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index c193477f3d356..da631984a9a3c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -428,16 +428,15 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
auto *PredBBTerminator = PredBB->getTerminator();
LLVM_DEBUG(dbgs() << "LV: draw edge from " << PredBB->getName() << '\n');
- auto *TermBr = dyn_cast<BranchInst>(PredBBTerminator);
if (isa<UnreachableInst>(PredBBTerminator)) {
assert(PredVPSuccessors.size() == 1 &&
"Predecessor ending w/o branch must have single successor.");
DebugLoc DL = PredBBTerminator->getDebugLoc();
PredBBTerminator->eraseFromParent();
- auto *Br = BranchInst::Create(NewBB, PredBB);
+ auto *Br = UncondBrInst::Create(NewBB, PredBB);
Br->setDebugLoc(DL);
- } else if (TermBr && !TermBr->isConditional()) {
- TermBr->setSuccessor(0, NewBB);
+ } else if (auto *UBI = dyn_cast<UncondBrInst>(PredBBTerminator)) {
+ UBI->setSuccessor(NewBB);
} else {
// Set each forward successor here when it is created, excluding
// backedges. A backward successor is set when the branch is created.
@@ -447,10 +446,11 @@ void VPBasicBlock::connectToPredecessors(VPTransformState &State) {
// TODO: Remove exception by modeling the terminator of entry block using
// BranchOnCond.
unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
- assert((TermBr && (!TermBr->getSuccessor(idx) ||
- (isa<VPIRBasicBlock>(this) &&
- (TermBr->getSuccessor(idx) == NewBB ||
- PredVPBlock == getPlan()->getEntry())))) &&
+ auto *TermBr = cast<CondBrInst>(PredBBTerminator);
+ assert((!TermBr->getSuccessor(idx) ||
+ (isa<VPIRBasicBlock>(this) &&
+ (TermBr->getSuccessor(idx) == NewBB ||
+ PredVPBlock == getPlan()->getEntry()))) &&
"Trying to reset an existing successor block.");
TermBr->setSuccessor(idx, NewBB);
}
@@ -475,9 +475,9 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
Br->setOperand(0, nullptr);
IRBB->getTerminator()->eraseFromParent();
} else {
- assert(
- (getNumSuccessors() == 0 || isa<BranchInst>(IRBB->getTerminator())) &&
- "other blocks must be terminated by a branch");
+ assert((getNumSuccessors() == 0 ||
+ isa<UncondBrInst, CondBrInst>(IRBB->getTerminator())) &&
+ "other blocks must be terminated by a branch");
}
connectToPredecessors(*State);
@@ -664,7 +664,7 @@ void VPBlockBase::print(raw_ostream &O) const {
}
void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const {
- if (getSuccessors().empty()) {
+ if (!hasSuccessors()) {
O << Indent << "No successors\n";
} else {
O << Indent << "Successor(s): ";
@@ -936,7 +936,7 @@ void VPlan::execute(VPTransformState *State) {
// Disconnect VectorPreHeader from ExitBB in both the CFG and DT.
BasicBlock *VectorPreHeader = State->CFG.PrevBB;
- cast<BranchInst>(VectorPreHeader->getTerminator())->setSuccessor(0, nullptr);
+ cast<UncondBrInst>(VectorPreHeader->getTerminator())->setSuccessor(nullptr);
State->CFG.DTU.applyUpdates(
{{DominatorTree::Delete, VectorPreHeader, State->CFG.ExitBB}});
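
(Since unconditional branches have exactly one successor under the split, VPlan's CFG stitching above drops the successor index. Sketch only, reusing the headers from the earlier sketches and assuming the proposed single-successor setSuccessor overload:)

static void retargetFallthrough(BasicBlock *Pred, BasicBlock *NewSucc) {
  if (auto *UBI = dyn_cast<UncondBrInst>(Pred->getTerminator()))
    UBI->setSuccessor(NewSucc); // no successor index needed
}
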
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index ce19e41f5d889..80df058dfcf66 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -199,6 +199,11 @@ class LLVM_ABI_FOR_TEST VPBlockBase {
const VPBlocksTy &getSuccessors() const { return Successors; }
VPBlocksTy &getSuccessors() { return Successors; }
+ /// Returns true if this block has any successors.
+ bool hasSuccessors() const { return !Successors.empty(); }
+ /// Returns true if this block has any predecessors.
+ bool hasPredecessors() const { return !Predecessors.empty(); }
+
iterator_range<VPBlockBase **> successors() { return Successors; }
iterator_range<VPBlockBase **> predecessors() { return Predecessors; }
@@ -220,9 +225,6 @@ class LLVM_ABI_FOR_TEST VPBlockBase {
size_t getNumSuccessors() const { return Successors.size(); }
size_t getNumPredecessors() const { return Predecessors.size(); }
- /// Returns true if this block has any predecessors.
- bool hasPredecessors() const { return !Predecessors.empty(); }
-
/// An Enclosing Block of a block B is any block containing B, including B
/// itself. \return the closest enclosing block starting from "this", which
/// has successors. \return the root enclosing block if all enclosing blocks
@@ -4441,14 +4443,14 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
const std::string &Name = "", bool IsReplicator = false)
: VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting),
IsReplicator(IsReplicator) {
- assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
- assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
- Entry->setParent(this);
- Exiting->setParent(this);
+ if (Entry) {
+ assert(!Entry->hasPredecessors() && "Entry block has predecessors.");
+ assert(Exiting && "Must also pass Exiting if Entry is passed.");
+ assert(!Exiting->hasSuccessors() && "Exit block has successors.");
+ Entry->setParent(this);
+ Exiting->setParent(this);
+ }
}
- VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
- : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
- IsReplicator(IsReplicator) {}
public:
~VPRegionBlock() override = default;
@@ -4464,7 +4466,7 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
/// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
/// EntryBlock must have no predecessors.
void setEntry(VPBlockBase *EntryBlock) {
- assert(EntryBlock->getPredecessors().empty() &&
+ assert(!EntryBlock->hasPredecessors() &&
"Entry block cannot have predecessors.");
Entry = EntryBlock;
EntryBlock->setParent(this);
@@ -4476,7 +4478,7 @@ class LLVM_ABI_FOR_TEST VPRegionBlock : public VPBlockBase {
/// Set \p ExitingBlock as the exiting VPBlockBase of this VPRegionBlock. \p
/// ExitingBlock must have no successors.
void setExiting(VPBlockBase *ExitingBlock) {
- assert(ExitingBlock->getSuccessors().empty() &&
+ assert(!ExitingBlock->hasSuccessors() &&
"Exit block cannot have successors.");
Exiting = ExitingBlock;
ExitingBlock->setParent(this);
@@ -4899,8 +4901,7 @@ class VPlan {
VPRegionBlock *createLoopRegion(const std::string &Name = "",
VPBlockBase *Entry = nullptr,
VPBlockBase *Exiting = nullptr) {
- auto *VPB = Entry ? new VPRegionBlock(Entry, Exiting, Name)
- : new VPRegionBlock(Name);
+ auto *VPB = new VPRegionBlock(Entry, Exiting, Name);
CreatedBlocks.push_back(VPB);
return VPB;
}
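
(The new VPBlockBase::hasSuccessors()/hasPredecessors() helpers above are plain readability wrappers over the existing accessors; a hypothetical use inside the VPlan sources, for illustration only:)

// Equivalent to getPredecessors().empty() / getSuccessors().empty().
static bool isDisconnected(const VPBlockBase *VPB) {
  return !VPB->hasPredecessors() && !VPB->hasSuccessors();
}
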
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 55f2a626c293a..2289b4083ef14 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -455,25 +455,21 @@ SmallVector<VPRegisterUsage, 8> llvm::calculateRegisterUsageForPlan(
// Save the end location of each USE.
for (VPValue *U : R.operands()) {
- auto *DefR = U->getDefiningRecipe();
-
- // Ignore non-recipe values such as arguments, constants, etc.
- // FIXME: Might need some motivation why these values are ignored. If
- // for example an argument is used inside the loop it will increase the
- // register pressure (so shouldn't we add it to LoopInvariants).
- auto *IRV = dyn_cast<VPIRValue>(U);
- if (!DefR && (!IRV || !isa<Instruction>(IRV->getValue())))
- continue;
-
- // If this recipe is outside the loop then record it and continue.
- if (!DefR) {
+ if (isa<VPRecipeValue>(U)) {
+ // Overwrite previous end points.
+ EndPoint[U] = Idx2Recipe.size();
+ Ends.insert(U);
+ } else if (auto *IRV = dyn_cast<VPIRValue>(U)) {
+ // Ignore non-recipe values such as arguments, constants, etc.
+ // FIXME: Might need some motivation why these values are ignored. If
+ // for example an argument is used inside the loop it will increase
+ // the register pressure (so shouldn't we add it to LoopInvariants).
+ if (!isa<Instruction>(IRV->getValue()))
+ continue;
+      // This value is defined outside the loop; record it as loop-invariant.
LoopInvariants.insert(U);
- continue;
}
-
- // Overwrite previous end points.
- EndPoint[U] = Idx2Recipe.size();
- Ends.insert(U);
+ // Other types of VPValue are currently not tracked.
}
}
if (VPBB == LoopRegion->getExiting()) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 468193d9e10eb..d755851eca44a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -192,16 +192,16 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
assert(!IRDef2VPValue.count(Inst) &&
"Instruction shouldn't have been visited.");
- if (auto *Br = dyn_cast<BranchInst>(Inst)) {
+    // Unconditional branches need no explicit modeling; skip them.
+    if (isa<UncondBrInst>(Inst))
+      continue;
+
+ if (auto *Br = dyn_cast<CondBrInst>(Inst)) {
// Conditional branch instruction are represented using BranchOnCond
// recipes.
- if (Br->isConditional()) {
- VPValue *Cond = getOrCreateVPOperand(Br->getCondition());
- VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst, {},
- VPIRMetadata(*Inst), Inst->getDebugLoc());
- }
-
- // Skip the rest of the Instruction processing for Branch instructions.
+ VPValue *Cond = getOrCreateVPOperand(Br->getCondition());
+ VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst, {},
+ VPIRMetadata(*Inst), Inst->getDebugLoc());
continue;
}
@@ -330,15 +330,11 @@ std::unique_ptr<VPlan> PlainCFGBuilder::buildPlainCFG() {
VPBB->setSuccessors(Succs);
continue;
}
- auto *BI = cast<BranchInst>(BB->getTerminator());
- unsigned NumSuccs = succ_size(BB);
- if (NumSuccs == 1) {
- VPBB->setOneSuccessor(getOrCreateVPBB(BB->getSingleSuccessor()));
+ if (auto *BI = dyn_cast<UncondBrInst>(BB->getTerminator())) {
+ VPBB->setOneSuccessor(getOrCreateVPBB(BI->getSuccessor()));
continue;
}
- assert(BI->isConditional() && NumSuccs == 2 && BI->isConditional() &&
- "block must have conditional branch with 2 successors");
-
+ auto *BI = cast<CondBrInst>(BB->getTerminator());
BasicBlock *IRSucc0 = BI->getSuccessor(0);
BasicBlock *IRSucc1 = BI->getSuccessor(1);
VPBasicBlock *Successor0 = getOrCreateVPBB(IRSucc0);
@@ -706,12 +702,7 @@ void VPlanTransforms::createHeaderPhiRecipes(
const MapVector<PHINode *, RecurrenceDescriptor> &Reductions,
const SmallPtrSetImpl<const PHINode *> &FixedOrderRecurrences,
const SmallPtrSetImpl<PHINode *> &InLoopReductions, bool AllowReordering) {
- // Retrieve the header manually from the intial plain-CFG VPlan.
- VPBasicBlock *HeaderVPBB = cast<VPBasicBlock>(
- Plan.getEntry()->getSuccessors()[1]->getSingleSuccessor());
- assert(VPDominatorTree(Plan).dominates(HeaderVPBB,
- HeaderVPBB->getPredecessors()[1]) &&
- "header must dominate its latch");
+ VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
auto CreateHeaderPhiRecipe = [&](VPPhi *PhiR) -> VPHeaderPHIRecipe * {
// TODO: Gradually replace uses of underlying instruction by analyses on
@@ -755,9 +746,9 @@ void VPlanTransforms::createHeaderPhiRecipes(
RdxDesc.hasUsesOutsideReductionChain());
};
- for (VPRecipeBase &R : make_early_inc_range(HeaderVPBB->phis())) {
- if (isa<VPCanonicalIVPHIRecipe>(&R))
- continue;
+ assert(isa<VPCanonicalIVPHIRecipe>(HeaderVPBB->front()) &&
+ "first recipe must be canonical IV phi");
+  for (VPRecipeBase &R :
+       make_early_inc_range(drop_begin(HeaderVPBB->phis()))) {
auto *PhiR = cast<VPPhi>(&R);
VPHeaderPHIRecipe *HeaderPhiR = CreateHeaderPhiRecipe(PhiR);
HeaderPhiR->insertBefore(PhiR);
@@ -971,9 +962,7 @@ void VPlanTransforms::handleEarlyExits(VPlan &Plan,
}
}
-void VPlanTransforms::addMiddleCheck(VPlan &Plan,
- bool RequiresScalarEpilogueCheck,
- bool TailFolded) {
+void VPlanTransforms::addMiddleCheck(VPlan &Plan, bool TailFolded) {
auto *MiddleVPBB = cast<VPBasicBlock>(
Plan.getScalarHeader()->getSinglePredecessor()->getPredecessors()[0]);
// If MiddleVPBB has a single successor then the original loop does not exit
@@ -1006,9 +995,7 @@ void VPlanTransforms::addMiddleCheck(VPlan &Plan,
DebugLoc LatchDL = LatchVPBB->getTerminator()->getDebugLoc();
VPBuilder Builder(MiddleVPBB);
VPValue *Cmp;
- if (!RequiresScalarEpilogueCheck)
- Cmp = Plan.getFalse();
- else if (TailFolded)
+ if (TailFolded)
Cmp = Plan.getTrue();
else
Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index f8fa7d3c44ce1..401f6725677e3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1107,7 +1107,7 @@ InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode(
}
Type *CondTy = Ctx.Types.inferScalarType(getOperand(0));
- if (!IsScalarCond)
+ if (!IsScalarCond && VF.isVector())
CondTy = VectorType::get(CondTy, VF);
llvm::CmpPredicate Pred;
@@ -1321,6 +1321,12 @@ void VPInstruction::execute(VPTransformState &State) {
"scalar value but not only first lane defined");
State.set(this, GeneratedValue,
/*IsScalar*/ GeneratesPerFirstLaneOnly);
+ if (getOpcode() == VPInstruction::ResumeForEpilogue) {
+    // FIXME: This is a workaround to enable reliable updates of the scalar
+    // loop resume phis when vectorizing the epilogue. It must be removed once
+    // epilogue vectorization explicitly connects VPlans.
+ setUnderlyingValue(GeneratedValue);
+ }
}
bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
@@ -3614,6 +3620,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
case Instruction::UIToFP:
case Instruction::Trunc:
case Instruction::FPTrunc:
+ case Instruction::Select:
case Instruction::AddrSpaceCast: {
return getCostForRecipeWithOpcode(getOpcode(), ElementCount::getFixed(1),
Ctx) *
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 114917c75a0ca..d389db07885c8 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -230,6 +230,43 @@ canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc,
return true;
}
+/// Collect replicated loads or stores (selected by \p Opcode), grouped by
+/// their address SCEV.
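+/// A typical caller instantiates the helper for a fixed opcode and supplies
+/// a filter, e.g. keeping only predicated recipes (as the complementary-mask
+/// transform below does):
+/// \code
+///   auto Groups = collectGroupedReplicateMemOps<Opcode>(
+///       Plan, PSE, L,
+///       [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
+/// \endcode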
+template <unsigned Opcode>
+static SmallVector<SmallVector<VPReplicateRecipe *, 4>>
+collectGroupedReplicateMemOps(
+ VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L,
+ function_ref<bool(VPReplicateRecipe *)> FilterFn) {
+ static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
+ "Only Load and Store opcodes supported");
+ constexpr bool IsLoad = (Opcode == Instruction::Load);
+ SmallDenseMap<const SCEV *, SmallVector<VPReplicateRecipe *, 4>>
+ RecipesByAddress;
+ for (VPBlockBase *Block :
+ vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry())) {
+ auto *VPBB = cast<VPBasicBlock>(Block);
+ for (VPRecipeBase &R : *VPBB) {
+ auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+ if (!RepR || RepR->getOpcode() != Opcode || !FilterFn(RepR))
+ continue;
+
+ // For loads, operand 0 is address; for stores, operand 1 is address.
+ VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
+ const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
+ if (!isa<SCEVCouldNotCompute>(AddrSCEV))
+ RecipesByAddress[AddrSCEV].push_back(RepR);
+ }
+ }
+ auto Groups = to_vector(RecipesByAddress.values());
+ VPDominatorTree VPDT(Plan);
+ for (auto &Group : Groups) {
+ // Sort mem ops by dominance order, with earliest (most dominating) first.
+ stable_sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
+ return VPDT.properlyDominates(A, B);
+ });
+ }
+ return Groups;
+}
+
/// Return true if we do not know how to (mechanically) hoist or sink \p R out
/// of a loop region.
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
@@ -1373,6 +1410,16 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
return Def->replaceAllUsesWith(
Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
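+  // In the following two folds the repeated x is redundant: if x is false
+  // both sides are false, and if x is true both sides reduce to y, so each
+  // expression is equivalent to x && y.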
+ // x && (x && y) -> x && y
+ if (match(Def, m_LogicalAnd(m_VPValue(X),
+ m_LogicalAnd(m_Deferred(X), m_VPValue()))))
+ return Def->replaceAllUsesWith(Def->getOperand(1));
+
+ // x && (y && x) -> x && y
+ if (match(Def, m_LogicalAnd(m_VPValue(X),
+ m_LogicalAnd(m_VPValue(Y), m_Deferred(X)))))
+ return Def->replaceAllUsesWith(Builder.createLogicalAnd(X, Y));
+
// x && !x -> 0
if (match(Def, m_LogicalAnd(m_VPValue(X), m_Not(m_Deferred(X)))))
return Def->replaceAllUsesWith(Plan->getFalse());
@@ -4723,33 +4770,17 @@ collectComplementaryPredicatedMemOps(VPlan &Plan,
static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
"Only Load and Store opcodes supported");
constexpr bool IsLoad = (Opcode == Instruction::Load);
- VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
- VPDominatorTree VPDT(Plan);
VPTypeAnalysis TypeInfo(Plan);
- // Group predicated operations by their address SCEV.
- DenseMap<const SCEV *, SmallVector<VPReplicateRecipe *>> RecipesByAddress;
- for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
- auto *VPBB = cast<VPBasicBlock>(Block);
- for (VPRecipeBase &R : *VPBB) {
- auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
- if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
- continue;
-
- // For loads, operand 0 is address; for stores, operand 1 is address.
- VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
- const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
- if (!isa<SCEVCouldNotCompute>(AddrSCEV))
- RecipesByAddress[AddrSCEV].push_back(RepR);
- }
- }
-
// For each address, collect operations with the same or complementary masks.
SmallVector<SmallVector<VPReplicateRecipe *, 4>> AllGroups;
auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
};
- for (auto &[Addr, Recipes] : RecipesByAddress) {
+ auto Groups = collectGroupedReplicateMemOps<Opcode>(
+ Plan, PSE, L,
+ [](VPReplicateRecipe *RepR) { return RepR->isPredicated(); });
+ for (auto Recipes : Groups) {
if (Recipes.size() < 2)
continue;
@@ -4784,11 +4815,6 @@ collectComplementaryPredicatedMemOps(VPlan &Plan,
if (HasComplementaryMask) {
assert(Group.size() >= 2 && "must have at least 2 entries");
- // Sort replicates by dominance order, with earliest (most dominating)
- // first.
- sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
- return VPDT.properlyDominates(A, B);
- });
AllGroups.push_back(std::move(Group));
}
}
@@ -5218,7 +5244,7 @@ VPlanTransforms::expandSCEVs(VPlan &Plan, ScalarEvolution &SE) {
/// must be the operand at index \p OpIdx for both the recipe at lane 0, \p
/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
/// is defined at \p Idx of a load interleave group.
-static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
+static bool canNarrowLoad(VPSingleDefRecipe *WideMember0, unsigned OpIdx,
VPValue *OpV, unsigned Idx, bool IsScalable) {
VPValue *Member0Op = WideMember0->getOperand(OpIdx);
VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
@@ -5236,14 +5262,14 @@ static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
static bool canNarrowOps(ArrayRef<VPValue *> Ops, bool IsScalable) {
SmallVector<VPValue *> Ops0;
- auto *WideMember0 = dyn_cast<VPWidenRecipe>(Ops[0]);
+ auto *WideMember0 = dyn_cast<VPSingleDefRecipe>(Ops[0]);
if (!WideMember0)
return false;
-
- for (const auto &[_, V] : enumerate(Ops)) {
- auto *R = dyn_cast<VPWidenRecipe>(V);
- if (!R || R->getOpcode() != WideMember0->getOpcode() ||
- R->getNumOperands() > 2)
+ for (VPValue *V : Ops) {
+ if (!isa<VPWidenRecipe, VPWidenCastRecipe>(V))
+ return false;
+ auto *R = cast<VPSingleDefRecipe>(V);
+ if (getOpcodeOrIntrinsicID(R) != getOpcodeOrIntrinsicID(WideMember0))
return false;
}
@@ -5333,7 +5359,8 @@ narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
if (isAlreadyNarrow(V))
return V;
- if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
+ if (isa<VPWidenRecipe, VPWidenCastRecipe>(R)) {
+ auto *WideMember0 = cast<VPSingleDefRecipe>(R);
for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
WideMember0->setOperand(
Idx,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 5f060b32da847..d10ef23dd05b2 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -155,9 +155,7 @@ struct VPlanTransforms {
/// If a check is needed to guard executing the scalar epilogue loop, it will
/// be added to the middle block.
- LLVM_ABI_FOR_TEST static void addMiddleCheck(VPlan &Plan,
- bool RequiresScalarEpilogueCheck,
- bool TailFolded);
+ LLVM_ABI_FOR_TEST static void addMiddleCheck(VPlan &Plan, bool TailFolded);
// Create a check to \p Plan to see if the vector loop should be executed.
static void addMinimumIterationCheck(
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
index 6614e7cc3a7e9..ae6fb8e3e9cf5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -410,9 +410,8 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {
}
if (VPRegionBlock *EnclosingRegion = VPBB->getEnclosingLoopRegion()) {
- auto *CanonicalIV = EnclosingRegion->getCanonicalIV();
- // Canonical IV chain is uniform.
- if (V == CanonicalIV || V == CanonicalIV->getBackedgeValue())
+ // Canonical IV is uniform.
+ if (V == EnclosingRegion->getCanonicalIV())
return true;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.h b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
index a5692699d9d76..c4cacebcd78ba 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUtils.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.h
@@ -168,8 +168,7 @@ class VPBlockUtils {
/// successors are moved from \p BlockPtr to \p NewBlock. \p NewBlock must
/// have neither successors nor predecessors.
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
- assert(NewBlock->getSuccessors().empty() &&
- NewBlock->getPredecessors().empty() &&
+ assert(!NewBlock->hasSuccessors() && !NewBlock->hasPredecessors() &&
"Can't insert new block with predecessors or successors.");
NewBlock->setParent(BlockPtr->getParent());
transferSuccessors(BlockPtr, NewBlock);
@@ -181,8 +180,7 @@ class VPBlockUtils {
/// NewBlock. Add \p NewBlock as predecessor of \p BlockPtr and \p BlockPtr as
/// successor of \p NewBlock.
static void insertBlockBefore(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
- assert(NewBlock->getSuccessors().empty() &&
- NewBlock->getPredecessors().empty() &&
+ assert(!NewBlock->hasSuccessors() && !NewBlock->hasPredecessors() &&
"Can't insert new block with predecessors or successors.");
NewBlock->setParent(BlockPtr->getParent());
for (VPBlockBase *Pred : to_vector(BlockPtr->predecessors())) {
@@ -201,9 +199,8 @@ class VPBlockUtils {
/// predecessors.
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
VPBlockBase *BlockPtr) {
- assert(IfTrue->getSuccessors().empty() &&
- "Can't insert IfTrue with successors.");
- assert(IfFalse->getSuccessors().empty() &&
+ assert(!IfTrue->hasSuccessors() && "Can't insert IfTrue with successors.");
+ assert(!IfFalse->hasSuccessors() &&
"Can't insert IfFalse with successors.");
BlockPtr->setTwoSuccessors(IfTrue, IfFalse);
IfTrue->setPredecessors({BlockPtr});
diff --git a/llvm/runtimes/CMakeLists.txt b/llvm/runtimes/CMakeLists.txt
index fba0c7a01f972..3ba32e415d712 100644
--- a/llvm/runtimes/CMakeLists.txt
+++ b/llvm/runtimes/CMakeLists.txt
@@ -554,7 +554,7 @@ if(build_runtimes)
# TODO: We need to consider passing it as '-DRUNTIMES_x86_64_LLVM_ENABLE_RUNTIMES'.
if("libclc" IN_LIST LLVM_ENABLE_RUNTIMES)
- foreach(dep clang llvm-as llvm-link opt)
+ foreach(dep clang llvm-as llvm-dis llvm-link opt llvm-ar llvm-ranlib)
if(TARGET ${dep})
list(APPEND extra_deps ${dep})
endif()
diff --git a/llvm/test/Analysis/CostModel/AArch64/arith-bf16.ll b/llvm/test/Analysis/CostModel/AArch64/arith-bf16.ll
index d8dbb74bc0dc1..08f0bc89fefdd 100644
--- a/llvm/test/Analysis/CostModel/AArch64/arith-bf16.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/arith-bf16.ll
@@ -21,10 +21,10 @@ define void @fadd_bf16() {
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-SVE-B16B16-LABEL: 'fadd_bf16'
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %BF16 = fadd bfloat poison, poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4BF16 = fadd <4 x bfloat> poison, poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:3 SizeLat:1 for: %V8BF16 = fadd <8 x bfloat> poison, poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:3 SizeLat:1 for: %V16BF16 = fadd <16 x bfloat> poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %BF16 = fadd bfloat poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4BF16 = fadd <4 x bfloat> poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8BF16 = fadd <8 x bfloat> poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16BF16 = fadd <16 x bfloat> poison, poison
; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%BF16 = fadd bfloat poison, poison
@@ -50,10 +50,10 @@ define void @fsub_bf16() {
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-SVE-B16B16-LABEL: 'fsub_bf16'
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %BF16 = fsub bfloat poison, poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4BF16 = fsub <4 x bfloat> poison, poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:8 CodeSize:1 Lat:3 SizeLat:1 for: %V8BF16 = fsub <8 x bfloat> poison, poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:16 CodeSize:1 Lat:3 SizeLat:1 for: %V16BF16 = fsub <16 x bfloat> poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %BF16 = fsub bfloat poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4BF16 = fsub <4 x bfloat> poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8BF16 = fsub <8 x bfloat> poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16BF16 = fsub <16 x bfloat> poison, poison
; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%BF16 = fsub bfloat poison, poison
@@ -79,10 +79,10 @@ define void @fneg_idiom_bf16() {
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-SVE-B16B16-LABEL: 'fneg_idiom_bf16'
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:3 SizeLat:1 for: %BF16 = fsub bfloat 0xR8000, poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:3 CodeSize:1 Lat:3 SizeLat:1 for: %V4BF16 = fsub <4 x bfloat> splat (bfloat 0xR8000), poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:6 CodeSize:1 Lat:3 SizeLat:1 for: %V8BF16 = fsub <8 x bfloat> splat (bfloat 0xR8000), poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:3 SizeLat:1 for: %V16BF16 = fsub <16 x bfloat> splat (bfloat 0xR8000), poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %BF16 = fsub bfloat 0xR8000, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4BF16 = fsub <4 x bfloat> splat (bfloat 0xR8000), poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8BF16 = fsub <8 x bfloat> splat (bfloat 0xR8000), poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16BF16 = fsub <16 x bfloat> splat (bfloat 0xR8000), poison
; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%BF16 = fsub bfloat -0.0, poison
@@ -138,15 +138,15 @@ define void @fmulneg_bf16() {
;
; CHECK-SVE-B16B16-LABEL: 'fmulneg_bf16'
; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %BF16 = fneg bfloat poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:3 SizeLat:1 for: %BF16M = fmul bfloat %BF16, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %BF16M = fmul bfloat %BF16, poison
; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2BF16 = fneg <2 x bfloat> poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:3 SizeLat:1 for: %V2BF16M = fmul <2 x bfloat> %V2BF16, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2BF16M = fmul <2 x bfloat> %V2BF16, poison
; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4BF16 = fneg <4 x bfloat> poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:3 SizeLat:1 for: %V4BF16M = fmul <4 x bfloat> %V4BF16, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4BF16M = fmul <4 x bfloat> %V4BF16, poison
; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8BF16 = fneg <8 x bfloat> poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:3 SizeLat:1 for: %V8BF16M = fmul <8 x bfloat> %V8BF16, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8BF16M = fmul <8 x bfloat> %V8BF16, poison
; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16BF16 = fneg <16 x bfloat> poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:3 SizeLat:1 for: %V16BF16M = fmul <16 x bfloat> %V16BF16, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16BF16M = fmul <16 x bfloat> %V16BF16, poison
; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%BF16 = fneg bfloat poison
@@ -190,15 +190,15 @@ define void @fnegfmul_bf16() {
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-SVE-B16B16-LABEL: 'fnegfmul_bf16'
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:3 SizeLat:1 for: %BF16M = fmul bfloat poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %BF16M = fmul bfloat poison, poison
; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %BF16 = fneg bfloat %BF16M
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:12 CodeSize:1 Lat:3 SizeLat:1 for: %V2BF16M = fmul <2 x bfloat> poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2BF16M = fmul <2 x bfloat> poison, poison
; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2BF16 = fneg <2 x bfloat> %V2BF16M
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:3 SizeLat:1 for: %V4BF16M = fmul <4 x bfloat> poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4BF16M = fmul <4 x bfloat> poison, poison
; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4BF16 = fneg <4 x bfloat> %V4BF16M
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:3 SizeLat:1 for: %V8BF16M = fmul <8 x bfloat> poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8BF16M = fmul <8 x bfloat> poison, poison
; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8BF16 = fneg <8 x bfloat> %V8BF16M
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:3 SizeLat:1 for: %V16BF16M = fmul <16 x bfloat> poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16BF16M = fmul <16 x bfloat> poison, poison
; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16BF16 = fneg <16 x bfloat> %V16BF16M
; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
@@ -231,10 +231,10 @@ define void @fmul_bf16() {
; CHECK-BF16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
; CHECK-SVE-B16B16-LABEL: 'fmul_bf16'
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:3 SizeLat:1 for: %BF16 = fmul bfloat poison, poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:5 CodeSize:1 Lat:3 SizeLat:1 for: %V4BF16 = fmul <4 x bfloat> poison, poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:10 CodeSize:1 Lat:3 SizeLat:1 for: %V8BF16 = fmul <8 x bfloat> poison, poison
-; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:20 CodeSize:1 Lat:3 SizeLat:1 for: %V16BF16 = fmul <16 x bfloat> poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %BF16 = fmul bfloat poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4BF16 = fmul <4 x bfloat> poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8BF16 = fmul <8 x bfloat> poison, poison
+; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16BF16 = fmul <16 x bfloat> poison, poison
; CHECK-SVE-B16B16-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
%BF16 = fmul bfloat poison, poison
diff --git a/llvm/test/Analysis/DependenceAnalysis/weak-crossing-siv-delta-signed-min.ll b/llvm/test/Analysis/DependenceAnalysis/weak-crossing-siv-delta-signed-min.ll
new file mode 100644
index 0000000000000..d089b8fd5dd6b
--- /dev/null
+++ b/llvm/test/Analysis/DependenceAnalysis/weak-crossing-siv-delta-signed-min.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt < %s -disable-output "-passes=print<da>" 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ALL
+; RUN: opt < %s -disable-output "-passes=print<da>" -da-enable-dependence-test=weak-crossing-siv 2>&1 \
+; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-WEAK-CROSSING-SIV
+
+; c = (1 << 62);
+; for (i = 0; i < c + 10; i++) {
+; if (c - 1 <=u i && i <=u c + 1) {
+; A[i - c] = 0;
+; A[-i + c] = 1;
+; }
+; }
+;
+; FIXME: There is a dependency between the two stores in all directions.
+;
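+; Note: 1 << 62 = 4611686018427387904, so the constants below are -c and c
+; (the initial offsets), c - 1 and c + 1 (the guard bounds), and c + 10 (the
+; loop bound) from the pseudocode above.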
+define void @weak_crossing_siv_delta_signed_min(ptr %A) {
+; CHECK-ALL-LABEL: 'weak_crossing_siv_delta_signed_min'
+; CHECK-ALL-NEXT: Src: store i8 0, ptr %gep.0, align 1 --> Dst: store i8 0, ptr %gep.0, align 1
+; CHECK-ALL-NEXT: da analyze - none!
+; CHECK-ALL-NEXT: Src: store i8 0, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.1, align 1
+; CHECK-ALL-NEXT: da analyze - none!
+; CHECK-ALL-NEXT: Src: store i8 1, ptr %gep.1, align 1 --> Dst: store i8 1, ptr %gep.1, align 1
+; CHECK-ALL-NEXT: da analyze - none!
+;
+; CHECK-WEAK-CROSSING-SIV-LABEL: 'weak_crossing_siv_delta_signed_min'
+; CHECK-WEAK-CROSSING-SIV-NEXT: Src: store i8 0, ptr %gep.0, align 1 --> Dst: store i8 0, ptr %gep.0, align 1
+; CHECK-WEAK-CROSSING-SIV-NEXT: da analyze - output [*]!
+; CHECK-WEAK-CROSSING-SIV-NEXT: Src: store i8 0, ptr %gep.0, align 1 --> Dst: store i8 1, ptr %gep.1, align 1
+; CHECK-WEAK-CROSSING-SIV-NEXT: da analyze - none!
+; CHECK-WEAK-CROSSING-SIV-NEXT: Src: store i8 1, ptr %gep.1, align 1 --> Dst: store i8 1, ptr %gep.1, align 1
+; CHECK-WEAK-CROSSING-SIV-NEXT: da analyze - output [*]!
+;
+entry:
+ br label %loop.header
+
+loop.header:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.latch ]
+ %offset.0 = phi i64 [ -4611686018427387904, %entry ], [ %offset.0.next, %loop.latch ]
+ %offset.1 = phi i64 [ 4611686018427387904, %entry ], [ %offset.1.next, %loop.latch ]
+ %cond.0 = icmp ule i64 4611686018427387903, %i
+ %cond.1 = icmp ule i64 %i, 4611686018427387905
+ %cond = and i1 %cond.0, %cond.1
+ br i1 %cond, label %if.then, label %loop.latch
+
+if.then:
+ %gep.0 = getelementptr i8, ptr %A, i64 %offset.0
+ store i8 0, ptr %gep.0
+ %gep.1 = getelementptr i8, ptr %A, i64 %offset.1
+ store i8 1, ptr %gep.1
+ br label %loop.latch
+
+loop.latch:
+ %i.inc = add i64 %i, 1
+ %offset.0.next = add i64 %offset.0, 1
+ %offset.1.next = add i64 %offset.1, -1
+ %ec = icmp eq i64 %i.inc, 4611686018427387914
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt b/llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt
index a5c1c8e209bf2..cc36415abb00b 100644
--- a/llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt
+++ b/llvm/test/Analysis/IR2Vec/Inputs/reference_default_vocab_print.txt
@@ -74,6 +74,7 @@ Key: MetadataTy: [ 3.50 4.00 ]
Key: VectorTy: [ 11.50 12.00 ]
Key: TokenTy: [ 5.50 6.00 ]
Key: IntegerTy: [ 6.50 7.00 ]
+Key: ByteTy: [ 0.00 0.00 ]
Key: FunctionTy: [ 7.50 8.00 ]
Key: PointerTy: [ 8.50 9.00 ]
Key: StructTy: [ 9.50 10.00 ]
diff --git a/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt b/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt
index ed761e6058210..8d837e9502739 100644
--- a/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt
+++ b/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd1_vocab_print.txt
@@ -74,6 +74,7 @@ Key: MetadataTy: [ 3.50 4.00 ]
Key: VectorTy: [ 11.50 12.00 ]
Key: TokenTy: [ 5.50 6.00 ]
Key: IntegerTy: [ 6.50 7.00 ]
+Key: ByteTy: [ 0.00 0.00 ]
Key: FunctionTy: [ 7.50 8.00 ]
Key: PointerTy: [ 8.50 9.00 ]
Key: StructTy: [ 9.50 10.00 ]
diff --git a/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt b/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt
index 8fc27098c383b..3d31b9d9db315 100644
--- a/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt
+++ b/llvm/test/Analysis/IR2Vec/Inputs/reference_wtd2_vocab_print.txt
@@ -74,6 +74,7 @@ Key: MetadataTy: [ 0.00 0.00 ]
Key: VectorTy: [ 0.00 0.00 ]
Key: TokenTy: [ 0.00 0.00 ]
Key: IntegerTy: [ 0.00 0.00 ]
+Key: ByteTy: [ 0.00 0.00 ]
Key: FunctionTy: [ 0.00 0.00 ]
Key: PointerTy: [ 0.00 0.00 ]
Key: StructTy: [ 0.00 0.00 ]
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/invariant-dep-same-ptr.ll b/llvm/test/Analysis/LoopAccessAnalysis/invariant-dep-same-ptr.ll
new file mode 100644
index 0000000000000..5aeff497466fa
--- /dev/null
+++ b/llvm/test/Analysis/LoopAccessAnalysis/invariant-dep-same-ptr.ll
@@ -0,0 +1,343 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes='print<access-info>' -disable-output %s 2>&1 | FileCheck %s
+
+; Store and load to same invariant address through a phi.
+; FIXME: Incorrectly considered safe with runtime checks.
+define void @conditional_store_load_same_invariant_via_phi(ptr %p0, ptr %p1, ptr %p2, i64 %n, i1 %c) {
+; CHECK-LABEL: 'conditional_store_load_same_invariant_via_phi'
+; CHECK-NEXT: loop:
+; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group GRP0:
+; CHECK-NEXT: ptr %p2
+; CHECK-NEXT: Against group GRP1:
+; CHECK-NEXT: %gep1 = getelementptr i32, ptr %phip, i64 %iv
+; CHECK-NEXT: Check 1:
+; CHECK-NEXT: Comparing group GRP0:
+; CHECK-NEXT: ptr %p2
+; CHECK-NEXT: Against group GRP2:
+; CHECK-NEXT: %gep0 = getelementptr i32, ptr %p0, i64 %iv
+; CHECK-NEXT: Check 2:
+; CHECK-NEXT: Comparing group GRP1:
+; CHECK-NEXT: %gep1 = getelementptr i32, ptr %phip, i64 %iv
+; CHECK-NEXT: Against group GRP2:
+; CHECK-NEXT: %gep0 = getelementptr i32, ptr %p0, i64 %iv
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group GRP0:
+; CHECK-NEXT: (Low: %p2 High: (4 + %p2))
+; CHECK-NEXT: Member: %p2
+; CHECK-NEXT: Group GRP1:
+; CHECK-NEXT: (Low: %phip High: ((4 * %n) + %phip))
+; CHECK-NEXT: Member: {%phip,+,4}<%loop>
+; CHECK-NEXT: Group GRP2:
+; CHECK-NEXT: (Low: %p0 High: ((4 * %n) + %p0))
+; CHECK-NEXT: Member: {%p0,+,4}<%loop>
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ %phip = select i1 %c, ptr %p1, ptr %p0
+ br label %loop
+
+loop:
+ %iv = phi i64 [ %iv.next, %latch ], [ 0, %entry ]
+ %gep0 = getelementptr i32, ptr %p0, i64 %iv
+ %x = load i32, ptr %gep0, align 4
+ %c2 = icmp eq i32 %x, 0
+ br i1 %c2, label %latch, label %if
+
+if:
+ store i32 %x, ptr %p2, align 4
+ br label %latch
+
+latch:
+ %phi = phi ptr [ %p2, %if ], [ %p2, %loop ]
+ %y = load i32, ptr %phi, align 4
+ %gep1 = getelementptr i32, ptr %phip, i64 %iv
+ store i32 %y, ptr %gep1, align 4
+ %iv.next = add nuw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, %n
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; Same invariant address via two distinct GEPs.
+; FIXME: Incorrectly considered safe with runtime checks.
+define void @store_load_same_invariant_via_different_geps(ptr %p0, ptr %p1, ptr %base, i64 %n, i1 %c) {
+; CHECK-LABEL: 'store_load_same_invariant_via_different_geps'
+; CHECK-NEXT: loop:
+; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group GRP0:
+; CHECK-NEXT: %gep.st = getelementptr i32, ptr %base, i64 1
+; CHECK-NEXT: Against group GRP1:
+; CHECK-NEXT: %gep1 = getelementptr i32, ptr %phip, i64 %iv
+; CHECK-NEXT: Check 1:
+; CHECK-NEXT: Comparing group GRP0:
+; CHECK-NEXT: %gep.st = getelementptr i32, ptr %base, i64 1
+; CHECK-NEXT: Against group GRP2:
+; CHECK-NEXT: %gep0 = getelementptr i32, ptr %p0, i64 %iv
+; CHECK-NEXT: Check 2:
+; CHECK-NEXT: Comparing group GRP0:
+; CHECK-NEXT: %gep.st = getelementptr i32, ptr %base, i64 1
+; CHECK-NEXT: Against group GRP3:
+; CHECK-NEXT: %gep.ld = getelementptr i32, ptr %base, i64 1
+; CHECK-NEXT: Check 3:
+; CHECK-NEXT: Comparing group GRP1:
+; CHECK-NEXT: %gep1 = getelementptr i32, ptr %phip, i64 %iv
+; CHECK-NEXT: Against group GRP2:
+; CHECK-NEXT: %gep0 = getelementptr i32, ptr %p0, i64 %iv
+; CHECK-NEXT: Check 4:
+; CHECK-NEXT: Comparing group GRP1:
+; CHECK-NEXT: %gep1 = getelementptr i32, ptr %phip, i64 %iv
+; CHECK-NEXT: Against group GRP3:
+; CHECK-NEXT: %gep.ld = getelementptr i32, ptr %base, i64 1
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group GRP0:
+; CHECK-NEXT: (Low: (4 + %base) High: (8 + %base))
+; CHECK-NEXT: Member: (4 + %base)
+; CHECK-NEXT: Group GRP1:
+; CHECK-NEXT: (Low: %phip High: ((4 * %n) + %phip))
+; CHECK-NEXT: Member: {%phip,+,4}<%loop>
+; CHECK-NEXT: Group GRP2:
+; CHECK-NEXT: (Low: %p0 High: ((4 * %n) + %p0))
+; CHECK-NEXT: Member: {%p0,+,4}<%loop>
+; CHECK-NEXT: Group GRP3:
+; CHECK-NEXT: (Low: (4 + %base) High: (8 + %base))
+; CHECK-NEXT: Member: (4 + %base)
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ %phip = select i1 %c, ptr %p1, ptr %p0
+ %gep.st = getelementptr i32, ptr %base, i64 1
+ %gep.ld = getelementptr i32, ptr %base, i64 1
+ br label %loop
+
+loop:
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ %gep0 = getelementptr i32, ptr %p0, i64 %iv
+ %x = load i32, ptr %gep0, align 4
+ store i32 %x, ptr %gep.st, align 4
+ %y = load i32, ptr %gep.ld, align 4
+ %gep1 = getelementptr i32, ptr %phip, i64 %iv
+ store i32 %y, ptr %gep1, align 4
+ %iv.next = add nuw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, %n
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; Phi with incoming values loaded from the same address.
+; FIXME: Incorrectly considered safe with runtime checks.
+define void @phi_with_loads_from_same_addr(ptr %p0, ptr %p1, ptr %x, i64 %n, i1 %c0) {
+; CHECK-LABEL: 'phi_with_loads_from_same_addr'
+; CHECK-NEXT: loop:
+; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group GRP0:
+; CHECK-NEXT: %ld1 = load ptr, ptr %x, align 8
+; CHECK-NEXT: Against group GRP1:
+; CHECK-NEXT: %gep1 = getelementptr i32, ptr %phip, i64 %iv
+; CHECK-NEXT: Check 1:
+; CHECK-NEXT: Comparing group GRP0:
+; CHECK-NEXT: %ld1 = load ptr, ptr %x, align 8
+; CHECK-NEXT: Against group GRP2:
+; CHECK-NEXT: %gep0 = getelementptr i32, ptr %p0, i64 %iv
+; CHECK-NEXT: Check 2:
+; CHECK-NEXT: Comparing group GRP0:
+; CHECK-NEXT: %ld1 = load ptr, ptr %x, align 8
+; CHECK-NEXT: Against group GRP3:
+; CHECK-NEXT: %ld2 = load ptr, ptr %x, align 8
+; CHECK-NEXT: Check 3:
+; CHECK-NEXT: Comparing group GRP1:
+; CHECK-NEXT: %gep1 = getelementptr i32, ptr %phip, i64 %iv
+; CHECK-NEXT: Against group GRP2:
+; CHECK-NEXT: %gep0 = getelementptr i32, ptr %p0, i64 %iv
+; CHECK-NEXT: Check 4:
+; CHECK-NEXT: Comparing group GRP1:
+; CHECK-NEXT: %gep1 = getelementptr i32, ptr %phip, i64 %iv
+; CHECK-NEXT: Against group GRP3:
+; CHECK-NEXT: %ld2 = load ptr, ptr %x, align 8
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group GRP0:
+; CHECK-NEXT: (Low: %ld1 High: (4 + %ld1))
+; CHECK-NEXT: Member: %ld1
+; CHECK-NEXT: Group GRP1:
+; CHECK-NEXT: (Low: %phip High: ((4 * %n) + %phip))
+; CHECK-NEXT: Member: {%phip,+,4}<%loop>
+; CHECK-NEXT: Group GRP2:
+; CHECK-NEXT: (Low: %p0 High: ((4 * %n) + %p0))
+; CHECK-NEXT: Member: {%p0,+,4}<%loop>
+; CHECK-NEXT: Group GRP3:
+; CHECK-NEXT: (Low: %ld2 High: (4 + %ld2))
+; CHECK-NEXT: Member: %ld2
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ %phip = select i1 %c0, ptr %p1, ptr %p0
+ %ld1 = load ptr, ptr %x
+ %ld2 = load ptr, ptr %x
+ br label %loop
+
+loop:
+ %iv = phi i64 [ %iv.next, %latch ], [ 0, %entry ]
+ %gep0 = getelementptr i32, ptr %p0, i64 %iv
+ %v = load i32, ptr %gep0, align 4
+ br i1 %c0, label %if, label %else
+
+if:
+ store i32 %v, ptr %ld1, align 4
+ br label %latch
+
+else:
+ br label %latch
+
+latch:
+ %phi = phi ptr [ %ld1, %if ], [ %ld2, %else ]
+ %y = load i32, ptr %phi, align 4
+ %gep1 = getelementptr i32, ptr %phip, i64 %iv
+ store i32 %y, ptr %gep1, align 4
+ %iv.next = add nuw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, %n
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; GEPs derived from different loads of the same address; runtime check can
+; disambiguate.
+define void @gep_from_loads_same_addr(ptr %p0, ptr %p1, ptr %x, i64 %n) {
+; CHECK-LABEL: 'gep_from_loads_same_addr'
+; CHECK-NEXT: loop:
+; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group GRP0:
+; CHECK-NEXT: %gep.st = getelementptr i32, ptr %ld1, i64 1
+; CHECK-NEXT: Against group GRP1:
+; CHECK-NEXT: %gep1 = getelementptr i32, ptr %p1, i64 %iv
+; CHECK-NEXT: Check 1:
+; CHECK-NEXT: Comparing group GRP0:
+; CHECK-NEXT: %gep.st = getelementptr i32, ptr %ld1, i64 1
+; CHECK-NEXT: Against group GRP2:
+; CHECK-NEXT: %gep0 = getelementptr i32, ptr %p0, i64 %iv
+; CHECK-NEXT: Check 2:
+; CHECK-NEXT: Comparing group GRP0:
+; CHECK-NEXT: %gep.st = getelementptr i32, ptr %ld1, i64 1
+; CHECK-NEXT: Against group GRP3:
+; CHECK-NEXT: %gep.ld = getelementptr i32, ptr %ld2, i64 1
+; CHECK-NEXT: Check 3:
+; CHECK-NEXT: Comparing group GRP1:
+; CHECK-NEXT: %gep1 = getelementptr i32, ptr %p1, i64 %iv
+; CHECK-NEXT: Against group GRP2:
+; CHECK-NEXT: %gep0 = getelementptr i32, ptr %p0, i64 %iv
+; CHECK-NEXT: Check 4:
+; CHECK-NEXT: Comparing group GRP1:
+; CHECK-NEXT: %gep1 = getelementptr i32, ptr %p1, i64 %iv
+; CHECK-NEXT: Against group GRP3:
+; CHECK-NEXT: %gep.ld = getelementptr i32, ptr %ld2, i64 1
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group GRP0:
+; CHECK-NEXT: (Low: (4 + %ld1) High: (8 + %ld1))
+; CHECK-NEXT: Member: (4 + %ld1)
+; CHECK-NEXT: Group GRP1:
+; CHECK-NEXT: (Low: %p1 High: ((4 * %n) + %p1))
+; CHECK-NEXT: Member: {%p1,+,4}<%loop>
+; CHECK-NEXT: Group GRP2:
+; CHECK-NEXT: (Low: %p0 High: ((4 * %n) + %p0))
+; CHECK-NEXT: Member: {%p0,+,4}<%loop>
+; CHECK-NEXT: Group GRP3:
+; CHECK-NEXT: (Low: (4 + %ld2) High: (8 + %ld2))
+; CHECK-NEXT: Member: (4 + %ld2)
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ %ld1 = load ptr, ptr %x
+ %ld2 = load ptr, ptr %x
+ %gep.st = getelementptr i32, ptr %ld1, i64 1
+ %gep.ld = getelementptr i32, ptr %ld2, i64 1
+ br label %loop
+
+loop:
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ %gep0 = getelementptr i32, ptr %p0, i64 %iv
+ %v = load i32, ptr %gep0, align 4
+ store i32 %v, ptr %gep.st, align 4
+ %y = load i32, ptr %gep.ld, align 4
+ %gep1 = getelementptr i32, ptr %p1, i64 %iv
+ store i32 %y, ptr %gep1, align 4
+ %iv.next = add nuw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, %n
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; One invariant, one strided: runtime check can disambiguate.
+define void @invariant_and_strided(ptr %p0, ptr %p1, i64 %n) {
+; CHECK-LABEL: 'invariant_and_strided'
+; CHECK-NEXT: loop:
+; CHECK-NEXT: Memory dependences are safe with run-time checks
+; CHECK-NEXT: Dependences:
+; CHECK-NEXT: Run-time memory checks:
+; CHECK-NEXT: Check 0:
+; CHECK-NEXT: Comparing group GRP0:
+; CHECK-NEXT: %gep = getelementptr i32, ptr %p0, i64 %iv
+; CHECK-NEXT: Against group GRP1:
+; CHECK-NEXT: ptr %p1
+; CHECK-NEXT: Grouped accesses:
+; CHECK-NEXT: Group GRP0:
+; CHECK-NEXT: (Low: %p0 High: ((4 * %n) + %p0))
+; CHECK-NEXT: Member: {%p0,+,4}<%loop>
+; CHECK-NEXT: Group GRP1:
+; CHECK-NEXT: (Low: %p1 High: (4 + %p1))
+; CHECK-NEXT: Member: %p1
+; CHECK-EMPTY:
+; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop.
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ]
+ %val = load i32, ptr %p1, align 4
+ %gep = getelementptr i32, ptr %p0, i64 %iv
+ store i32 %val, ptr %gep, align 4
+ %iv.next = add nuw i64 %iv, 1
+ %exitcond = icmp eq i64 %iv.next, %n
+ br i1 %exitcond, label %exit, label %loop
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Analysis/ScalarEvolution/two-loop-latches.ll b/llvm/test/Analysis/ScalarEvolution/two-loop-latches.ll
new file mode 100644
index 0000000000000..a73ec292ea697
--- /dev/null
+++ b/llvm/test/Analysis/ScalarEvolution/two-loop-latches.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -disable-output -passes='print<scalar-evolution>' < %s 2>&1 | FileCheck %s
+
+; Test where the outermost loop doesn't contain exactly one latch;
+; ScalarEvolution shouldn't crash in this case.
+
+define void @f(ptr %a0) {
+; CHECK-LABEL: 'f'
+; CHECK-NEXT: Classifying expressions for: @f
+; CHECK-NEXT: %phi1 = phi ptr [ null, %entry ], [ %inc, %b3 ], [ %inc, %b4 ]
+; CHECK-NEXT: --> {null,+,-32}<%b1> U: [0,-31) S: [-9223372036854775808,9223372036854775777) Exits: <<Unknown>> LoopDispositions: { %b1: Computable, %b3: Invariant }
+; CHECK-NEXT: %inc = getelementptr i8, ptr %phi1, i64 -32
+; CHECK-NEXT: --> {(-32 + null)<nuw><nsw>,+,-32}<%b1> U: [0,-31) S: [-9223372036854775808,9223372036854775777) Exits: <<Unknown>> LoopDispositions: { %b1: Computable, %b3: Invariant }
+; CHECK-NEXT: %phi2 = phi ptr [ %a0, %b2 ], [ %inc2, %b5 ]
+; CHECK-NEXT: --> {%a0,+,1}<%b3> U: full-set S: full-set Exits: ((-1 * (ptrtoint ptr %a0 to i64)) + %a0) LoopDispositions: { %b3: Computable, %b1: Variant }
+; CHECK-NEXT: %ld1 = load i8, ptr %phi2, align 1
+; CHECK-NEXT: --> %ld1 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %b3: Variant, %b1: Variant }
+; CHECK-NEXT: %ld2 = load i8, ptr null, align 1
+; CHECK-NEXT: --> %ld2 U: full-set S: full-set Exits: <<Unknown>> LoopDispositions: { %b3: Variant, %b1: Variant }
+; CHECK-NEXT: %inc2 = getelementptr i8, ptr %phi2, i64 1
+; CHECK-NEXT: --> {(1 + %a0),+,1}<%b3> U: full-set S: full-set Exits: (1 + (-1 * (ptrtoint ptr %a0 to i64)) + %a0) LoopDispositions: { %b3: Computable, %b1: Variant }
+; CHECK-NEXT: Determining loop execution counts for: @f
+; CHECK-NEXT: Loop %b3: <multiple exits> backedge-taken count is (-1 * (ptrtoint ptr %a0 to i64))
+; CHECK-NEXT: exit count for b3: (-1 * (ptrtoint ptr %a0 to i64))
+; CHECK-NEXT: exit count for b4: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %b3: constant max backedge-taken count is i64 -1
+; CHECK-NEXT: Loop %b3: symbolic max backedge-taken count is (-1 * (ptrtoint ptr %a0 to i64))
+; CHECK-NEXT: symbolic max exit count for b3: (-1 * (ptrtoint ptr %a0 to i64))
+; CHECK-NEXT: symbolic max exit count for b4: ***COULDNOTCOMPUTE***
+; CHECK-NEXT: Loop %b3: Trip multiple is 1
+; CHECK-NEXT: Loop %b1: Unpredictable backedge-taken count.
+; CHECK-NEXT: Loop %b1: Unpredictable constant max backedge-taken count.
+; CHECK-NEXT: Loop %b1: Unpredictable symbolic max backedge-taken count.
+;
+entry:
+ br label %b1
+b1:
+ %phi1 = phi ptr [ null, %entry ], [ %inc, %b3 ], [ %inc, %b4 ]
+ %cmp1 = icmp eq ptr %phi1, null
+ br i1 %cmp1, label %ret, label %b2
+b2:
+ %inc = getelementptr i8, ptr %phi1, i64 -32
+ br label %b3
+b3:
+ %phi2 = phi ptr [ %a0, %b2 ], [ %inc2, %b5 ]
+ %cmp2 = icmp eq ptr %phi2, null
+ br i1 %cmp2, label %b1, label %b4
+b4:
+ %ld1 = load i8, ptr %phi2, align 1
+ %ld2 = load i8, ptr null, align 1
+ %cmp3 = icmp slt i8 %ld1, %ld2
+ br i1 false, label %b1, label %b5
+b5:
+ %inc2 = getelementptr i8, ptr %phi2, i64 1
+ br label %b3
+ret:
+ ret void
+}
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
index f2773c4f1d16f..46cb8cc1312dc 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll
@@ -606,6 +606,34 @@ bb:
ret void
}
+; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %val, i32 16)
+define amdgpu_gs void @ds_add_gs_reg_rtn_i32(i32 %val, ptr addrspace(1) %out) {
+ %tmp0 = call i32 @llvm.amdgcn.ds.add.gs.reg.rtn.i32(i32 %val, i32 16)
+ store i32 %tmp0, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %val, i32 32)
+define amdgpu_gs void @ds_add_gs_reg_rtn_i64(i32 %val, ptr addrspace(1) %out) {
+ %tmp0 = call i64 @llvm.amdgcn.ds.add.gs.reg.rtn.i64(i32 %val, i32 32)
+ store i64 %tmp0, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %val, i32 16)
+define amdgpu_gs void @ds_sub_gs_reg_rtn_i32(i32 %val, ptr addrspace(1) %out) {
+ %tmp0 = call i32 @llvm.amdgcn.ds.sub.gs.reg.rtn.i32(i32 %val, i32 16)
+ store i32 %tmp0, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %val, i32 32)
+define amdgpu_gs void @ds_sub_gs_reg_rtn_i64(i32 %val, ptr addrspace(1) %out) {
+ %tmp0 = call i64 @llvm.amdgcn.ds.sub.gs.reg.rtn.i64(i32 %val, i32 32)
+ store i64 %tmp0, ptr addrspace(1) %out, align 8
+ ret void
+}
+
; CHECK: DIVERGENT: %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
define amdgpu_kernel void @mfma_f32_16x16x32_f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, ptr addrspace(1) %out) {
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 immarg 0, i32 immarg 0, i32 immarg 0)
diff --git a/llvm/test/Assembler/2008-02-18-IntPointerCrash.ll b/llvm/test/Assembler/2008-02-18-IntPointerCrash.ll
index 0d8fbc08917a0..82818adfec309 100644
--- a/llvm/test/Assembler/2008-02-18-IntPointerCrash.ll
+++ b/llvm/test/Assembler/2008-02-18-IntPointerCrash.ll
@@ -1,7 +1,7 @@
; RUN: not llvm-as %s 2>&1 | FileCheck %s
; PR2060
-; CHECK: integer constant must have integer type
+; CHECK: integer/byte constant must have integer/byte type
define ptr @foo() {
ret ptr 0
diff --git a/llvm/test/Assembler/byte-invalid.ll b/llvm/test/Assembler/byte-invalid.ll
new file mode 100644
index 0000000000000..3cbecf25abacb
--- /dev/null
+++ b/llvm/test/Assembler/byte-invalid.ll
@@ -0,0 +1,73 @@
+; RUN: split-file %s %t
+; RUN: not llvm-as -disable-output %t/zext-byte-to-int.ll 2>&1 | FileCheck %t/zext-byte-to-int.ll
+; RUN: not llvm-as -disable-output %t/sext-byte-to-int.ll 2>&1 | FileCheck %t/sext-byte-to-int.ll
+; RUN: not llvm-as -disable-output %t/trunc-byte-to-byte.ll 2>&1 | FileCheck %t/trunc-byte-to-byte.ll
+; RUN: not llvm-as -disable-output %t/zext-int-to-byte.ll 2>&1 | FileCheck %t/zext-int-to-byte.ll
+; RUN: not llvm-as -disable-output %t/sext-int-to-byte.ll 2>&1 | FileCheck %t/sext-int-to-byte.ll
+; RUN: not llvm-as -disable-output %t/trunc-int-to-byte.ll 2>&1 | FileCheck %t/trunc-int-to-byte.ll
+; RUN: not llvm-as -disable-output %t/trunc-byte-to-int.ll 2>&1 | FileCheck %t/trunc-byte-to-int.ll
+; RUN: not llvm-as -disable-output %t/lshr-byte.ll 2>&1 | FileCheck %t/lshr-byte.ll
+; RUN: not llvm-as -disable-output %t/icmp-byte.ll 2>&1 | FileCheck %t/icmp-byte.ll
+
+;--- zext-byte-to-int.ll
+; CHECK: invalid cast opcode for cast from 'b8' to 'i32'
+define void @test(b8 %b) {
+ %t = zext b8 %b to i32
+ ret void
+}
+
+;--- sext-byte-to-int.ll
+; CHECK: invalid cast opcode for cast from 'b8' to 'i32'
+define void @test(b8 %b) {
+ %t = sext b8 %b to i32
+ ret void
+}
+
+;--- trunc-byte-to-byte.ll
+; CHECK: invalid cast opcode for cast from 'b32' to 'b8'
+define void @test(b32 %b) {
+ %t = trunc b32 %b to b8
+ ret void
+}
+
+;--- zext-int-to-byte.ll
+; CHECK: invalid cast opcode for cast from 'i8' to 'b32'
+define void @test(i8 %v) {
+ %t = zext i8 %v to b32
+ ret void
+}
+
+;--- sext-int-to-byte.ll
+; CHECK: invalid cast opcode for cast from 'i8' to 'b32'
+define void @test(i8 %v) {
+ %t = sext i8 %v to b32
+ ret void
+}
+
+;--- trunc-int-to-byte.ll
+; CHECK: invalid cast opcode for cast from 'i32' to 'b8'
+define void @test(i32 %v) {
+ %t = trunc i32 %v to b8
+ ret void
+}
+
+;--- trunc-byte-to-int.ll
+; CHECK: invalid cast opcode for cast from 'b32' to 'i8'
+define void @test(b32 %b) {
+ %t = trunc b32 %b to i8
+ ret void
+}
+
+;--- lshr-byte.ll
+; CHECK: invalid operand type for instruction
+define void @test(b32 %b) {
+ %t = lshr b32 %b, 8
+ ret void
+}
+
+;--- icmp-byte.ll
+; CHECK: icmp requires integer operands
+define void @test(b8 %b1, b8 %b2) {
+ %cmp = icmp eq b8 %b1, %b2
+ ret void
+}
diff --git a/llvm/test/Assembler/byte.ll b/llvm/test/Assembler/byte.ll
new file mode 100644
index 0000000000000..1a02d0fab5df3
--- /dev/null
+++ b/llvm/test/Assembler/byte.ll
@@ -0,0 +1,101 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+; CHECK: common global [32 x b8] zeroinitializer
+; CHECK: constant [1 x b8] zeroinitializer
+; CHECK: constant [15 x b8] c"Hello, World!\0A\00"
+; CHECK: constant [15 x b8] c"Hello, World!\0A\00"
+ at a = common global [32 x b8] zeroinitializer, align 1
+ at b = constant [1 x b8] zeroinitializer
+ at c = constant [15 x b8] [b8 72, b8 101, b8 108, b8 108, b8 111, b8 44, b8 32, b8 87, b8 111, b8 114, b8 108, b8 100, b8 33, b8 10, b8 0]
+ at d = constant [15 x b8] c"Hello, World!\0A\00"
+
+define void @bytes(b1 %a, b3 %b, b5 %c, b8 %d, b16 %e, b32 %f, b64 %g, b128 %h, <8 x b5> %i, <2 x b64> %j) {
+; CHECK-LABEL: define void @bytes(
+; CHECK-SAME: b1 [[A:%.*]], b3 [[B:%.*]], b5 [[C:%.*]], b8 [[D:%.*]], b16 [[E:%.*]], b32 [[F:%.*]], b64 [[G:%.*]], b128 [[H:%.*]], <8 x b5> [[I:%.*]], <2 x b64> [[J:%.*]]) {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define void @byte_alloca() {
+; CHECK-LABEL: define void @byte_alloca() {
+; CHECK-NEXT: [[B1:%.*]] = alloca b8, align 1
+; CHECK-NEXT: [[B8:%.*]] = alloca b64, align 8
+; CHECK-NEXT: [[V:%.*]] = alloca <4 x b64>, align 32
+; CHECK-NEXT: [[A:%.*]] = alloca [4 x b64], align 8
+; CHECK-NEXT: ret void
+;
+ %b1 = alloca b8
+ %b8 = alloca b64
+ %v = alloca <4 x b64>
+ %a = alloca [4 x b64]
+ ret void
+}
+
+define void @byte_load_store(ptr %ptr) {
+; CHECK-LABEL: define void @byte_load_store(
+; CHECK-SAME: ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[B:%.*]] = load b8, ptr [[PTR]], align 1
+; CHECK-NEXT: store b8 [[B]], ptr [[PTR]], align 1
+; CHECK-NEXT: store b8 0, ptr [[PTR]], align 1
+; CHECK-NEXT: [[V:%.*]] = load <4 x b64>, ptr [[PTR]], align 32
+; CHECK-NEXT: store <4 x b64> [[V]], ptr [[PTR]], align 32
+; CHECK-NEXT: store <4 x b64> <b64 0, b64 1, b64 2, b64 3>, ptr [[PTR]], align 32
+; CHECK-NEXT: [[A:%.*]] = load [4 x b8], ptr [[PTR]], align 1
+; CHECK-NEXT: store [4 x b8] [[A]], ptr [[PTR]], align 1
+; CHECK-NEXT: store [4 x b8] c"\00\01\02\03", ptr [[PTR]], align 1
+; CHECK-NEXT: ret void
+;
+ %b = load b8, ptr %ptr
+ store b8 %b, ptr %ptr
+ store b8 0, ptr %ptr
+ %v = load <4 x b64>, ptr %ptr
+ store <4 x b64> %v, ptr %ptr
+ store <4 x b64> <b64 0, b64 1, b64 2, b64 3>, ptr %ptr
+ %a = load [4 x b8], ptr %ptr
+ store [4 x b8] %a, ptr %ptr
+ store [4 x b8] [b8 0, b8 1, b8 2, b8 3], ptr %ptr
+ ret void
+}
+
+define void @bitcasts(i64 %i, b64 %b, ptr %p) {
+; CHECK-LABEL: define void @bitcasts(
+; CHECK-SAME: i64 [[I:%.*]], b64 [[B:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast ptr [[P]] to b64
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[I]] to b64
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast b64 [[B]] to <8 x b8>
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast b64 [[B]] to i64
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast b64 [[B]] to ptr
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x b8> [[TMP3]] to <2 x b32>
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <2 x b32> [[TMP6]] to b64
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast <2 x b32> splat (b32 1) to b64
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast <8 x b8> [[TMP3]] to <4 x i16>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x b32> [[TMP6]] to ptr
+; CHECK-NEXT: ret void
+;
+ %1 = bitcast ptr %p to b64
+ %2 = bitcast i64 %i to b64
+ %3 = bitcast b64 %b to <8 x b8>
+ %4 = bitcast b64 %b to i64
+ %5 = bitcast b64 %b to ptr
+ %6 = bitcast <8 x b8> %3 to <2 x b32>
+ %7 = bitcast <2 x b32> %6 to b64
+ %8 = bitcast <2 x b32> <b32 1, b32 1> to b64
+ %9 = bitcast <8 x b8> %3 to <4 x i16>
+ %10 = bitcast <2 x b32> %6 to ptr
+ ret void
+}
+
+define void @freeze(b3 %t, b64 %b, <4 x b64> %v) {
+; CHECK-LABEL: define void @freeze(
+; CHECK-SAME: b3 [[T:%.*]], b64 [[B:%.*]], <4 x b64> [[V:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = freeze b3 [[T]]
+; CHECK-NEXT: [[TMP2:%.*]] = freeze b64 [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = freeze <4 x b64> [[V]]
+; CHECK-NEXT: ret void
+;
+ %1 = freeze b3 %t
+ %2 = freeze b64 %b
+ %3 = freeze <4 x b64> %v
+ ret void
+}
diff --git a/llvm/test/Assembler/invalid-inttype.ll b/llvm/test/Assembler/invalid-inttype.ll
index 9e3c31148af2d..ef050d599875e 100644
--- a/llvm/test/Assembler/invalid-inttype.ll
+++ b/llvm/test/Assembler/invalid-inttype.ll
@@ -1,5 +1,5 @@
; RUN: not llvm-as --disable-output %s 2>&1 | FileCheck -DFILE=%s %s
; i8388609 is the smallest integer type that can't be represented in LLVM IR
-; CHECK: [[FILE]]:[[@LINE+1]]:21: error: bitwidth for integer type out of range
+; CHECK: [[FILE]]:[[@LINE+1]]:21: error: bitwidth for integer or byte type out of range
@i2 = common global i8388609 0, align 4
diff --git a/llvm/test/Bindings/llvm-c/byte.ll b/llvm/test/Bindings/llvm-c/byte.ll
new file mode 100644
index 0000000000000..93b9819c3c9b9
--- /dev/null
+++ b/llvm/test/Bindings/llvm-c/byte.ll
@@ -0,0 +1,11 @@
+; RUN: llvm-as < %s | llvm-dis > %t.orig
+; RUN: llvm-as < %s | llvm-c-test --echo > %t.echo
+; RUN: diff -w %t.orig %t.echo
+
+define void @foo(b8 %a, b16 %b, b64 %c) {
+ %1 = alloca b16, align 2
+ store b16 %b, ptr %1, align 2
+ %2 = load b16, ptr %1, align 2
+ %3 = bitcast b16 %2 to <2 x b8>
+ ret void
+}
diff --git a/llvm/test/Bindings/llvm-c/echo.ll b/llvm/test/Bindings/llvm-c/echo.ll
index ab1771d1f879f..0202990018045 100644
--- a/llvm/test/Bindings/llvm-c/echo.ll
+++ b/llvm/test/Bindings/llvm-c/echo.ll
@@ -72,6 +72,7 @@ define void @types() {
%10 = alloca ptr addrspace(5), align 8
%11 = alloca <5 x ptr>, align 64
%12 = alloca <1 x i64>, align 8
+ %13 = alloca b8, align 1
ret void
}
diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll
index 6cce477e2af04..06e7fc0096a53 100644
--- a/llvm/test/Bitcode/compatibility.ll
+++ b/llvm/test/Bitcode/compatibility.ll
@@ -1288,6 +1288,8 @@ define void @typesystem() {
; CHECK: %t9 = alloca <4 x i32>
%t10 = alloca <vscale x 4 x i32>
; CHECK: %t10 = alloca <vscale x 4 x i32>
+ %t11 = alloca b8
+ ; CHECK: %t11 = alloca b8
ret void
}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-1x.ll b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-1x.ll
index b52957767de4d..cfd517e57c56d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-1x.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-1x.ll
@@ -1,34 +1,71 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple aarch64 -O0 -global-isel -o - %s | FileCheck %s
+; RUN: llc -mtriple aarch64 -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc -mtriple aarch64 -global-isel -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+; RUN: llc -mtriple aarch64 -O0 -global-isel -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-GIO0
define <1 x i1> @shuffle_extract_4(<8 x i1> %a, <8 x i1> %b) {
-; CHECK-LABEL: shuffle_extract_4:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v0.8h, v0.8b, #0
-; CHECK-NEXT: umov w8, v0.h[4]
-; CHECK-NEXT: and w0, w8, #0x1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shuffle_extract_4:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: umov w0, v0.b[4]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shuffle_extract_4:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GI-NEXT: umov w8, v0.h[4]
+; CHECK-GI-NEXT: and w0, w8, #0x1
+; CHECK-GI-NEXT: ret
+;
+; CHECK-GIO0-LABEL: shuffle_extract_4:
+; CHECK-GIO0: // %bb.0:
+; CHECK-GIO0-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-GIO0-NEXT: umov w8, v0.h[4]
+; CHECK-GIO0-NEXT: and w0, w8, #0x1
+; CHECK-GIO0-NEXT: ret
%extractvec60 = shufflevector <8 x i1> %a, <8 x i1> %b, <1 x i32> <i32 4>
ret <1 x i1> %extractvec60
}
define <1 x i1> @shuffle_extract_12(<8 x i1> %a, <8 x i1> %b) {
-; CHECK-LABEL: shuffle_extract_12:
-; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v0.8h, v1.8b, #0
-; CHECK-NEXT: umov w8, v0.h[4]
-; CHECK-NEXT: and w0, w8, #0x1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shuffle_extract_12:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT: umov w0, v1.b[4]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shuffle_extract_12:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: ushll v0.8h, v1.8b, #0
+; CHECK-GI-NEXT: umov w8, v0.h[4]
+; CHECK-GI-NEXT: and w0, w8, #0x1
+; CHECK-GI-NEXT: ret
+;
+; CHECK-GIO0-LABEL: shuffle_extract_12:
+; CHECK-GIO0: // %bb.0:
+; CHECK-GIO0-NEXT: ushll v0.8h, v1.8b, #0
+; CHECK-GIO0-NEXT: umov w8, v0.h[4]
+; CHECK-GIO0-NEXT: and w0, w8, #0x1
+; CHECK-GIO0-NEXT: ret
%extractvec60 = shufflevector <8 x i1> %a, <8 x i1> %b, <1 x i32> <i32 12>
ret <1 x i1> %extractvec60
}
define <1 x i1> @shuffle_extract_p(<8 x i1> %a, <8 x i1> %b) {
-; CHECK-LABEL: shuffle_extract_p:
-; CHECK: // %bb.0:
-; CHECK-NEXT: // implicit-def: $w8
-; CHECK-NEXT: and w0, w8, #0x1
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: shuffle_extract_p:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: shuffle_extract_p:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w0, wzr
+; CHECK-GI-NEXT: ret
+;
+; CHECK-GIO0-LABEL: shuffle_extract_p:
+; CHECK-GIO0: // %bb.0:
+; CHECK-GIO0-NEXT: // implicit-def: $w8
+; CHECK-GIO0-NEXT: and w0, w8, #0x1
+; CHECK-GIO0-NEXT: ret
%extractvec60 = shufflevector <8 x i1> %a, <8 x i1> %b, <1 x i32> <i32 poison>
ret <1 x i1> %extractvec60
}
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index 620041253ecfc..aa9b42f47c24c 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -98,6 +98,9 @@
; CHECK-NEXT: Interleaved Access Pass
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Type Promotion
+; CHECK-NEXT: Post-Dominator Tree Construction
+; CHECK-NEXT: Branch Probability Analysis
+; CHECK-NEXT: Block Frequency Analysis
; CHECK-NEXT: CodeGen Prepare
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Exception handling preparation
diff --git a/llvm/test/CodeGen/AArch64/ccmp-look-through-copy.mir b/llvm/test/CodeGen/AArch64/ccmp-look-through-copy.mir
new file mode 100644
index 0000000000000..8c0017ae1343c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ccmp-look-through-copy.mir
@@ -0,0 +1,45 @@
+# RUN: llc -o - %s -mtriple=aarch64 -run-pass=aarch64-ccmp -aarch64-stress-ccmp | FileCheck %s
+---
+name: ccmp-look-through-copy
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: gpr64 }
+ - { id: 1, class: gpr64 }
+ - { id: 4, class: gpr64 }
+ - { id: 6, class: gpr64 }
+ - { id: 7, class: gpr64 }
+ - { id: 8, class: gpr64 }
+ - { id: 9, class: gpr64 }
+body: |
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $x0, $x1
+
+ ; CHECK-LABEL: name: ccmp-look-through-copy
+ ; CHECK: bb.0:
+ ; CHECK: CCMPXr
+
+ %0:gpr64 = COPY $x0
+ %1:gpr64 = COPY $x1
+ %4:gpr64 = COPY %0
+ %7:gpr64 = COPY %4
+ %6:gpr64 = SUBSXrr %0, %1, implicit-def $nzcv
+ Bcc 11, %bb.2, implicit $nzcv
+ B %bb.1
+
+ bb.1:
+ successors: %bb.2(0x40000000), %bb.3(0x40000000)
+
+ %8:gpr64 = SUBSXrr %1, %0, implicit-def $nzcv
+ Bcc 12, %bb.2, implicit $nzcv
+ B %bb.3
+
+ bb.2:
+ %9:gpr64 = PHI %4, %bb.0, %7, %bb.1
+ $x0 = COPY %9
+ RET_ReallyLR implicit $x0
+
+ bb.3:
+ $x0 = COPY %0
+ RET_ReallyLR implicit $x0
+...
diff --git a/llvm/test/CodeGen/AArch64/imm-splat-ops.ll b/llvm/test/CodeGen/AArch64/imm-splat-ops.ll
index 90aeecd82a0e2..fae35987a8f7d 100644
--- a/llvm/test/CodeGen/AArch64/imm-splat-ops.ll
+++ b/llvm/test/CodeGen/AArch64/imm-splat-ops.ll
@@ -235,10 +235,8 @@ define <2 x i64> @mul_v2i64(<2 x i64> %a) {
;
; CHECK-SVE-LABEL: mul_v2i64:
; CHECK-SVE: // %bb.0: // %entry
-; CHECK-SVE-NEXT: mov z1.d, #123 // =0x7b
-; CHECK-SVE-NEXT: ptrue p0.d, vl2
; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-SVE-NEXT: mul z0.d, p0/m, z0.d, z1.d
+; CHECK-SVE-NEXT: mul z0.d, z0.d, #123
; CHECK-SVE-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-SVE-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/sve-asrd.ll b/llvm/test/CodeGen/AArch64/sve-asrd.ll
index 66db1a5dc1dbf..e7079a7c17566 100644
--- a/llvm/test/CodeGen/AArch64/sve-asrd.ll
+++ b/llvm/test/CodeGen/AArch64/sve-asrd.ll
@@ -7,23 +7,18 @@ target triple = "aarch64-unknown-linux-gnu"
define <16 x i16> @sdiv_by_one_v16i16(<16 x i16> %a) vscale_range(2,2) {
; CHECK-LABEL: sdiv_by_one_v16i16:
; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p0.h
-; CHECK-NEXT: adrp x8, .LCPI0_0
-; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_0
+; CHECK-NEXT: mov z2.h, #1 // =0x1
; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
-; CHECK-NEXT: ld1h { z2.h }, p0/z, [x8]
-; CHECK-NEXT: sunpklo z0.s, z0.h
-; CHECK-NEXT: sunpklo z1.s, z1.h
; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: sunpklo z3.s, z2.h
-; CHECK-NEXT: ext z2.b, z2.b, z2.b, #16
+; CHECK-NEXT: sunpklo z1.s, z1.h
+; CHECK-NEXT: sunpklo z0.s, z0.h
; CHECK-NEXT: sunpklo z2.s, z2.h
-; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z3.s
; CHECK-NEXT: sdiv z1.s, p0/m, z1.s, z2.s
+; CHECK-NEXT: sdiv z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT: ptrue p0.h, vl8
-; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: uzp1 z1.h, z1.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT: splice z0.h, p0, z0.h, z1.h
; CHECK-NEXT: movprfx z1, z0
; CHECK-NEXT: ext z1.b, z1.b, z0.b, #16
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-no-vscale-range.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-no-vscale-range.ll
index 6e8d477fc3ad5..915db8d8085ef 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-no-vscale-range.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-no-vscale-range.ll
@@ -16,6 +16,17 @@ define <2 x i64> @mul_v2i64(<2 x i64> %op1, <2 x i64> %op2) #0 {
ret <2 x i64> %res
}
+define <2 x i64> @mul_imm_v2i64(<2 x i64> %op1) #0 {
+; CHECK-LABEL: mul_imm_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: mul z0.d, z0.d, #123
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
+; CHECK-NEXT: ret
+ %res = mul <2 x i64> %op1, splat (i64 123)
+ ret <2 x i64> %res
+}
+
define <4 x i32> @sdiv_v4i32(<4 x i32> %op1, <4 x i32> %op2) #0 {
; CHECK-LABEL: sdiv_v4i32:
; CHECK: // %bb.0:
diff --git a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
index 8c66d237e686d..3f97b74db5afd 100644
--- a/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
+++ b/llvm/test/CodeGen/AArch64/sve-partial-reduce-dot-product.ll
@@ -1303,8 +1303,8 @@ entry:
define <2 x i32> @udot_v16i8tov2i32(<2 x i32> %acc, <16 x i8> %input) "target-features"="+dotprod" {
; CHECK-SVE2-LABEL: udot_v16i8tov2i32:
; CHECK-SVE2: // %bb.0: // %entry
-; CHECK-SVE2-NEXT: movi v2.16b, #1
; CHECK-SVE2-NEXT: fmov d0, d0
+; CHECK-SVE2-NEXT: mov z2.b, #1 // =0x1
; CHECK-SVE2-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-SVE2-NEXT: udot z0.s, z1.b, z2.b
; CHECK-SVE2-NEXT: addp v0.4s, v0.4s, v0.4s
@@ -1313,8 +1313,8 @@ define <2 x i32> @udot_v16i8tov2i32(<2 x i32> %acc, <16 x i8> %input) "target-fe
;
; CHECK-SVE2-I8MM-LABEL: udot_v16i8tov2i32:
; CHECK-SVE2-I8MM: // %bb.0: // %entry
-; CHECK-SVE2-I8MM-NEXT: movi v2.16b, #1
; CHECK-SVE2-I8MM-NEXT: fmov d0, d0
+; CHECK-SVE2-I8MM-NEXT: mov z2.b, #1 // =0x1
; CHECK-SVE2-I8MM-NEXT: // kill: def $q1 killed $q1 def $z1
; CHECK-SVE2-I8MM-NEXT: udot z0.s, z1.b, z2.b
; CHECK-SVE2-I8MM-NEXT: addp v0.4s, v0.4s, v0.4s
diff --git a/llvm/test/CodeGen/AArch64/sve2p3-dots-partial-reduction.ll b/llvm/test/CodeGen/AArch64/sve2p3-dots-partial-reduction.ll
new file mode 100644
index 0000000000000..02fad3b62d46b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p3-dots-partial-reduction.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p3 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2p3 -force-streaming < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s --check-prefix=CHECK-NOSVE2P3
+
+define <vscale x 8 x i16> @udot_btoh_scalable(<vscale x 8 x i16> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: udot_btoh_scalable:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: udot z0.h, z1.b, z2.b
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2P3-LABEL: udot_btoh_scalable:
+; CHECK-NOSVE2P3: // %bb.0: // %entry
+; CHECK-NOSVE2P3-NEXT: uunpklo z3.h, z1.b
+; CHECK-NOSVE2P3-NEXT: uunpklo z4.h, z2.b
+; CHECK-NOSVE2P3-NEXT: ptrue p0.h
+; CHECK-NOSVE2P3-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NOSVE2P3-NEXT: uunpkhi z2.h, z2.b
+; CHECK-NOSVE2P3-NEXT: mla z0.h, p0/m, z3.h, z4.h
+; CHECK-NOSVE2P3-NEXT: mla z0.h, p0/m, z1.h, z2.h
+; CHECK-NOSVE2P3-NEXT: ret
+entry:
+ %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i16>
+ %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i16>
+ %mult = mul nuw nsw <vscale x 16 x i16> %a.wide, %b.wide
+ %partial.reduce = tail call <vscale x 8 x i16> @llvm.vector.partial.reduce.add(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %mult)
+ ret <vscale x 8 x i16> %partial.reduce
+}
+
+define <vscale x 8 x i16> @sdot_btoh_scalable(<vscale x 8 x i16> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: sdot_btoh_scalable:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sdot z0.h, z1.b, z2.b
+; CHECK-NEXT: ret
+;
+; CHECK-NOSVE2P3-LABEL: sdot_btoh_scalable:
+; CHECK-NOSVE2P3: // %bb.0: // %entry
+; CHECK-NOSVE2P3-NEXT: sunpklo z3.h, z1.b
+; CHECK-NOSVE2P3-NEXT: sunpklo z4.h, z2.b
+; CHECK-NOSVE2P3-NEXT: ptrue p0.h
+; CHECK-NOSVE2P3-NEXT: sunpkhi z1.h, z1.b
+; CHECK-NOSVE2P3-NEXT: sunpkhi z2.h, z2.b
+; CHECK-NOSVE2P3-NEXT: mla z0.h, p0/m, z3.h, z4.h
+; CHECK-NOSVE2P3-NEXT: mla z0.h, p0/m, z1.h, z2.h
+; CHECK-NOSVE2P3-NEXT: ret
+entry:
+ %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i16>
+ %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i16>
+ %mult = mul nuw nsw <vscale x 16 x i16> %a.wide, %b.wide
+ %partial.reduce = tail call <vscale x 8 x i16> @llvm.vector.partial.reduce.add(<vscale x 8 x i16> %acc, <vscale x 16 x i16> %mult)
+ ret <vscale x 8 x i16> %partial.reduce
+}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
index 12c13e8337e8d..7ef38401bf76e 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll
@@ -4778,7 +4778,7 @@ entry:
define i64 @extract_scalable(<2 x i32> %0) "target-features"="+sve2" {
; CHECK-SD-LABEL: extract_scalable:
; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: movi v1.2s, #1
+; CHECK-SD-NEXT: mov z1.s, #1 // =0x1
; CHECK-SD-NEXT: ptrue p0.s, vl2
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-SD-NEXT: sdivr z0.s, p0/m, z0.s, z1.s
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw-fmin-fmax.ll
new file mode 100644
index 0000000000000..1248f4a85d927
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw-fmin-fmax.ll
@@ -0,0 +1,596 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -stop-after=instruction-select < %s | FileCheck %s --check-prefixes=GFX10
+
+; --- atomicrmw fmax: global f32/f64, flat f32/f64, local f32/f64 ---
+
+define void @atomicrmw_fmax_global_f32_vv_noret(ptr addrspace(1) %ptr, float %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmax_global_f32_vv_noret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: GLOBAL_ATOMIC_FMAX [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store seq_cst (s32) on %ir.ptr, addrspace 1)
+ ; GFX10-NEXT: SI_RETURN
+ %old = atomicrmw fmax ptr addrspace(1) %ptr, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define float @atomicrmw_fmax_global_f32_vv_ret(ptr addrspace(1) %ptr, float %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmax_global_f32_vv_ret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[GLOBAL_ATOMIC_FMAX_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_FMAX_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store seq_cst (s32) on %ir.ptr, addrspace 1)
+ ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_FMAX_RTN]]
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr0
+ %old = atomicrmw fmax ptr addrspace(1) %ptr, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret float %old
+}
+
+define void @atomicrmw_fmax_global_f64_vv_noret(ptr addrspace(1) %ptr, double %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmax_global_f64_vv_noret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX10-NEXT: GLOBAL_ATOMIC_MAX_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store seq_cst (s64) on %ir.ptr, addrspace 1)
+ ; GFX10-NEXT: SI_RETURN
+ %old = atomicrmw fmax ptr addrspace(1) %ptr, double %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define double @atomicrmw_fmax_global_f64_vv_ret(ptr addrspace(1) %ptr, double %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmax_global_f64_vv_ret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX10-NEXT: [[GLOBAL_ATOMIC_MAX_F64_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_MAX_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store seq_cst (s64) on %ir.ptr, addrspace 1)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_MAX_F64_RTN]].sub0
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_MAX_F64_RTN]].sub1
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY4]]
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY5]]
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %old = atomicrmw fmax ptr addrspace(1) %ptr, double %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret double %old
+}
+
+define void @atomicrmw_fmax_flat_f32_vv_noret(ptr %ptr, float %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmax_flat_f32_vv_noret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: FLAT_ATOMIC_FMAX [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32) on %ir.ptr)
+ ; GFX10-NEXT: SI_RETURN
+ %old = atomicrmw fmax ptr %ptr, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define float @atomicrmw_fmax_flat_f32_vv_ret(ptr %ptr, float %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmax_flat_f32_vv_ret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[FLAT_ATOMIC_FMAX_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_FMAX_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32) on %ir.ptr)
+ ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_FMAX_RTN]]
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr0
+ %old = atomicrmw fmax ptr %ptr, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret float %old
+}
+
+define void @atomicrmw_fmax_flat_f64_vv_noret(ptr %ptr, double %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmax_flat_f64_vv_noret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $src_private_base
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX10-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY1]], [[COPY7]], implicit $exec
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[V_CMP_NE_U32_e64_]]
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[COPY8]], %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: S_BRANCH %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2.Flow:
+ ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec = SI_ELSE [[SI_IF]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: S_BRANCH %bb.3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3.atomicrmw.private:
+ ; GFX10-NEXT: successors: %bb.5(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
+ ; GFX10-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], [[COPY9]], implicit $exec
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX10-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY10]], 0, [[COPY]], [[V_CMP_NE_U64_e64_]], implicit $exec
+ ; GFX10-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_CNDMASK_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s32) from %ir.7, align 8, addrspace 5)
+ ; GFX10-NEXT: [[BUFFER_LOAD_DWORD_OFFEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_CNDMASK_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (load (s32) from %ir.7 + 4, basealign 8, addrspace 5)
+ ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BUFFER_LOAD_DWORD_OFFEN]], %subreg.sub0, [[BUFFER_LOAD_DWORD_OFFEN1]], %subreg.sub1
+ ; GFX10-NEXT: [[V_MAX_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[REG_SEQUENCE2]], 0, [[REG_SEQUENCE2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX10-NEXT: [[V_MAX_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[REG_SEQUENCE1]], 0, [[REG_SEQUENCE1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX10-NEXT: [[V_MAX_F64_e64_2:%[0-9]+]]:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[V_MAX_F64_e64_]], 0, [[V_MAX_F64_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_MAX_F64_e64_2]].sub0
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_MAX_F64_e64_2]].sub1
+ ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY11]], [[V_CNDMASK_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.7, align 8, addrspace 5)
+ ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY12]], [[V_CNDMASK_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into %ir.7 + 4, basealign 8, addrspace 5)
+ ; GFX10-NEXT: S_BRANCH %bb.5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4.atomicrmw.global:
+ ; GFX10-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: FLAT_ATOMIC_MAX_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64) on %ir.ptr, !noalias.addrspace !1)
+ ; GFX10-NEXT: S_BRANCH %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.5.atomicrmw.phi:
+ ; GFX10-NEXT: SI_END_CF [[SI_ELSE]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: SI_RETURN
+ %old = atomicrmw fmax ptr %ptr, double %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define double @atomicrmw_fmax_flat_f64_vv_ret(ptr %ptr, double %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmax_flat_f64_vv_ret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $src_private_base
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX10-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY1]], [[COPY7]], implicit $exec
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[V_CMP_NE_U32_e64_]]
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[COPY8]], %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: S_BRANCH %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2.Flow:
+ ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:vreg_64 = PHI %19, %bb.4, [[DEF]], %bb.1
+ ; GFX10-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec = SI_ELSE [[SI_IF]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: S_BRANCH %bb.3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3.atomicrmw.private:
+ ; GFX10-NEXT: successors: %bb.5(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
+ ; GFX10-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], [[COPY9]], implicit $exec
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX10-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY10]], 0, [[COPY]], [[V_CMP_NE_U64_e64_]], implicit $exec
+ ; GFX10-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_CNDMASK_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s32) from %ir.8, align 8, addrspace 5)
+ ; GFX10-NEXT: [[BUFFER_LOAD_DWORD_OFFEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_CNDMASK_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (load (s32) from %ir.8 + 4, basealign 8, addrspace 5)
+ ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BUFFER_LOAD_DWORD_OFFEN]], %subreg.sub0, [[BUFFER_LOAD_DWORD_OFFEN1]], %subreg.sub1
+ ; GFX10-NEXT: [[V_MAX_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[REG_SEQUENCE2]], 0, [[REG_SEQUENCE2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX10-NEXT: [[V_MAX_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[REG_SEQUENCE1]], 0, [[REG_SEQUENCE1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX10-NEXT: [[V_MAX_F64_e64_2:%[0-9]+]]:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[V_MAX_F64_e64_]], 0, [[V_MAX_F64_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_MAX_F64_e64_2]].sub0
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_MAX_F64_e64_2]].sub1
+ ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY11]], [[V_CNDMASK_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.8, align 8, addrspace 5)
+ ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY12]], [[V_CNDMASK_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into %ir.8 + 4, basealign 8, addrspace 5)
+ ; GFX10-NEXT: S_BRANCH %bb.5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4.atomicrmw.global:
+ ; GFX10-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[FLAT_ATOMIC_MAX_F64_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_MAX_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64) on %ir.ptr, !noalias.addrspace !1)
+ ; GFX10-NEXT: S_BRANCH %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.5.atomicrmw.phi:
+ ; GFX10-NEXT: successors: %bb.6(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:vreg_64 = PHI [[PHI]], %bb.2, [[REG_SEQUENCE2]], %bb.3
+ ; GFX10-NEXT: SI_END_CF [[SI_ELSE]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.6.atomicrmw.end:
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY13]]
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY14]]
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %old = atomicrmw fmax ptr %ptr, double %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret double %old
+}
+
+define void @atomicrmw_fmax_local_f32_vv_noret(ptr addrspace(3) %ptr, float %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmax_local_f32_vv_noret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: DS_MAX_F32_gfx9 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32) on %ir.ptr, addrspace 3)
+ ; GFX10-NEXT: SI_RETURN
+ %old = atomicrmw fmax ptr addrspace(3) %ptr, float %val seq_cst
+ ret void
+}
+
+define float @atomicrmw_fmax_local_f32_vv_ret(ptr addrspace(3) %ptr, float %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmax_local_f32_vv_ret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32) on %ir.ptr, addrspace 3)
+ ; GFX10-NEXT: $vgpr0 = COPY [[DS_MAX_RTN_F32_gfx9_]]
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr0
+ %old = atomicrmw fmax ptr addrspace(3) %ptr, float %val seq_cst
+ ret float %old
+}
+
+define void @atomicrmw_fmax_local_f64_vv_noret(ptr addrspace(3) %ptr, double %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmax_local_f64_vv_noret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX10-NEXT: DS_MAX_F64_gfx9 [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store seq_cst (s64) on %ir.ptr, addrspace 3)
+ ; GFX10-NEXT: SI_RETURN
+ %old = atomicrmw fmax ptr addrspace(3) %ptr, double %val seq_cst
+ ret void
+}
+
+define double @atomicrmw_fmax_local_f64_vv_ret(ptr addrspace(3) %ptr, double %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmax_local_f64_vv_ret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX10-NEXT: [[DS_MAX_RTN_F64_gfx9_:%[0-9]+]]:vreg_64 = DS_MAX_RTN_F64_gfx9 [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store seq_cst (s64) on %ir.ptr, addrspace 3)
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DS_MAX_RTN_F64_gfx9_]].sub0
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DS_MAX_RTN_F64_gfx9_]].sub1
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY3]]
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY4]]
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %old = atomicrmw fmax ptr addrspace(3) %ptr, double %val seq_cst
+ ret double %old
+}
+
+; --- atomicrmw fmin: global f32/f64, flat f32/f64, local f32/f64 ---
+
+define void @atomicrmw_fmin_global_f32_vv_noret(ptr addrspace(1) %ptr, float %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmin_global_f32_vv_noret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: GLOBAL_ATOMIC_FMIN [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec :: (load store seq_cst (s32) on %ir.ptr, addrspace 1)
+ ; GFX10-NEXT: SI_RETURN
+ %old = atomicrmw fmin ptr addrspace(1) %ptr, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define float @atomicrmw_fmin_global_f32_vv_ret(ptr addrspace(1) %ptr, float %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmin_global_f32_vv_ret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[GLOBAL_ATOMIC_FMIN_RTN:%[0-9]+]]:vgpr_32 = GLOBAL_ATOMIC_FMIN_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec :: (load store seq_cst (s32) on %ir.ptr, addrspace 1)
+ ; GFX10-NEXT: $vgpr0 = COPY [[GLOBAL_ATOMIC_FMIN_RTN]]
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr0
+ %old = atomicrmw fmin ptr addrspace(1) %ptr, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret float %old
+}
+
+define void @atomicrmw_fmin_global_f64_vv_noret(ptr addrspace(1) %ptr, double %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmin_global_f64_vv_noret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX10-NEXT: GLOBAL_ATOMIC_MIN_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec :: (load store seq_cst (s64) on %ir.ptr, addrspace 1)
+ ; GFX10-NEXT: SI_RETURN
+ %old = atomicrmw fmin ptr addrspace(1) %ptr, double %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define double @atomicrmw_fmin_global_f64_vv_ret(ptr addrspace(1) %ptr, double %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmin_global_f64_vv_ret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX10-NEXT: [[GLOBAL_ATOMIC_MIN_F64_RTN:%[0-9]+]]:vreg_64 = GLOBAL_ATOMIC_MIN_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec :: (load store seq_cst (s64) on %ir.ptr, addrspace 1)
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_MIN_F64_RTN]].sub0
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[GLOBAL_ATOMIC_MIN_F64_RTN]].sub1
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY4]]
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY5]]
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %old = atomicrmw fmin ptr addrspace(1) %ptr, double %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret double %old
+}
+
+define void @atomicrmw_fmin_flat_f32_vv_noret(ptr %ptr, float %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmin_flat_f32_vv_noret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: FLAT_ATOMIC_FMIN [[REG_SEQUENCE]], [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32) on %ir.ptr)
+ ; GFX10-NEXT: SI_RETURN
+ %old = atomicrmw fmin ptr %ptr, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define float @atomicrmw_fmin_flat_f32_vv_ret(ptr %ptr, float %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmin_flat_f32_vv_ret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[FLAT_ATOMIC_FMIN_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_FMIN_RTN [[REG_SEQUENCE]], [[COPY2]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32) on %ir.ptr)
+ ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_FMIN_RTN]]
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr0
+ %old = atomicrmw fmin ptr %ptr, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret float %old
+}
+
+define void @atomicrmw_fmin_flat_f64_vv_noret(ptr %ptr, double %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmin_flat_f64_vv_noret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $src_private_base
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX10-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY1]], [[COPY7]], implicit $exec
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[V_CMP_NE_U32_e64_]]
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[COPY8]], %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: S_BRANCH %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2.Flow:
+ ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec = SI_ELSE [[SI_IF]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: S_BRANCH %bb.3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3.atomicrmw.private:
+ ; GFX10-NEXT: successors: %bb.5(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
+ ; GFX10-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], [[COPY9]], implicit $exec
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX10-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY10]], 0, [[COPY]], [[V_CMP_NE_U64_e64_]], implicit $exec
+ ; GFX10-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_CNDMASK_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s32) from %ir.7, align 8, addrspace 5)
+ ; GFX10-NEXT: [[BUFFER_LOAD_DWORD_OFFEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_CNDMASK_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (load (s32) from %ir.7 + 4, basealign 8, addrspace 5)
+ ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BUFFER_LOAD_DWORD_OFFEN]], %subreg.sub0, [[BUFFER_LOAD_DWORD_OFFEN1]], %subreg.sub1
+ ; GFX10-NEXT: [[V_MAX_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[REG_SEQUENCE2]], 0, [[REG_SEQUENCE2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX10-NEXT: [[V_MAX_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[REG_SEQUENCE1]], 0, [[REG_SEQUENCE1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX10-NEXT: [[V_MIN_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[V_MAX_F64_e64_]], 0, [[V_MAX_F64_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_MIN_F64_e64_]].sub0
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_MIN_F64_e64_]].sub1
+ ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY11]], [[V_CNDMASK_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.7, align 8, addrspace 5)
+ ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY12]], [[V_CNDMASK_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into %ir.7 + 4, basealign 8, addrspace 5)
+ ; GFX10-NEXT: S_BRANCH %bb.5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4.atomicrmw.global:
+ ; GFX10-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: FLAT_ATOMIC_MIN_F64 [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 0, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64) on %ir.ptr, !noalias.addrspace !1)
+ ; GFX10-NEXT: S_BRANCH %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.5.atomicrmw.phi:
+ ; GFX10-NEXT: SI_END_CF [[SI_ELSE]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: SI_RETURN
+ %old = atomicrmw fmin ptr %ptr, double %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define double @atomicrmw_fmin_flat_f64_vv_ret(ptr %ptr, double %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmin_flat_f64_vv_ret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000)
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX10-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY3]], %subreg.sub1
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_64 = COPY $src_private_base
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub0
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]].sub1
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]]
+ ; GFX10-NEXT: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 [[COPY1]], [[COPY7]], implicit $exec
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[V_CMP_NE_U32_e64_]]
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[COPY8]], %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: S_BRANCH %bb.4
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.2.Flow:
+ ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:vreg_64 = PHI %19, %bb.4, [[DEF]], %bb.1
+ ; GFX10-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec = SI_ELSE [[SI_IF]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: S_BRANCH %bb.3
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.3.atomicrmw.private:
+ ; GFX10-NEXT: successors: %bb.5(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 -1
+ ; GFX10-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:vreg_64 = COPY [[S_MOV_B]]
+ ; GFX10-NEXT: [[V_CMP_NE_U64_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_NE_U64_e64 [[REG_SEQUENCE]], [[COPY9]], implicit $exec
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+ ; GFX10-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, [[COPY10]], 0, [[COPY]], [[V_CMP_NE_U64_e64_]], implicit $exec
+ ; GFX10-NEXT: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_CNDMASK_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (load (s32) from %ir.8, align 8, addrspace 5)
+ ; GFX10-NEXT: [[BUFFER_LOAD_DWORD_OFFEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_CNDMASK_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (load (s32) from %ir.8 + 4, basealign 8, addrspace 5)
+ ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[BUFFER_LOAD_DWORD_OFFEN]], %subreg.sub0, [[BUFFER_LOAD_DWORD_OFFEN1]], %subreg.sub1
+ ; GFX10-NEXT: [[V_MAX_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[REG_SEQUENCE2]], 0, [[REG_SEQUENCE2]], 0, 0, implicit $mode, implicit $exec
+ ; GFX10-NEXT: [[V_MAX_F64_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_MAX_F64_e64 0, [[REG_SEQUENCE1]], 0, [[REG_SEQUENCE1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX10-NEXT: [[V_MIN_F64_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_MIN_F64_e64 0, [[V_MAX_F64_e64_]], 0, [[V_MAX_F64_e64_1]], 0, 0, implicit $mode, implicit $exec
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[V_MIN_F64_e64_]].sub0
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[V_MIN_F64_e64_]].sub1
+ ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY11]], [[V_CNDMASK_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (store (s32) into %ir.8, align 8, addrspace 5)
+ ; GFX10-NEXT: BUFFER_STORE_DWORD_OFFEN [[COPY12]], [[V_CNDMASK_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4, 0, 0, implicit $exec :: (store (s32) into %ir.8 + 4, basealign 8, addrspace 5)
+ ; GFX10-NEXT: S_BRANCH %bb.5
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.4.atomicrmw.global:
+ ; GFX10-NEXT: successors: %bb.2(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[FLAT_ATOMIC_MIN_F64_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_MIN_F64_RTN [[REG_SEQUENCE]], [[REG_SEQUENCE1]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64) on %ir.ptr, !noalias.addrspace !1)
+ ; GFX10-NEXT: S_BRANCH %bb.2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.5.atomicrmw.phi:
+ ; GFX10-NEXT: successors: %bb.6(0x80000000)
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:vreg_64 = PHI [[PHI]], %bb.2, [[REG_SEQUENCE2]], %bb.3
+ ; GFX10-NEXT: SI_END_CF [[SI_ELSE]], implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: bb.6.atomicrmw.end:
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub0
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[PHI1]].sub1
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY13]]
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY14]]
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %old = atomicrmw fmin ptr %ptr, double %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret double %old
+}
+
+define void @atomicrmw_fmin_local_f32_vv_noret(ptr addrspace(3) %ptr, float %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmin_local_f32_vv_noret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: DS_MIN_F32_gfx9 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32) on %ir.ptr, addrspace 3)
+ ; GFX10-NEXT: SI_RETURN
+ %old = atomicrmw fmin ptr addrspace(3) %ptr, float %val seq_cst
+ ret void
+}
+
+define float @atomicrmw_fmin_local_f32_vv_ret(ptr addrspace(3) %ptr, float %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmin_local_f32_vv_ret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[DS_MIN_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MIN_RTN_F32_gfx9 [[COPY]], [[COPY1]], 0, 0, implicit $exec :: (load store seq_cst (s32) on %ir.ptr, addrspace 3)
+ ; GFX10-NEXT: $vgpr0 = COPY [[DS_MIN_RTN_F32_gfx9_]]
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr0
+ %old = atomicrmw fmin ptr addrspace(3) %ptr, float %val seq_cst
+ ret float %old
+}
+
+define void @atomicrmw_fmin_local_f64_vv_noret(ptr addrspace(3) %ptr, double %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmin_local_f64_vv_noret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX10-NEXT: DS_MIN_F64_gfx9 [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store seq_cst (s64) on %ir.ptr, addrspace 3)
+ ; GFX10-NEXT: SI_RETURN
+ %old = atomicrmw fmin ptr addrspace(3) %ptr, double %val seq_cst
+ ret void
+}
+
+define double @atomicrmw_fmin_local_f64_vv_ret(ptr addrspace(3) %ptr, double %val) {
+ ; GFX10-LABEL: name: atomicrmw_fmin_local_f64_vv_ret
+ ; GFX10: bb.1 (%ir-block.0):
+ ; GFX10-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GFX10-NEXT: {{ $}}
+ ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX10-NEXT: [[DS_MIN_RTN_F64_gfx9_:%[0-9]+]]:vreg_64 = DS_MIN_RTN_F64_gfx9 [[COPY]], [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load store seq_cst (s64) on %ir.ptr, addrspace 3)
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DS_MIN_RTN_F64_gfx9_]].sub0
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DS_MIN_RTN_F64_gfx9_]].sub1
+ ; GFX10-NEXT: $vgpr0 = COPY [[COPY3]]
+ ; GFX10-NEXT: $vgpr1 = COPY [[COPY4]]
+ ; GFX10-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1
+ %old = atomicrmw fmin ptr addrspace(3) %ptr, double %val seq_cst
+ ret double %old
+}
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
index 9adb56cb0861e..1973dd8cb58ff 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
; TODO: Delete this and add run lines to use *-atomicrmw-fmax.ll tests
@@ -1509,13 +1509,15 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v2, s16
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_max_f32_e32 v3, v1, v1
+; GFX942-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_readfirstlane_b32 s6, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mov_b32_e32 v0, s6
; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v5, v0
; GFX942-NEXT: v_max_f32_e32 v0, v5, v5
; GFX942-NEXT: v_max_f32_e32 v4, v0, v3
@@ -1559,13 +1561,14 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, s20
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1
+; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
; GFX90A-NEXT: v_max_f32_e32 v4, v0, v3
@@ -1585,13 +1588,14 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v2, s20
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
+; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v3, v1, v1
+; GFX908-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_readfirstlane_b32 s6, v1
+; GFX908-NEXT: v_mov_b32_e32 v0, s6
; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, v0
; GFX908-NEXT: v_max_f32_e32 v0, v5, v5
; GFX908-NEXT: v_max_f32_e32 v4, v0, v3
@@ -1612,13 +1616,14 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s20
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
+; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s6, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v0
; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5
; GFX8-NEXT: v_max_f32_e32 v4, v0, v3
@@ -1669,9 +1674,12 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_readfirstlane_b32 s6, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mov_b32_e32 v1, s6
; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_max_f32_e32 v0, v1, v1
; GFX942-NEXT: v_max_f32_e32 v0, v0, v3
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
@@ -1718,9 +1726,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
@@ -1743,9 +1753,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_readfirstlane_b32 s6, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, s6
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
; GFX908-NEXT: v_max_f32_e32 v0, v0, v3
; GFX908-NEXT: v_mov_b32_e32 v5, v1
@@ -1769,9 +1781,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s6, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, s6
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1
; GFX8-NEXT: v_max_f32_e32 v0, v0, v3
; GFX8-NEXT: v_mov_b32_e32 v5, v1
@@ -1810,17 +1824,22 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v8, s16
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1]
+; GFX12-NEXT: buffer_load_b64 v[2:3], v8, s[0:3], null offen
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s4, v2
+; GFX12-NEXT: v_readfirstlane_b32 s5, v3
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: buffer_load_b64 v[4:5], v8, s[0:3], null offen
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[0:1], v[6:7]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
; GFX12-NEXT: v_mov_b32_e32 v3, v5
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -1852,17 +1871,21 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v8, s16
; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
+; GFX11-NEXT: buffer_load_b64 v[2:3], v8, s[0:3], 0 offen
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-NEXT: v_readfirstlane_b32 s5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b64 v[4:5], v8, s[0:3], 0 offen
; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
; GFX11-NEXT: v_mov_b32_e32 v3, v5
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen glc
@@ -1903,12 +1926,16 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v8, s20
-; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v8, s[16:19], 0 offen
+; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v8, s[16:19], 0 offen
; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_readfirstlane_b32 s6, v2
+; GFX908-NEXT: v_readfirstlane_b32 s7, v3
+; GFX908-NEXT: v_mov_b32_e32 v4, s6
+; GFX908-NEXT: v_mov_b32_e32 v5, s7
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX908-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7]
; GFX908-NEXT: v_mov_b32_e32 v0, v2
@@ -1932,12 +1959,16 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v8, s20
-; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v8, s[16:19], 0 offen
+; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v8, s[16:19], 0 offen
; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s6, v2
+; GFX8-NEXT: v_readfirstlane_b32 s7, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v0, v2
@@ -1979,16 +2010,22 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, s16
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
-; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s4, v2
+; GFX12-NEXT: v_readfirstlane_b32 s5, v3
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2019,16 +2056,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, s16
; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-NEXT: v_readfirstlane_b32 s5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2071,9 +2113,13 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_readfirstlane_b32 s6, v2
+; GFX908-NEXT: v_readfirstlane_b32 s7, v3
+; GFX908-NEXT: v_mov_b32_e32 v2, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s7
; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX908-NEXT: v_mov_b32_e32 v10, v3
@@ -2100,9 +2146,13 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s6, v2
+; GFX8-NEXT: v_readfirstlane_b32 s7, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
; GFX8-NEXT: v_mov_b32_e32 v10, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
index 876eacb763695..ea5a9b0d22a15 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
; TODO: Delete this and add run lines to use *-atomicrmw-fmin.ll tests
@@ -1509,13 +1509,15 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v2, s16
-; GFX942-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen
+; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
; GFX942-NEXT: s_mov_b64 s[4:5], 0
-; GFX942-NEXT: v_max_f32_e32 v3, v1, v1
+; GFX942-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_readfirstlane_b32 s6, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mov_b32_e32 v0, s6
; GFX942-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v5, v0
; GFX942-NEXT: v_max_f32_e32 v0, v5, v5
; GFX942-NEXT: v_min_f32_e32 v4, v0, v3
@@ -1559,13 +1561,14 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, s20
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_max_f32_e32 v3, v1, v1
+; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v0, s6
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
; GFX90A-NEXT: v_min_f32_e32 v4, v0, v3
@@ -1585,13 +1588,14 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v2, s20
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
+; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_max_f32_e32 v3, v1, v1
+; GFX908-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_readfirstlane_b32 s6, v1
+; GFX908-NEXT: v_mov_b32_e32 v0, s6
; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, v0
; GFX908-NEXT: v_max_f32_e32 v0, v5, v5
; GFX908-NEXT: v_min_f32_e32 v4, v0, v3
@@ -1612,13 +1616,14 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s20
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen
+; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s6, v1
+; GFX8-NEXT: v_mov_b32_e32 v0, s6
; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v0
; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5
; GFX8-NEXT: v_min_f32_e32 v4, v0, v3
@@ -1669,9 +1674,12 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
; GFX942-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
; GFX942-NEXT: s_mov_b64 s[4:5], 0
; GFX942-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_readfirstlane_b32 s6, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mov_b32_e32 v1, s6
; GFX942-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: v_max_f32_e32 v0, v1, v1
; GFX942-NEXT: v_min_f32_e32 v0, v0, v3
; GFX942-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
@@ -1718,9 +1726,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
; GFX90A-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_readfirstlane_b32 s6, v1
+; GFX90A-NEXT: v_mov_b32_e32 v1, s6
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
; GFX90A-NEXT: v_min_f32_e32 v0, v0, v3
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
@@ -1743,9 +1753,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
; GFX908-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v3, v0, v0
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_readfirstlane_b32 s6, v1
+; GFX908-NEXT: v_mov_b32_e32 v1, s6
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
; GFX908-NEXT: v_min_f32_e32 v0, v0, v3
; GFX908-NEXT: v_mov_b32_e32 v5, v1
@@ -1769,9 +1781,11 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
; GFX8-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_mul_f32_e32 v3, 1.0, v0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s6, v1
+; GFX8-NEXT: v_mov_b32_e32 v1, s6
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1
; GFX8-NEXT: v_min_f32_e32 v0, v0, v3
; GFX8-NEXT: v_mov_b32_e32 v5, v1
@@ -1810,17 +1824,22 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v8, s16
; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[0:1], v[0:1]
+; GFX12-NEXT: buffer_load_b64 v[2:3], v8, s[0:3], null offen
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s4, v2
+; GFX12-NEXT: v_readfirstlane_b32 s5, v3
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: buffer_load_b64 v[4:5], v8, s[0:3], null offen
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[4:5]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_min_num_f64_e32 v[2:3], v[0:1], v[6:7]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
; GFX12-NEXT: v_mov_b32_e32 v3, v5
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -1852,17 +1871,21 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v8, s16
; GFX11-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
+; GFX11-NEXT: buffer_load_b64 v[2:3], v8, s[0:3], 0 offen
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-NEXT: v_readfirstlane_b32 s5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b64 v[4:5], v8, s[0:3], 0 offen
; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mov_b32_e32 v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
; GFX11-NEXT: v_mov_b32_e32 v3, v5
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v8, s[0:3], 0 offen glc
@@ -1903,12 +1926,16 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v8, s20
-; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v8, s[16:19], 0 offen
+; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v8, s[16:19], 0 offen
; GFX908-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_readfirstlane_b32 s6, v2
+; GFX908-NEXT: v_readfirstlane_b32 s7, v3
+; GFX908-NEXT: v_mov_b32_e32 v4, s6
+; GFX908-NEXT: v_mov_b32_e32 v5, s7
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX908-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7]
; GFX908-NEXT: v_mov_b32_e32 v0, v2
@@ -1932,12 +1959,16 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v8, s20
-; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v8, s[16:19], 0 offen
+; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v8, s[16:19], 0 offen
; GFX8-NEXT: v_max_f64 v[6:7], v[0:1], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s6, v2
+; GFX8-NEXT: v_readfirstlane_b32 s7, v3
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], v[4:5], v[4:5]
; GFX8-NEXT: v_min_f64 v[2:3], v[0:1], v[6:7]
; GFX8-NEXT: v_mov_b32_e32 v0, v2
@@ -1979,16 +2010,22 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, s16
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
-; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_readfirstlane_b32 s4, v2
+; GFX12-NEXT: v_readfirstlane_b32 s5, v3
+; GFX12-NEXT: s_wait_alu depctr_va_sdst(0)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX12-NEXT: v_min_num_f64_e32 v[0:1], v[0:1], v[4:5]
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2019,16 +2056,21 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, s16
; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
-; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], 0 offen
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_readfirstlane_b32 s4, v2
+; GFX11-NEXT: v_readfirstlane_b32 s5, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
@@ -2071,9 +2113,13 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_readfirstlane_b32 s6, v2
+; GFX908-NEXT: v_readfirstlane_b32 s7, v3
+; GFX908-NEXT: v_mov_b32_e32 v2, s6
+; GFX908-NEXT: v_mov_b32_e32 v3, s7
; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX908-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX908-NEXT: v_mov_b32_e32 v10, v3
@@ -2100,9 +2146,13 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v6, s[16:19], 0 offen
; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_readfirstlane_b32 s6, v2
+; GFX8-NEXT: v_readfirstlane_b32 s7, v3
+; GFX8-NEXT: v_mov_b32_e32 v2, s6
+; GFX8-NEXT: v_mov_b32_e32 v3, s7
; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
; GFX8-NEXT: v_mov_b32_e32 v10, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.softwqm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.softwqm.ll
index 1915338f0ce20..2efe109c30b6e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.softwqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.softwqm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii -stop-after=instruction-select < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=hawaii -stop-after=instruction-select < %s | FileCheck -check-prefix=GCN %s
define amdgpu_ps float @softwqm_f32(float %val) {
; GCN-LABEL: name: softwqm_f32
@@ -77,6 +77,74 @@ define amdgpu_ps <3 x float> @softwqm_v3f32(<3 x float> %val) {
ret <3 x float> %ret
}
+define amdgpu_ps float @s_softwqm_f32(float inreg %val) {
+ ; GCN-LABEL: name: s_softwqm_f32
+ ; GCN: bb.1 (%ir-block.0):
+ ; GCN-NEXT: liveins: $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[SOFT_WQM:%[0-9]+]]:sreg_32 = SOFT_WQM [[COPY]], implicit $exec
+ ; GCN-NEXT: $vgpr0 = COPY [[SOFT_WQM]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %ret = call float @llvm.amdgcn.softwqm.f32(float %val)
+ ret float %ret
+}
+
+define amdgpu_ps float @s_softwqm_v2f16(float inreg %arg) {
+ ; GCN-LABEL: name: s_softwqm_v2f16
+ ; GCN: bb.1 (%ir-block.0):
+ ; GCN-NEXT: liveins: $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[SOFT_WQM:%[0-9]+]]:sreg_32 = SOFT_WQM [[COPY]], implicit $exec
+ ; GCN-NEXT: $vgpr0 = COPY [[SOFT_WQM]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %val = bitcast float %arg to <2 x half>
+ %ret = call <2 x half> @llvm.amdgcn.softwqm.v2f16(<2 x half> %val)
+ %bc = bitcast <2 x half> %ret to float
+ ret float %bc
+}
+
+define amdgpu_ps <2 x float> @s_softwqm_f64(double inreg %val) {
+ ; GCN-LABEL: name: s_softwqm_f64
+ ; GCN: bb.1 (%ir-block.0):
+ ; GCN-NEXT: liveins: $sgpr0, $sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GCN-NEXT: [[SOFT_WQM:%[0-9]+]]:sreg_64 = SOFT_WQM [[REG_SEQUENCE]], implicit $exec
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[SOFT_WQM]].sub0
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[SOFT_WQM]].sub1
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY2]]
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY3]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ %ret = call double @llvm.amdgcn.softwqm.f64(double %val)
+ %bitcast = bitcast double %ret to <2 x float>
+ ret <2 x float> %bitcast
+}
+
+define amdgpu_ps <3 x float> @s_softwqm_v3f32(<3 x float> inreg %val) {
+ ; GCN-LABEL: name: s_softwqm_v3f32
+ ; GCN: bb.1 (%ir-block.0):
+ ; GCN-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
+ ; GCN-NEXT: [[SOFT_WQM:%[0-9]+]]:sgpr_96 = SOFT_WQM [[REG_SEQUENCE]], implicit $exec
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[SOFT_WQM]].sub0
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[SOFT_WQM]].sub1
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[SOFT_WQM]].sub2
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY3]]
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY4]]
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY5]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ %ret = call <3 x float> @llvm.amdgcn.softwqm.v3f32(<3 x float> %val)
+ ret <3 x float> %ret
+}
+
declare i1 @llvm.amdgcn.softwqm.i1(i1) #0
declare float @llvm.amdgcn.softwqm.f32(float) #0
declare <2 x half> @llvm.amdgcn.softwqm.v2f16(<2 x half>) #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.strictwqm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.strictwqm.ll
new file mode 100644
index 0000000000000..3c0dbb5945da9
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.strictwqm.ll
@@ -0,0 +1,145 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=hawaii -stop-after=instruction-select < %s | FileCheck -check-prefix=GCN %s
+
+define amdgpu_ps float @strictwqm_f32(float %val) {
+ ; GCN-LABEL: name: strictwqm_f32
+ ; GCN: bb.1 (%ir-block.0):
+ ; GCN-NEXT: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: early-clobber %2:vgpr_32 = STRICT_WQM [[COPY]], implicit $exec
+ ; GCN-NEXT: $vgpr0 = COPY %2
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %ret = call float @llvm.amdgcn.strict.wqm.f32(float %val)
+ ret float %ret
+}
+
+define amdgpu_ps float @s_strictwqm_f32(float inreg %val) {
+ ; GCN-LABEL: name: s_strictwqm_f32
+ ; GCN: bb.1 (%ir-block.0):
+ ; GCN-NEXT: liveins: $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: early-clobber %2:sreg_32 = STRICT_WQM [[COPY]], implicit $exec
+ ; GCN-NEXT: $vgpr0 = COPY %2
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %ret = call float @llvm.amdgcn.strict.wqm.f32(float %val)
+ ret float %ret
+}
+
+define amdgpu_ps float @strictwqm_v2f16(float %arg) {
+ ; GCN-LABEL: name: strictwqm_v2f16
+ ; GCN: bb.1 (%ir-block.0):
+ ; GCN-NEXT: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: early-clobber %3:vgpr_32 = STRICT_WQM [[COPY]], implicit $exec
+ ; GCN-NEXT: $vgpr0 = COPY %3
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %val = bitcast float %arg to <2 x half>
+ %ret = call <2 x half> @llvm.amdgcn.strict.wqm.v2f16(<2 x half> %val)
+ %bc = bitcast <2 x half> %ret to float
+ ret float %bc
+}
+
+define amdgpu_ps float @s_strictwqm_v2f16(float inreg %arg) {
+ ; GCN-LABEL: name: s_strictwqm_v2f16
+ ; GCN: bb.1 (%ir-block.0):
+ ; GCN-NEXT: liveins: $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: early-clobber %3:sreg_32 = STRICT_WQM [[COPY]], implicit $exec
+ ; GCN-NEXT: $vgpr0 = COPY %3
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %val = bitcast float %arg to <2 x half>
+ %ret = call <2 x half> @llvm.amdgcn.strict.wqm.v2f16(<2 x half> %val)
+ %bc = bitcast <2 x half> %ret to float
+ ret float %bc
+}
+
+define amdgpu_ps <2 x float> @strictwqm_f64(double %val) {
+ ; GCN-LABEL: name: strictwqm_f64
+ ; GCN: bb.1 (%ir-block.0):
+ ; GCN-NEXT: liveins: $vgpr0, $vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GCN-NEXT: early-clobber %4:vreg_64 = STRICT_WQM [[REG_SEQUENCE]], implicit $exec
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY %4.sub0
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %4.sub1
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY2]]
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY3]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ %ret = call double @llvm.amdgcn.strict.wqm.f64(double %val)
+ %bitcast = bitcast double %ret to <2 x float>
+ ret <2 x float> %bitcast
+}
+
+define amdgpu_ps <2 x float> @s_strictwqm_f64(double inreg %val) {
+ ; GCN-LABEL: name: s_strictwqm_f64
+ ; GCN: bb.1 (%ir-block.0):
+ ; GCN-NEXT: liveins: $sgpr0, $sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GCN-NEXT: early-clobber %4:sreg_64 = STRICT_WQM [[REG_SEQUENCE]], implicit $exec
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY %4.sub0
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %4.sub1
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY2]]
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY3]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ %ret = call double @llvm.amdgcn.strict.wqm.f64(double %val)
+ %bitcast = bitcast double %ret to <2 x float>
+ ret <2 x float> %bitcast
+}
+
+define amdgpu_ps <3 x float> @strictwqm_v3f32(<3 x float> %val) {
+ ; GCN-LABEL: name: strictwqm_v3f32
+ ; GCN: bb.1 (%ir-block.0):
+ ; GCN-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
+ ; GCN-NEXT: early-clobber %5:vreg_96 = STRICT_WQM [[REG_SEQUENCE]], implicit $exec
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY %5.sub0
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY %5.sub1
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY %5.sub2
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY3]]
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY4]]
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY5]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ %ret = call <3 x float> @llvm.amdgcn.strict.wqm.v3f32(<3 x float> %val)
+ ret <3 x float> %ret
+}
+
+define amdgpu_ps <3 x float> @s_strictwqm_v3f32(<3 x float> inreg %val) {
+ ; GCN-LABEL: name: s_strictwqm_v3f32
+ ; GCN: bb.1 (%ir-block.0):
+ ; GCN-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
+ ; GCN-NEXT: early-clobber %5:sgpr_96 = STRICT_WQM [[REG_SEQUENCE]], implicit $exec
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY %5.sub0
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY %5.sub1
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY %5.sub2
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY3]]
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY4]]
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY5]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ %ret = call <3 x float> @llvm.amdgcn.strict.wqm.v3f32(<3 x float> %val)
+ ret <3 x float> %ret
+}
+
+declare float @llvm.amdgcn.strict.wqm.f32(float) #0
+declare <2 x half> @llvm.amdgcn.strict.wqm.v2f16(<2 x half>) #0
+declare <3 x float> @llvm.amdgcn.strict.wqm.v3f32(<3 x float>) #0
+declare double @llvm.amdgcn.strict.wqm.f64(double) #0
+
+attributes #0 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.ll
index a25e1f2aa6b87..dd93aa072ae0d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=hawaii -stop-after=instruction-select < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=hawaii -stop-after=instruction-select < %s | FileCheck -check-prefix=GCN %s
define amdgpu_ps float @wqm_f32(float %val) {
; GCN-LABEL: name: wqm_f32
@@ -77,6 +77,75 @@ define amdgpu_ps <3 x float> @wqm_v3f32(<3 x float> %val) {
ret <3 x float> %ret
}
+define amdgpu_ps float @s_wqm_f32(float inreg %val) {
+ ; GCN-LABEL: name: s_wqm_f32
+ ; GCN: bb.1 (%ir-block.0):
+ ; GCN-NEXT: liveins: $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[WQM:%[0-9]+]]:sreg_32 = WQM [[COPY]], implicit $exec
+ ; GCN-NEXT: $vgpr0 = COPY [[WQM]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %ret = call float @llvm.amdgcn.wqm.f32(float %val)
+ ret float %ret
+}
+
+define amdgpu_ps float @s_wqm_v2f16(float inreg %arg) {
+ ; GCN-LABEL: name: s_wqm_v2f16
+ ; GCN: bb.1 (%ir-block.0):
+ ; GCN-NEXT: liveins: $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[WQM:%[0-9]+]]:sreg_32 = WQM [[COPY]], implicit $exec
+ ; GCN-NEXT: $vgpr0 = COPY [[WQM]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %val = bitcast float %arg to <2 x half>
+ %ret = call <2 x half> @llvm.amdgcn.wqm.v2f16(<2 x half> %val)
+ %bc = bitcast <2 x half> %ret to float
+ ret float %bc
+}
+
+define amdgpu_ps <2 x float> @s_wqm_f64(double inreg %val) {
+ ; GCN-LABEL: name: s_wqm_f64
+ ; GCN: bb.1 (%ir-block.0):
+ ; GCN-NEXT: liveins: $sgpr0, $sgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GCN-NEXT: [[WQM:%[0-9]+]]:sreg_64 = WQM [[REG_SEQUENCE]], implicit $exec
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[WQM]].sub0
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[WQM]].sub1
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY2]]
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY3]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
+ %ret = call double @llvm.amdgcn.wqm.f64(double %val)
+ %bitcast = bitcast double %ret to <2 x float>
+ ret <2 x float> %bitcast
+}
+
+define amdgpu_ps <3 x float> @s_wqm_v3f32(<3 x float> inreg %val) {
+ ; GCN-LABEL: name: s_wqm_v3f32
+ ; GCN: bb.1 (%ir-block.0):
+ ; GCN-NEXT: liveins: $sgpr0, $sgpr1, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
+ ; GCN-NEXT: [[WQM:%[0-9]+]]:sgpr_96 = WQM [[REG_SEQUENCE]], implicit $exec
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[WQM]].sub0
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[WQM]].sub1
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY [[WQM]].sub2
+ ; GCN-NEXT: $vgpr0 = COPY [[COPY3]]
+ ; GCN-NEXT: $vgpr1 = COPY [[COPY4]]
+ ; GCN-NEXT: $vgpr2 = COPY [[COPY5]]
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
+ %ret = call <3 x float> @llvm.amdgcn.wqm.v3f32(<3 x float> %val)
+ ret <3 x float> %ret
+}
+
+
declare i1 @llvm.amdgcn.wqm.i1(i1) #0
declare float @llvm.amdgcn.wqm.f32(float) #0
declare <2 x half> @llvm.amdgcn.wqm.v2f16(<2 x half>) #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.mir
index d5a0e03acb361..e75dc7bcfa36f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.wqm.mir
@@ -1,6 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-fast -verify-machineinstrs -o - %s | FileCheck %s
-# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' %s -o - | FileCheck %s
---
name: wqm_s
@@ -13,8 +12,7 @@ body: |
; CHECK: liveins: $sgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.wqm), [[COPY1]](s32)
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.wqm), [[COPY]](s32)
%0:_(s32) = COPY $sgpr0
%1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.wqm), %0
...
@@ -34,3 +32,67 @@ body: |
%0:_(s32) = COPY $vgpr0
%1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.wqm), %0
...
+
+---
+name: strict_wqm_s
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0
+ ; CHECK-LABEL: name: strict_wqm_s
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.strict.wqm), [[COPY]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.strict.wqm), %0
+...
+
+---
+name: strict_wqm_v
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; CHECK-LABEL: name: strict_wqm_v
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.strict.wqm), [[COPY]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.strict.wqm), %0
+...
+
+---
+name: softwqm_s
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $sgpr0
+ ; CHECK-LABEL: name: softwqm_s
+ ; CHECK: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:sgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.softwqm), [[COPY]](s32)
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.softwqm), %0
+...
+
+---
+name: softwqm_v
+legalized: true
+
+body: |
+ bb.0:
+ liveins: $vgpr0
+ ; CHECK-LABEL: name: softwqm_v
+ ; CHECK: liveins: $vgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
+ ; CHECK-NEXT: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.softwqm), [[COPY]](s32)
+ %0:_(s32) = COPY $vgpr0
+ %1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.softwqm), %0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-ffbh-u32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-ffbh-u32.mir
index 57f32580d44a3..632e19ac7b018 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-ffbh-u32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgpu-ffbh-u32.mir
@@ -64,11 +64,11 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1
; CHECK-NEXT: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY]](s64)
- ; CHECK-NEXT: [[AMDGPU_FFBH_U32_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBH_U32 [[UV]](s32)
- ; CHECK-NEXT: [[AMDGPU_FFBH_U32_1:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FFBH_U32_:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBH_U32 [[UV1]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FFBH_U32_1:%[0-9]+]]:vgpr(s32) = G_AMDGPU_FFBH_U32 [[UV]](s32)
; CHECK-NEXT: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 32
- ; CHECK-NEXT: [[UADDSAT:%[0-9]+]]:vgpr(s32) = G_UADDSAT [[AMDGPU_FFBH_U32_]], [[C]]
- ; CHECK-NEXT: [[UMIN:%[0-9]+]]:vgpr(s32) = G_UMIN [[AMDGPU_FFBH_U32_1]], [[UADDSAT]]
+ ; CHECK-NEXT: [[UADDSAT:%[0-9]+]]:vgpr(s32) = G_UADDSAT [[AMDGPU_FFBH_U32_1]], [[C]]
+ ; CHECK-NEXT: [[UMIN:%[0-9]+]]:vgpr(s32) = G_UMIN [[AMDGPU_FFBH_U32_]], [[UADDSAT]]
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s32) = G_AMDGPU_FFBH_U32 %0
...
diff --git a/llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll b/llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll
index d26d406df5220..49ec6b97d7581 100644
--- a/llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll
+++ b/llvm/test/CodeGen/AMDGPU/barrier-elimination-gfx12.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s
define amdgpu_kernel void @signal_unknown_wgs() {
; CHECK-LABEL: signal_unknown_wgs:
diff --git a/llvm/test/CodeGen/AMDGPU/barrier-elimination.ll b/llvm/test/CodeGen/AMDGPU/barrier-elimination.ll
index fed4d9b6a373f..87bc47defce81 100644
--- a/llvm/test/CodeGen/AMDGPU/barrier-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/barrier-elimination.ll
@@ -1,5 +1,5 @@
; RUN: llc -mtriple=amdgcn < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn < %s -global-isel | FileCheck %s
+; RUN: llc -mtriple=amdgcn < %s -global-isel -new-reg-bank-select | FileCheck %s
; CHECK-LABEL: {{^}}unknown_wgs:
; CHECK: s_barrier
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 26e1b66161a2a..0f459959310e1 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=EG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-GISEL %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-GISEL %s
declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
declare <2 x i7> @llvm.ctlz.v2i7(<2 x i7>, i1) nounwind readnone
@@ -655,10 +655,12 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v1
-; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
-; GFX9-GISEL-NEXT: v_cmp_ne_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-GISEL-NEXT: s_lshl_b32 s3, s2, 24
+; GFX9-GISEL-NEXT: s_flbit_i32_b32 s3, s3
+; GFX9-GISEL-NEXT: s_and_b32 s2, s2, 0xff
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s3, 32
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i8, ptr addrspace(1) %arrayidx, align 1
@@ -761,12 +763,17 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
-; GFX9-GISEL-NEXT: v_cmp_ne_u16_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s3, v2
+; GFX9-GISEL-NEXT: s_lshl_b32 s3, s3, 8
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: s_lshl_b32 s3, s2, 16
+; GFX9-GISEL-NEXT: s_flbit_i32_b32 s3, s3
+; GFX9-GISEL-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s3, 32
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i16, ptr addrspace(1) %arrayidx, align 1
@@ -876,18 +883,25 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) no
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:3
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s3, v2
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
-; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
-; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; GFX9-GISEL-NEXT: s_lshl_b32 s3, s3, 8
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 24
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: s_or_b32 s3, s5, s4
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: s_flbit_i32_b32 s2, s2
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s2, 32
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %arrayidx, align 1
@@ -1051,37 +1065,51 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) no
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3]
-; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1
-; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2
-; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
-; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
-; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5
-; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:7
-; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:6
+; GFX9-GISEL-NEXT: global_load_ubyte v0, v2, s[2:3]
+; GFX9-GISEL-NEXT: global_load_ubyte v1, v2, s[2:3] offset:1
+; GFX9-GISEL-NEXT: global_load_ubyte v3, v2, s[2:3] offset:2
+; GFX9-GISEL-NEXT: global_load_ubyte v4, v2, s[2:3] offset:3
+; GFX9-GISEL-NEXT: global_load_ubyte v5, v2, s[2:3] offset:4
+; GFX9-GISEL-NEXT: global_load_ubyte v6, v2, s[2:3] offset:5
+; GFX9-GISEL-NEXT: global_load_ubyte v7, v2, s[2:3] offset:6
+; GFX9-GISEL-NEXT: global_load_ubyte v8, v2, s[2:3] offset:7
+; GFX9-GISEL-NEXT: s_mov_b32 s3, 0
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6)
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s5, v3
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4
-; GFX9-GISEL-NEXT: v_or3_b32 v2, v2, v3, v0
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s7, v5
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v4, v6, 8, v5
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s8, v6
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v7
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s9, v7
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX9-GISEL-NEXT: v_or3_b32 v3, v5, v6, v4
-; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v4, v2
-; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v3
-; GFX9-GISEL-NEXT: v_add_u32_e32 v4, 32, v4
-; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v0, v4
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
-; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s10, v8
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 8
+; GFX9-GISEL-NEXT: s_lshl_b32 s6, s6, 24
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 16
+; GFX9-GISEL-NEXT: s_lshl_b32 s8, s8, 8
+; GFX9-GISEL-NEXT: s_lshl_b32 s10, s10, 24
+; GFX9-GISEL-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-GISEL-NEXT: s_or_b32 s2, s4, s2
+; GFX9-GISEL-NEXT: s_or_b32 s4, s6, s5
+; GFX9-GISEL-NEXT: s_or_b32 s5, s8, s7
+; GFX9-GISEL-NEXT: s_or_b32 s6, s10, s9
+; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s2
+; GFX9-GISEL-NEXT: s_or_b32 s5, s6, s5
+; GFX9-GISEL-NEXT: s_flbit_i32_b64 s2, s[4:5]
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], 64
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %arrayidx, align 1
%ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
@@ -1365,16 +1393,16 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out,
; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 3, v0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX9-GISEL-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
; GFX9-GISEL-NEXT: v_add_u32_e32 v0, 32, v0
-; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v1, v0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT: v_min_u32_e32 v1, v1, v0
+; GFX9-GISEL-NEXT: global_store_dwordx2 v3, v[1:2], s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
@@ -2433,13 +2461,15 @@ define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v2
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX9-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v1, v1
+; GFX9-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, s4, 16, v1
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
%ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %val, i1 true)
ret <3 x i16> %ctlz
diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index cbfe99a4e7faf..cbfda51a61c29 100644
--- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -2,7 +2,7 @@
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -mtriple=r600 -mcpu=cypress < %s | FileCheck -check-prefixes=EG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-GISEL %s
+; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-GISEL %s
declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
@@ -628,9 +628,11 @@ define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noa
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
-; GFX9-GISEL-NEXT: v_cmp_ne_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-GISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX9-GISEL-NEXT: s_and_b32 s2, s2, 0xff
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s3, 32
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i8, ptr addrspace(1) %arrayidx, align 1
@@ -727,11 +729,16 @@ define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) no
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
-; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
-; GFX9-GISEL-NEXT: v_cmp_ne_u16_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s3, v2
+; GFX9-GISEL-NEXT: s_lshl_b32 s3, s3, 8
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX9-GISEL-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s3, 32
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i16, ptr addrspace(1) %arrayidx, align 1
@@ -841,18 +848,25 @@ define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) no
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:3
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s3, v2
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
-; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
-; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; GFX9-GISEL-NEXT: s_lshl_b32 s3, s3, 8
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 24
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: s_or_b32 s3, s5, s4
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s2
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s2, 32
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %arrayidx, align 1
@@ -1016,37 +1030,51 @@ define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) no
; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select:
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: global_load_ubyte v0, v1, s[2:3]
-; GFX9-GISEL-NEXT: global_load_ubyte v2, v1, s[2:3] offset:1
-; GFX9-GISEL-NEXT: global_load_ubyte v3, v1, s[2:3] offset:2
-; GFX9-GISEL-NEXT: global_load_ubyte v4, v1, s[2:3] offset:3
-; GFX9-GISEL-NEXT: global_load_ubyte v5, v1, s[2:3] offset:4
-; GFX9-GISEL-NEXT: global_load_ubyte v6, v1, s[2:3] offset:5
-; GFX9-GISEL-NEXT: global_load_ubyte v7, v1, s[2:3] offset:7
-; GFX9-GISEL-NEXT: global_load_ubyte v8, v1, s[2:3] offset:6
+; GFX9-GISEL-NEXT: global_load_ubyte v0, v2, s[2:3]
+; GFX9-GISEL-NEXT: global_load_ubyte v1, v2, s[2:3] offset:1
+; GFX9-GISEL-NEXT: global_load_ubyte v3, v2, s[2:3] offset:2
+; GFX9-GISEL-NEXT: global_load_ubyte v4, v2, s[2:3] offset:3
+; GFX9-GISEL-NEXT: global_load_ubyte v5, v2, s[2:3] offset:4
+; GFX9-GISEL-NEXT: global_load_ubyte v6, v2, s[2:3] offset:5
+; GFX9-GISEL-NEXT: global_load_ubyte v7, v2, s[2:3] offset:6
+; GFX9-GISEL-NEXT: global_load_ubyte v8, v2, s[2:3] offset:7
+; GFX9-GISEL-NEXT: s_mov_b32 s3, 0
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(7)
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(6)
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 8, v0
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(5)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s5, v3
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(4)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v4
-; GFX9-GISEL-NEXT: v_or3_b32 v2, v2, v3, v0
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s6, v4
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s7, v5
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v4, v6, 8, v5
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s8, v6
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v5, 24, v7
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s9, v7
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v6, 16, v8
-; GFX9-GISEL-NEXT: v_or3_b32 v3, v5, v6, v4
-; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v4, v3
-; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v0, v2
-; GFX9-GISEL-NEXT: v_add_u32_e32 v4, 32, v4
-; GFX9-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; GFX9-GISEL-NEXT: v_min_u32_e32 v0, v0, v4
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v0, 64, v0, vcc
-; GFX9-GISEL-NEXT: global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s10, v8
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 8
+; GFX9-GISEL-NEXT: s_lshl_b32 s6, s6, 24
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 16
+; GFX9-GISEL-NEXT: s_lshl_b32 s8, s8, 8
+; GFX9-GISEL-NEXT: s_lshl_b32 s10, s10, 24
+; GFX9-GISEL-NEXT: s_lshl_b32 s9, s9, 16
+; GFX9-GISEL-NEXT: s_or_b32 s2, s4, s2
+; GFX9-GISEL-NEXT: s_or_b32 s4, s6, s5
+; GFX9-GISEL-NEXT: s_or_b32 s5, s8, s7
+; GFX9-GISEL-NEXT: s_or_b32 s6, s10, s9
+; GFX9-GISEL-NEXT: s_or_b32 s4, s4, s2
+; GFX9-GISEL-NEXT: s_or_b32 s5, s6, s5
+; GFX9-GISEL-NEXT: s_ff1_i32_b64 s2, s[4:5]
+; GFX9-GISEL-NEXT: s_cmp_lg_u64 s[4:5], 0
+; GFX9-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], 64
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i64, ptr addrspace(1) %arrayidx, align 1
%cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
@@ -1155,19 +1183,27 @@ define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out,
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:3
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s3, v2
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
-; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
-; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, v2, -1, vcc
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; GFX9-GISEL-NEXT: s_lshl_b32 s3, s3, 8
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 24
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: s_or_b32 s3, s5, s4
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX9-GISEL-NEXT: s_min_u32 s3, s3, 32
+; GFX9-GISEL-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, -1, s3
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %arrayidx, align 1
@@ -1277,19 +1313,27 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out,
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:3
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s3, v2
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
-; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v1
-; GFX9-GISEL-NEXT: v_min_u32_e32 v2, 32, v2
-; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v2, vcc
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; GFX9-GISEL-NEXT: s_lshl_b32 s3, s3, 8
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 24
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: s_or_b32 s3, s5, s4
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: s_ff1_i32_b32 s3, s2
+; GFX9-GISEL-NEXT: s_min_u32 s3, s3, 32
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s3, -1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %arrayidx, align 1
@@ -1407,19 +1451,27 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
-; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:3
-; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT: global_load_ubyte v3, v0, s[2:3] offset:2
+; GFX9-GISEL-NEXT: global_load_ubyte v4, v0, s[2:3] offset:3
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(3)
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(2)
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s3, v2
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 24, v3
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s4, v3
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX9-GISEL-NEXT: v_or3_b32 v1, v2, v3, v1
-; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v1, v1
-; GFX9-GISEL-NEXT: v_min_u32_e32 v1, 32, v1
-; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 32, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v1, vcc
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; GFX9-GISEL-NEXT: s_lshl_b32 s3, s3, 8
+; GFX9-GISEL-NEXT: s_lshl_b32 s5, s5, 24
+; GFX9-GISEL-NEXT: s_lshl_b32 s4, s4, 16
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: s_or_b32 s3, s5, s4
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: s_ff1_i32_b32 s2, s2
+; GFX9-GISEL-NEXT: s_min_u32 s2, s2, 32
+; GFX9-GISEL-NEXT: s_cmp_lg_u32 s2, 32
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, s2, -1
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-GISEL-NEXT: global_store_dword v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i32, ptr addrspace(1) %arrayidx, align 1
@@ -1494,14 +1546,16 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_or_b32_e32 v3, 0x100, v1
-; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v3, v3
-; GFX9-GISEL-NEXT: v_cmp_eq_u32_sdwa vcc, v1, v0 src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v1
+; GFX9-GISEL-NEXT: s_or_b32 s3, s2, 0x100
+; GFX9-GISEL-NEXT: s_and_b32 s2, s2, 0xff
+; GFX9-GISEL-NEXT: s_ff1_i32_b32 s3, s3
+; GFX9-GISEL-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, 0xffff, s3
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-GISEL-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i8, ptr addrspace(1) %arrayidx, align 1
@@ -1588,16 +1642,21 @@ define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %
; GFX9-GISEL: ; %bb.0:
; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xffff
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
+; GFX9-GISEL-NEXT: s_waitcnt vmcnt(1)
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
-; GFX9-GISEL-NEXT: v_or_b32_e32 v2, 0x10000, v1
-; GFX9-GISEL-NEXT: v_ffbl_b32_e32 v2, v2
-; GFX9-GISEL-NEXT: v_cmp_eq_u16_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-GISEL-NEXT: v_readfirstlane_b32 s3, v2
+; GFX9-GISEL-NEXT: s_lshl_b32 s3, s3, 8
+; GFX9-GISEL-NEXT: s_or_b32 s2, s3, s2
+; GFX9-GISEL-NEXT: s_or_b32 s3, s2, 0x10000
+; GFX9-GISEL-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX9-GISEL-NEXT: s_ff1_i32_b32 s3, s3
+; GFX9-GISEL-NEXT: s_cmp_eq_u32 s2, 0
+; GFX9-GISEL-NEXT: s_cselect_b32 s2, 0xffff, s3
+; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX9-GISEL-NEXT: global_store_short v0, v1, s[0:1]
; GFX9-GISEL-NEXT: s_endpgm
%val = load i16, ptr addrspace(1) %arrayidx, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
index 77029d109a6be..14543fcf6ce88 100644
--- a/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
+++ b/llvm/test/CodeGen/AMDGPU/dst-sel-hazard.mir
@@ -370,7 +370,7 @@ body: |
; HAZARD-LABEL: name: inline_sdwa_hazard
; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
; HAZARD-NEXT: {{ $}}
- ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
+ ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", attdialect, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
; HAZARD-NEXT: S_NOP 0
; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
; HAZARD-NEXT: S_ENDPGM 0
@@ -378,10 +378,10 @@ body: |
; NOHAZARD-LABEL: name: inline_sdwa_hazard
; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
; NOHAZARD-NEXT: {{ $}}
- ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
+ ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", attdialect, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
; NOHAZARD-NEXT: S_ENDPGM 0
- INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
+ INLINEASM &"v_or_b32 $0, 0, $1", attdialect, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
S_ENDPGM 0
...
@@ -397,17 +397,17 @@ body: |
; HAZARD-NEXT: {{ $}}
; HAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
; HAZARD-NEXT: S_NOP 0
- ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
+ ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", attdialect, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
; HAZARD-NEXT: S_ENDPGM 0
;
; NOHAZARD-LABEL: name: sdwa_inline_hazard
; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
; NOHAZARD-NEXT: {{ $}}
; NOHAZARD-NEXT: renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
- ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
+ ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", attdialect, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
; NOHAZARD-NEXT: S_ENDPGM 0
renamable $vgpr0 = V_ADD_U16_sdwa 0, $vgpr1, 0, $vgpr2, 0, 1, 0, 3, 3, implicit $exec, implicit killed $vgpr0(tied-def 0)
- INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
+ INLINEASM &"v_or_b32 $0, 0, $1", attdialect, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
S_ENDPGM 0
...
@@ -421,19 +421,19 @@ body: |
; HAZARD-LABEL: name: inline_inline_hazard
; HAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
; HAZARD-NEXT: {{ $}}
- ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
+ ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", attdialect, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
; HAZARD-NEXT: S_NOP 0
- ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
+ ; HAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", attdialect, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
; HAZARD-NEXT: S_ENDPGM 0
;
; NOHAZARD-LABEL: name: inline_inline_hazard
; NOHAZARD: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $exec, $mode
; NOHAZARD-NEXT: {{ $}}
- ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
- ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
+ ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", attdialect, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
+ ; NOHAZARD-NEXT: INLINEASM &"v_or_b32 $0, 0, $1", attdialect, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
; NOHAZARD-NEXT: S_ENDPGM 0
- INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
- INLINEASM &"v_or_b32 $0, 0, $1", 0 /* attdialect */, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
+ INLINEASM &"v_or_b32 $0, 0, $1", attdialect, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
+ INLINEASM &"v_or_b32 $0, 0, $1", attdialect, regdef:VGPR_32, def $vgpr0, reguse:VGPR_32, $vgpr1
S_ENDPGM 0
...
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
index f3dd4cbeda16d..e309d36f6fb05 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-atomics.ll
@@ -6,12 +6,12 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefix=GFX1100
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefix=GFX12
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=G_SI
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=G_GFX7
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=G_GFX10
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=G_GFX1030
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefix=G_GFX1100
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefix=GFX12
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=G_SI
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=G_GFX7
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=G_GFX10
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=G_GFX1030
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefix=G_GFX1100
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefix=GFX12
declare float @llvm.amdgcn.raw.buffer.atomic.fmin.f32(float, <4 x i32>, i32, i32, i32 immarg)
declare float @llvm.amdgcn.raw.buffer.atomic.fmax.f32(float, <4 x i32>, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
index fd4046a70687d..50b15260cd3c8 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-buffer-ptr-atomics.ll
@@ -5,11 +5,11 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=GFX1030
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefix=GFX1100
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=G_SI
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=G_GFX7
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=G_GFX10
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=G_GFX1030
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefix=G_GFX1100
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=G_SI
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=G_GFX7
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=G_GFX10
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=G_GFX1030
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefix=G_GFX1100
declare float @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f32(float, ptr addrspace(8), i32, i32, i32 immarg)
declare float @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f32(float, ptr addrspace(8), i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
index 2d38924a28da4..e987190e335d3 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-atomics.ll
@@ -4,10 +4,10 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=GFX10
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=GFX1030
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=G_SI
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=G_GFX7
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=G_GFX10
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=G_GFX1030
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=G_SI
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=G_GFX7
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=G_GFX10
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=G_GFX1030
declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32 immarg)
declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
index b8363da4e4887..2a7027c3393b2 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-min-max-buffer-ptr-atomics.ll
@@ -4,10 +4,10 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=GFX10
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=GFX1030
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=G_SI
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=G_GFX7
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=G_GFX10
-; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=G_GFX1030
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=verde | FileCheck %s -check-prefix=G_SI
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=G_GFX7
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefix=G_GFX10
+; RUN: llc < %s -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1030 | FileCheck %s -check-prefix=G_GFX1030
declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double, ptr addrspace(8), i32, i32, i32 immarg)
declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
index 673a3fae1e59e..d5dfb505cd7ca 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-arg-v5-opt.ll
@@ -291,6 +291,179 @@ define amdgpu_kernel void @get_remainder_x_existing_range(ptr addrspace(1) %out)
ret void
}
+define i16 @get_grid_dims_i16() #2 {
+; GCN-LABEL: @get_grid_dims_i16(
+; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT: [[GEP_GRID_DIMS:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 64
+; GCN-NEXT: [[GRID_DIMS:%.*]] = load i16, ptr addrspace(4) [[GEP_GRID_DIMS]], align 4, !range [[RNG5:![0-9]+]]
+; GCN-NEXT: ret i16 [[GRID_DIMS]]
+;
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.dims = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 64
+ %grid.dims = load i16, ptr addrspace(4) %gep.grid.dims, align 2
+ ret i16 %grid.dims
+}
+
+; Ignore wrong type
+define half @get_grid_dims_f16() #2 {
+; GCN-half: @get_grid_dims_i16(
+; GCN-LABEL: @get_grid_dims_f16(
+; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT: [[GEP_GRID_DIMS:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 64
+; GCN-NEXT: [[GRID_DIMS:%.*]] = load half, ptr addrspace(4) [[GEP_GRID_DIMS]], align 4
+; GCN-NEXT: ret half [[GRID_DIMS]]
+;
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.dims = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 64
+ %grid.dims = load half, ptr addrspace(4) %gep.grid.dims, align 2
+ ret half %grid.dims
+}
+
+; Undersized, OK
+define i8 @get_grid_dims_i8() #2 {
+; GCN-LABEL: @get_grid_dims_i8(
+; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT: [[GEP_GRID_DIMS:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 64
+; GCN-NEXT: [[GRID_DIMS:%.*]] = load i8, ptr addrspace(4) [[GEP_GRID_DIMS]], align 4, !range [[RNG6:![0-9]+]]
+; GCN-NEXT: ret i8 [[GRID_DIMS]]
+;
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.dims = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 64
+ %grid.dims = load i8, ptr addrspace(4) %gep.grid.dims, align 2
+ ret i8 %grid.dims
+}
+
+define i1 @get_grid_dims_i1() #2 {
+; GCN-LABEL: @get_grid_dims_i1(
+; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT: [[GEP_GRID_DIMS:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 64
+; GCN-NEXT: [[GRID_DIMS:%.*]] = load i1, ptr addrspace(4) [[GEP_GRID_DIMS]], align 4
+; GCN-NEXT: ret i1 [[GRID_DIMS]]
+;
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.dims = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 64
+ %grid.dims = load i1, ptr addrspace(4) %gep.grid.dims, align 1
+ ret i1 %grid.dims
+}
+
+; Undersized, theoretically ok but would require special case
+; construction of the wrapped range.
+define i2 @get_grid_dims_i2() #2 {
+; GCN-LABEL: @get_grid_dims_i2(
+; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT: [[GEP_GRID_DIMS:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 64
+; GCN-NEXT: [[GRID_DIMS:%.*]] = load i2, ptr addrspace(4) [[GEP_GRID_DIMS]], align 4
+; GCN-NEXT: ret i2 [[GRID_DIMS]]
+;
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.dims = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 64
+ %grid.dims = load i2, ptr addrspace(4) %gep.grid.dims, align 1
+ ret i2 %grid.dims
+}
+
+define i3 @get_grid_dims_i3() #2 {
+; GCN-LABEL: @get_grid_dims_i3(
+; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT: [[GEP_GRID_DIMS:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 64
+; GCN-NEXT: [[GRID_DIMS:%.*]] = load i3, ptr addrspace(4) [[GEP_GRID_DIMS]], align 4, !range [[RNG7:![0-9]+]]
+; GCN-NEXT: ret i3 [[GRID_DIMS]]
+;
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.dims = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 64
+ %grid.dims = load i3, ptr addrspace(4) %gep.grid.dims, align 1
+ ret i3 %grid.dims
+}
+
+; Oversized, ignore
+define i32 @get_grid_dims_i32() #2 {
+; GCN-LABEL: @get_grid_dims_i32(
+; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT: [[GEP_GRID_DIMS:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 64
+; GCN-NEXT: [[GRID_DIMS:%.*]] = load i32, ptr addrspace(4) [[GEP_GRID_DIMS]], align 4
+; GCN-NEXT: ret i32 [[GRID_DIMS]]
+;
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.dims = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 64
+ %grid.dims = load i32, ptr addrspace(4) %gep.grid.dims, align 2
+ ret i32 %grid.dims
+}
+
+define i16 @get_grid_dims_reqd_work_group_size_1d() #2 !reqd_work_group_size !2 {
+; GCN-LABEL: @get_grid_dims_reqd_work_group_size_1d(
+; GCN-NEXT: ret i16 1
+;
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.dims = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 64
+ %grid.dims = load i16, ptr addrspace(4) %gep.grid.dims, align 2
+ ret i16 %grid.dims
+}
+
+define i16 @get_grid_dims_reqd_work_group_size_2d() #2 !reqd_work_group_size !3 {
+; GCN-LABEL: @get_grid_dims_reqd_work_group_size_2d(
+; GCN-NEXT: ret i16 2
+;
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.dims = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 64
+ %grid.dims = load i16, ptr addrspace(4) %gep.grid.dims, align 2
+ ret i16 %grid.dims
+}
+
+define i16 @get_grid_dims_reqd_work_group_size_2d_weird() #2 !reqd_work_group_size !5 {
+; GCN-LABEL: @get_grid_dims_reqd_work_group_size_2d_weird(
+; GCN-NEXT: ret i16 2
+;
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.dims = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 64
+ %grid.dims = load i16, ptr addrspace(4) %gep.grid.dims, align 2
+ ret i16 %grid.dims
+}
+
+define i16 @get_grid_dims_reqd_work_group_size_3d() #2 !reqd_work_group_size !0 {
+; GCN-LABEL: @get_grid_dims_reqd_work_group_size_3d(
+; GCN-NEXT: ret i16 3
+;
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.dims = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 64
+ %grid.dims = load i16, ptr addrspace(4) %gep.grid.dims, align 2
+ ret i16 %grid.dims
+}
+
+define i16 @get_grid_dims_reqd_work_group_size_3d_weird() #2 !reqd_work_group_size !4 {
+; GCN-LABEL: @get_grid_dims_reqd_work_group_size_3d_weird(
+; GCN-NEXT: ret i16 3
+;
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.dims = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 64
+ %grid.dims = load i16, ptr addrspace(4) %gep.grid.dims, align 2
+ ret i16 %grid.dims
+}
+
+define i1 @get_grid_dims_i1_reqd_work_group_size() #2 !reqd_work_group_size !3 {
+; GCN-LABEL: @get_grid_dims_i1_reqd_work_group_size(
+; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT: [[GEP_GRID_DIMS:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 64
+; GCN-NEXT: [[GRID_DIMS:%.*]] = load i1, ptr addrspace(4) [[GEP_GRID_DIMS]], align 4
+; GCN-NEXT: ret i1 [[GRID_DIMS]]
+;
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.dims = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 64
+ %grid.dims = load i1, ptr addrspace(4) %gep.grid.dims, align 1
+ ret i1 %grid.dims
+}
+
+define i16 @get_grid_dims_existing_range() #2 {
+; GCN-LABEL: @get_grid_dims_existing_range(
+; GCN-NEXT: [[IMPLICITARG_PTR:%.*]] = tail call dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+; GCN-NEXT: [[GEP_GRID_DIMS:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(4) [[IMPLICITARG_PTR]], i64 64
+; GCN-NEXT: [[GRID_DIMS:%.*]] = load i16, ptr addrspace(4) [[GEP_GRID_DIMS]], align 4, !range [[RNG12:![0-9]+]]
+; GCN-NEXT: ret i16 [[GRID_DIMS]]
+;
+ %implicitarg.ptr = tail call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
+ %gep.grid.dims = getelementptr inbounds i8, ptr addrspace(4) %implicitarg.ptr, i64 64
+ %grid.dims = load i16, ptr addrspace(4) %gep.grid.dims, align 2, !range !{i16 1, i16 2}
+ ret i16 %grid.dims
+}
+
declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #1
declare i32 @llvm.amdgcn.workgroup.id.x() #1
declare i32 @llvm.amdgcn.workgroup.id.y() #1
@@ -303,8 +476,18 @@ attributes #1 = { nounwind readnone speculatable }
attributes #2 = { nounwind }
!0 = !{i32 8, i32 16, i32 2}
!1 = !{i32 1, !"amdhsa_code_object_version", i32 500}
+
+!2 = !{i32 64, i32 1, i32 1}
+!3 = !{i32 32, i32 4, i32 1}
+!4 = !{i32 32, i32 1, i32 2}
+!5 = !{i32 1, i32 32, i32 1}
+
;.
; GCN: [[RNG1]] = !{i16 0, i16 1024}
; GCN: [[RNG2]] = !{i16 1, i16 1025}
; GCN: [[RNG4]] = !{i16 0, i16 10}
+; GCN: [[RNG5]] = !{i16 1, i16 4}
+; GCN: [[RNG6]] = !{i8 1, i8 4}
+; GCN: [[RNG7]] = !{i3 1, i3 -4}
+; GCN: [[RNG12]] = !{i16 1, i16 2}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index abb6ccc5faadb..81b9aae775ed8 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -246,6 +246,9 @@
; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O1-NEXT: Natural Loop Information
+; GCN-O1-NEXT: Post-Dominator Tree Construction
+; GCN-O1-NEXT: Branch Probability Analysis
+; GCN-O1-NEXT: Block Frequency Analysis
; GCN-O1-NEXT: CodeGen Prepare
; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O1-NEXT: AMDGPU lower intrinsics
@@ -552,6 +555,9 @@
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O1-OPTS-NEXT: Natural Loop Information
+; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction
+; GCN-O1-OPTS-NEXT: Branch Probability Analysis
+; GCN-O1-OPTS-NEXT: Block Frequency Analysis
; GCN-O1-OPTS-NEXT: CodeGen Prepare
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl)
@@ -874,6 +880,9 @@
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O2-NEXT: Natural Loop Information
+; GCN-O2-NEXT: Post-Dominator Tree Construction
+; GCN-O2-NEXT: Branch Probability Analysis
+; GCN-O2-NEXT: Block Frequency Analysis
; GCN-O2-NEXT: CodeGen Prepare
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl)
@@ -1211,6 +1220,9 @@
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O3-NEXT: Natural Loop Information
+; GCN-O3-NEXT: Post-Dominator Tree Construction
+; GCN-O3-NEXT: Branch Probability Analysis
+; GCN-O3-NEXT: Block Frequency Analysis
; GCN-O3-NEXT: CodeGen Prepare
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
index a57b43a81205b..20e60907043ba 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950-SDAG %s
-; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3))
declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3))
@@ -135,23 +135,14 @@ entry:
}
define amdgpu_ps void @ds_read_b64_tr_b16_v4bf16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
-; GFX950-SDAG-LABEL: ds_read_b64_tr_b16_v4bf16:
-; GFX950-SDAG: ; %bb.0: ; %entry
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
-; GFX950-SDAG-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
-; GFX950-SDAG-NEXT: s_endpgm
-;
-; GFX950-GISEL-LABEL: ds_read_b64_tr_b16_v4bf16:
-; GFX950-GISEL: ; %bb.0: ; %entry
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v1
-; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
-; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-LABEL: ds_read_b64_tr_b16_v4bf16:
+; GFX950: ; %bb.0: ; %entry
+; GFX950-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: global_store_dwordx2 v[2:3], v[0:1], off
+; GFX950-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
%val = call <4 x bfloat> @llvm.amdgcn.ds.read.tr16.b64.v4bf16.p3(ptr addrspace(3) %gep)
@@ -159,30 +150,79 @@ entry:
ret void
}
+define amdgpu_ps void @ds_read_b64_tr_b4_s(ptr addrspace(3) inreg %addr, ptr addrspace(1) %use) {
+; GFX950-LABEL: ds_read_b64_tr_b4_s:
+; GFX950: ; %bb.0: ; %entry
+; GFX950-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-NEXT: ds_read_b64_tr_b4 v[2:3], v2 offset:32
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX950-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3) %gep)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @ds_read_b96_tr_b6_s(ptr addrspace(3) inreg %addr, ptr addrspace(1) %use) {
+; GFX950-LABEL: ds_read_b96_tr_b6_s:
+; GFX950: ; %bb.0: ; %entry
+; GFX950-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-NEXT: ds_read_b96_tr_b6 v[2:4], v2 offset:32
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: global_store_dwordx3 v[0:1], v[2:4], off
+; GFX950-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
+ store <3 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @ds_read_b64_tr_b8_s(ptr addrspace(3) inreg %addr, ptr addrspace(1) %use) {
+; GFX950-LABEL: ds_read_b64_tr_b8_s:
+; GFX950: ; %bb.0: ; %entry
+; GFX950-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-NEXT: ds_read_b64_tr_b8 v[2:3], v2 offset:32
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX950-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3) %gep)
+ store <2 x i32> %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_ps void @ds_read_b64_tr_b16_s(ptr addrspace(3) inreg %addr, ptr addrspace(1) %use) {
+; GFX950-LABEL: ds_read_b64_tr_b16_s:
+; GFX950: ; %bb.0: ; %entry
+; GFX950-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-NEXT: ds_read_b64_tr_b16 v[2:3], v2 offset:32
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX950-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4
+ %val = call <4 x i16> @llvm.amdgcn.ds.read.tr16.b64.v4i16.p3(ptr addrspace(3) %gep)
+ store <4 x i16> %val, ptr addrspace(1) %use
+ ret void
+}
+
; This is a special case that does not require aligned VGPRs. Make
; sure no copies are required for the unaligned ABI return value.
define { i32, <3 x i32> } @ds_read_b96_tr_b6_no_align2_requirement(ptr addrspace(3) %ptr) {
-; GFX950-SDAG-LABEL: ds_read_b96_tr_b6_no_align2_requirement:
-; GFX950-SDAG: ; %bb.0:
-; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[2:4], v0 offset:32
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v2
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v3
-; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v4
-; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: ds_read_b96_tr_b6_no_align2_requirement:
-; GFX950-GISEL: ; %bb.0:
-; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT: ds_read_b96_tr_b6 v[2:4], v0 offset:32
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v2
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v3
-; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v4
-; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: ds_read_b96_tr_b6_no_align2_requirement:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: ds_read_b96_tr_b6 v[2:4], v0 offset:32
+; GFX950-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, v2
+; GFX950-NEXT: v_mov_b32_e32 v2, v3
+; GFX950-NEXT: v_mov_b32_e32 v3, v4
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
%val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
%insert0 = insertvalue { i32, <3 x i32> } poison, i32 0, 0
@@ -191,31 +231,18 @@ define { i32, <3 x i32> } @ds_read_b96_tr_b6_no_align2_requirement(ptr addrspace
}
define void @ds_read_b96_tr_b6_no_align2_requirement_agpr(ptr addrspace(3) %ptr) {
-; GFX950-SDAG-LABEL: ds_read_b96_tr_b6_no_align2_requirement_agpr:
-; GFX950-SDAG: ; %bb.0:
-; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-SDAG-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
-; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a2, v1
-; GFX950-SDAG-NEXT: v_accvgpr_write_b32 a3, v2
-; GFX950-SDAG-NEXT: ;;#ASMSTART
-; GFX950-SDAG-NEXT: ; use a1 a2 a3
-; GFX950-SDAG-NEXT: ;;#ASMEND
-; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX950-GISEL-LABEL: ds_read_b96_tr_b6_no_align2_requirement_agpr:
-; GFX950-GISEL: ; %bb.0:
-; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-GISEL-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
-; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a1, v0
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a2, v1
-; GFX950-GISEL-NEXT: v_accvgpr_write_b32 a3, v2
-; GFX950-GISEL-NEXT: ;;#ASMSTART
-; GFX950-GISEL-NEXT: ; use a1 a2 a3
-; GFX950-GISEL-NEXT: ;;#ASMEND
-; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX950-LABEL: ds_read_b96_tr_b6_no_align2_requirement_agpr:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: ds_read_b96_tr_b6 v[0:2], v0 offset:32
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a2, v1
+; GFX950-NEXT: v_accvgpr_write_b32 a3, v2
+; GFX950-NEXT: ;;#ASMSTART
+; GFX950-NEXT: ; use a1 a2 a3
+; GFX950-NEXT: ;;#ASMEND
+; GFX950-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i64, ptr addrspace(3) %ptr, i32 4
%val = call <3 x i32> @llvm.amdgcn.ds.read.tr6.b96.v3i32.p3(ptr addrspace(3) %gep)
%val0 = extractelement <3 x i32> %val, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 139c698873d67..553ac88dfef89 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -2,10 +2,11 @@
; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefix=VARIANT0 %s
; RUN: llc -mtriple=amdgcn -mattr=+auto-waitcnt-before-barrier < %s | FileCheck --check-prefix=VARIANT1 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=VARIANT2 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=VARIANT2-GISEL %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+auto-waitcnt-before-barrier < %s | FileCheck --check-prefix=VARIANT3 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=VARIANT4 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+auto-waitcnt-before-barrier < %s | FileCheck --check-prefix=VARIANT5 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=VARIANT6 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck --check-prefix=VARIANT6 %s
define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT0-LABEL: test_barrier:
@@ -70,6 +71,28 @@ define amdgpu_kernel void @test_barrier(ptr addrspace(1) %out, i32 %size) #0 {
; VARIANT2-NEXT: global_store_dword v2, v0, s[0:1]
; VARIANT2-NEXT: s_endpgm
;
+; VARIANT2-GISEL-LABEL: test_barrier:
+; VARIANT2-GISEL: ; %bb.0: ; %entry
+; VARIANT2-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; VARIANT2-GISEL-NEXT: s_load_dword s2, s[4:5], 0x2c
+; VARIANT2-GISEL-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VARIANT2-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; VARIANT2-GISEL-NEXT: global_store_dword v4, v0, s[0:1]
+; VARIANT2-GISEL-NEXT: s_add_i32 s2, s2, -1
+; VARIANT2-GISEL-NEXT: v_sub_u32_e32 v0, s2, v0
+; VARIANT2-GISEL-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; VARIANT2-GISEL-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1]
+; VARIANT2-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; VARIANT2-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; VARIANT2-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; VARIANT2-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
+; VARIANT2-GISEL-NEXT: s_waitcnt vmcnt(0)
+; VARIANT2-GISEL-NEXT: s_barrier
+; VARIANT2-GISEL-NEXT: global_load_dword v0, v[0:1], off
+; VARIANT2-GISEL-NEXT: s_waitcnt vmcnt(0)
+; VARIANT2-GISEL-NEXT: global_store_dword v4, v0, s[0:1]
+; VARIANT2-GISEL-NEXT: s_endpgm
+;
; VARIANT3-LABEL: test_barrier:
; VARIANT3: ; %bb.0: ; %entry
; VARIANT3-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sethalt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sethalt.ll
index d9dcea46debcc..621cda28b0ac3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sethalt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sethalt.ll
@@ -25,4 +25,25 @@ define amdgpu_kernel void @test_s_sethalt() {
ret void
}
+define amdgpu_gs void @if_sethalt(i32 %flag) #0 {
+; GCN-LABEL: if_sethalt:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GCN-NEXT: s_cbranch_execz .LBB1_2
+; GCN-NEXT: ; %bb.1: ; %sethalt
+; GCN-NEXT: s_sethalt 1
+; GCN-NEXT: .LBB1_2: ; %end
+; GCN-NEXT: s_endpgm
+ %cond = icmp eq i32 %flag, 0
+ br i1 %cond, label %sethalt, label %end
+
+sethalt:
+ call void @llvm.amdgcn.s.sethalt(i32 1)
+ br label %end
+
+end:
+ ret void
+}
+
declare void @llvm.amdgcn.s.sethalt(i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll
index fbccbd5ff29af..5e8a6ce749e96 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.event.ll
@@ -1,7 +1,7 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s
; GCN-LABEL: {{^}}test_wait_event_export_ready:
; GFX11: s_wait_event 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx12.ll
index 0d7bab16be6b5..a291e30be7acd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx12.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx12.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
define amdgpu_ps void @test_bvhcnt() {
; GFX12-LABEL: test_bvhcnt:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll
index b0574bf021014..6de63908eceaa 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s -check-prefix=GFX12
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s -check-prefix=GFX12
define amdgpu_ps void @test_asynccnt() {
; GFX12-LABEL: test_asynccnt:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
index d8ed6a171173f..75e6c585b4331 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
@@ -1,5 +1,7 @@
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s
; CHECK-LABEL: {{^}}test1:
; CHECK-NOT: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
index a2443bf25ef31..948e02fcb5a89 100644
--- a/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
+++ b/llvm/test/CodeGen/AMDGPU/memintrinsic-unroll.ll
@@ -9863,8 +9863,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; CHECK-NEXT: s_andn2_saveexec_b32 s6, s6
; CHECK-NEXT: s_cbranch_execz .LBB8_6
; CHECK-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
-; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
; CHECK-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
+; CHECK-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
; CHECK-NEXT: s_movk_i32 s4, 0xf800
; CHECK-NEXT: s_mov_b32 s5, -1
; CHECK-NEXT: .LBB8_5: ; %memmove_bwd_loop
@@ -11179,8 +11179,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; ALIGNED-NEXT: s_andn2_saveexec_b32 s6, s6
; ALIGNED-NEXT: s_cbranch_execz .LBB8_6
; ALIGNED-NEXT: ; %bb.4: ; %memmove_bwd_loop.preheader
-; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
; ALIGNED-NEXT: v_add_nc_u32_e32 v0, 0x700, v0
+; ALIGNED-NEXT: v_add_nc_u32_e32 v1, 0x700, v1
; ALIGNED-NEXT: s_movk_i32 s4, 0xf800
; ALIGNED-NEXT: s_mov_b32 s5, -1
; ALIGNED-NEXT: .LBB8_5: ; %memmove_bwd_loop
@@ -12393,8 +12393,8 @@ define void @memmove_p5_p5_sz2048(ptr addrspace(5) align 1 %dst, ptr addrspace(5
; UNROLL3-NEXT: buffer_load_dword v4, v1, s[0:3], 0 offen offset:2024
; UNROLL3-NEXT: buffer_load_dword v5, v1, s[0:3], 0 offen offset:2020
; UNROLL3-NEXT: buffer_load_dword v6, v1, s[0:3], 0 offen offset:2016
-; UNROLL3-NEXT: v_add_nc_u32_e32 v1, 0x7b0, v1
; UNROLL3-NEXT: v_add_nc_u32_e32 v2, 0x7b0, v0
+; UNROLL3-NEXT: v_add_nc_u32_e32 v1, 0x7b0, v1
; UNROLL3-NEXT: s_waitcnt vmcnt(3)
; UNROLL3-NEXT: buffer_store_dword v3, v0, s[0:3], 0 offen offset:2028
; UNROLL3-NEXT: s_waitcnt vmcnt(2)
diff --git a/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir b/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir
index b96907be2fa1d..e795751b57fb4 100644
--- a/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir
+++ b/llvm/test/CodeGen/AMDGPU/subreg-undef-def-with-other-subreg-defs.mir
@@ -28,9 +28,9 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (load (s32), addrspace 3)
- ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, regdef:VGPR_32, def [[V_MOV_B32_e32_]], reguse tiedto:$0, [[V_MOV_B32_e32_]](tied-def 3)
- ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, reguse:VGPR_32, [[DS_READ_B32_gfx9_]]
- ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, regdef:VGPR_32, def undef [[V_MOV_B32_e32_]].sub0, regdef:VGPR_32, def undef [[V_MOV_B32_e32_]].sub1
+ ; CHECK-NEXT: INLINEASM &"", sideeffect attdialect, regdef:VGPR_32, def [[V_MOV_B32_e32_]], reguse tiedto:$0, [[V_MOV_B32_e32_]](tied-def 3)
+ ; CHECK-NEXT: INLINEASM &"", sideeffect attdialect, reguse:VGPR_32, [[DS_READ_B32_gfx9_]]
+ ; CHECK-NEXT: INLINEASM &"", sideeffect attdialect, regdef:VGPR_32, def undef [[V_MOV_B32_e32_]].sub0, regdef:VGPR_32, def undef [[V_MOV_B32_e32_]].sub1
; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1
; CHECK-NEXT: $sgpr10 = S_MOV_B32 -1
; CHECK-NEXT: S_BRANCH %bb.1
@@ -41,9 +41,9 @@ body: |
bb.1:
%2:vgpr_32 = DS_READ_B32_gfx9 %1, 0, 0, implicit $exec :: (load (s32), addrspace 3)
- INLINEASM &"", 1 /* sideeffect attdialect */, 1114122 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3)
- INLINEASM &"", 1 /* sideeffect attdialect */, 1114121 /* reguse:VGPR_32 */, %2
- INLINEASM &"", 1 /* sideeffect attdialect */, 1114122 /* regdef:VGPR_32 */, def undef %0.sub0, 1114122 /* regdef:VGPR_32 */, def %0.sub1
+ INLINEASM &"", sideeffect attdialect, regdef:VGPR_32, def %0, reguse tiedto:$0, %0(tied-def 3)
+ INLINEASM &"", sideeffect attdialect, reguse:VGPR_32, %2
+ INLINEASM &"", sideeffect attdialect, regdef:VGPR_32, def undef %0.sub0, regdef:VGPR_32, def %0.sub1
S_NOP 0, implicit %0.sub1
$sgpr10 = S_MOV_B32 -1
S_BRANCH %bb.1
@@ -69,9 +69,9 @@ body: |
; CHECK-NEXT: successors: %bb.1(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (load (s32), addrspace 3)
- ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, regdef:VGPR_32, def [[V_MOV_B32_e32_]], reguse tiedto:$0, [[V_MOV_B32_e32_]](tied-def 3)
- ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, reguse:VGPR_32, [[DS_READ_B32_gfx9_]]
- ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, regdef:VGPR_32, def undef [[V_MOV_B32_e32_]].sub1, regdef:VGPR_32, def undef [[V_MOV_B32_e32_]].sub0
+ ; CHECK-NEXT: INLINEASM &"", sideeffect attdialect, regdef:VGPR_32, def [[V_MOV_B32_e32_]], reguse tiedto:$0, [[V_MOV_B32_e32_]](tied-def 3)
+ ; CHECK-NEXT: INLINEASM &"", sideeffect attdialect, reguse:VGPR_32, [[DS_READ_B32_gfx9_]]
+ ; CHECK-NEXT: INLINEASM &"", sideeffect attdialect, regdef:VGPR_32, def undef [[V_MOV_B32_e32_]].sub1, regdef:VGPR_32, def undef [[V_MOV_B32_e32_]].sub0
; CHECK-NEXT: S_NOP 0, implicit [[V_MOV_B32_e32_]].sub1
; CHECK-NEXT: $sgpr10 = S_MOV_B32 -1
; CHECK-NEXT: S_BRANCH %bb.1
@@ -82,9 +82,9 @@ body: |
bb.1:
%2:vgpr_32 = DS_READ_B32_gfx9 %1, 0, 0, implicit $exec :: (load (s32), addrspace 3)
- INLINEASM &"", 1 /* sideeffect attdialect */, 1114122 /* regdef:VGPR_32 */, def %0, 2147483657 /* reguse tiedto:$0 */, %0(tied-def 3)
- INLINEASM &"", 1 /* sideeffect attdialect */, 1114121 /* reguse:VGPR_32 */, %2
- INLINEASM &"", 1 /* sideeffect attdialect */, 1114122 /* regdef:VGPR_32 */, def %0.sub1, 1114122 /* regdef:VGPR_32 */, def undef %0.sub0
+ INLINEASM &"", sideeffect attdialect, regdef:VGPR_32, def %0, reguse tiedto:$0, %0(tied-def 3)
+ INLINEASM &"", sideeffect attdialect, reguse:VGPR_32, %2
+ INLINEASM &"", sideeffect attdialect, regdef:VGPR_32, def %0.sub1, regdef:VGPR_32, def undef %0.sub0
S_NOP 0, implicit %0.sub1
$sgpr10 = S_MOV_B32 -1
S_BRANCH %bb.1
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-coexecution-valu-hazards.mir b/llvm/test/CodeGen/AMDGPU/wmma-coexecution-valu-hazards.mir
index 7ddf66873b62e..c88fe11c71e3d 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-coexecution-valu-hazards.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-coexecution-valu-hazards.mir
@@ -930,3 +930,29 @@ body: |
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_SWMMAC_I32_16X16X128_IU8_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, killed $vgpr32_vgpr33, 0, 0, 0, 0, 0, implicit $exec
$vgpr32 = V_ADD_F32_e32 $vgpr34, $vgpr35, implicit $mode, implicit $exec
...
+
+---
+name: test_wmma_trans_B0_overlaps_D1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_trans_B0_overlaps_D1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: V_NOP_e32 implicit $exec
+ ; GFX1250-NEXT: $vgpr8 = V_EXP_F32_e32 $vgpr34, implicit $mode, implicit $exec
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
+ $vgpr8 = V_EXP_F32_e32 $vgpr34, implicit $mode, implicit $exec
+...
+
+---
+name: test_wmma_tdm_load_D0_overlaps_Use1
+body: |
+ bb.0:
+ ; GFX1250-LABEL: name: test_wmma_tdm_load_D0_overlaps_Use1
+ ; GFX1250: early-clobber $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
+ ; GFX1250-NEXT: GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR killed $sgpr0_sgpr1, killed $vgpr24, killed $vgpr0, 16, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+ $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X32_BF16_w32_twoaddr 8, $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, 0, 0, implicit $exec
+ GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR killed $sgpr0_sgpr1, killed $vgpr24, killed $vgpr0, 16, 0, implicit-def dead $asynccnt, implicit $exec, implicit $asynccnt
+...
diff --git a/llvm/test/CodeGen/AMDGPU/write_register.ll b/llvm/test/CodeGen/AMDGPU/write_register.ll
index eaf1088a22b54..74d0779a43b7c 100644
--- a/llvm/test/CodeGen/AMDGPU/write_register.ll
+++ b/llvm/test/CodeGen/AMDGPU/write_register.ll
@@ -1,5 +1,5 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=bonaire -enable-misched=0 < %s | FileCheck %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=bonaire -enable-misched=0 < %s | FileCheck %s
+; RUN: llc -global-isel=1 -new-reg-bank-select -mtriple=amdgcn -mcpu=bonaire -enable-misched=0 < %s | FileCheck %s
declare void @llvm.write_register.i32(metadata, i32) #0
declare void @llvm.write_register.i64(metadata, i64) #0
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index 98bb87524db44..db721b25f8e01 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -51,6 +51,9 @@
; CHECK-NEXT: Complex Deinterleaving Pass
; CHECK-NEXT: Interleaved Access Pass
; CHECK-NEXT: Type Promotion
+; CHECK-NEXT: Post-Dominator Tree Construction
+; CHECK-NEXT: Branch Probability Analysis
+; CHECK-NEXT: Block Frequency Analysis
; CHECK-NEXT: CodeGen Prepare
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Exception handling preparation
diff --git a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
index 7cc623fb0a616..a44c92687466e 100644
--- a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll
@@ -966,48 +966,15 @@ define i1 @scalar_i32_x_is_const2_eq(i32 %y) nounwind {
;------------------------------------------------------------------------------;
define i1 @negative_scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind {
-; ARM6-LABEL: negative_scalar_i8_bitsinmiddle_slt:
-; ARM6: @ %bb.0:
-; ARM6-NEXT: uxtb r1, r1
-; ARM6-NEXT: mov r2, #24
-; ARM6-NEXT: ands r0, r0, r2, lsr r1
-; ARM6-NEXT: mov r0, #0
-; ARM6-NEXT: movmi r0, #1
-; ARM6-NEXT: bx lr
-;
-; ARM78-LABEL: negative_scalar_i8_bitsinmiddle_slt:
-; ARM78: @ %bb.0:
-; ARM78-NEXT: uxtb r1, r1
-; ARM78-NEXT: mov r2, #24
-; ARM78-NEXT: ands r0, r0, r2, lsr r1
-; ARM78-NEXT: mov r0, #0
-; ARM78-NEXT: movwmi r0, #1
-; ARM78-NEXT: bx lr
-;
-; THUMB6-LABEL: negative_scalar_i8_bitsinmiddle_slt:
-; THUMB6: @ %bb.0:
-; THUMB6-NEXT: uxtb r1, r1
-; THUMB6-NEXT: movs r2, #24
-; THUMB6-NEXT: lsrs r2, r1
-; THUMB6-NEXT: ands r2, r0
-; THUMB6-NEXT: bmi .LBB20_2
-; THUMB6-NEXT: @ %bb.1:
-; THUMB6-NEXT: movs r0, #0
-; THUMB6-NEXT: bx lr
-; THUMB6-NEXT: .LBB20_2:
-; THUMB6-NEXT: movs r0, #1
-; THUMB6-NEXT: bx lr
+; ARM-LABEL: negative_scalar_i8_bitsinmiddle_slt:
+; ARM: @ %bb.0:
+; ARM-NEXT: mov r0, #0
+; ARM-NEXT: bx lr
;
-; THUMB78-LABEL: negative_scalar_i8_bitsinmiddle_slt:
-; THUMB78: @ %bb.0:
-; THUMB78-NEXT: uxtb r1, r1
-; THUMB78-NEXT: movs r2, #24
-; THUMB78-NEXT: lsr.w r1, r2, r1
-; THUMB78-NEXT: ands r0, r1
-; THUMB78-NEXT: mov.w r0, #0
-; THUMB78-NEXT: it mi
-; THUMB78-NEXT: movmi r0, #1
-; THUMB78-NEXT: bx lr
+; THUMB-LABEL: negative_scalar_i8_bitsinmiddle_slt:
+; THUMB: @ %bb.0:
+; THUMB-NEXT: movs r0, #0
+; THUMB-NEXT: bx lr
%t0 = lshr i8 24, %y
%t1 = and i8 %t0, %x
%res = icmp slt i8 %t1, 0
diff --git a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
index a8421ae9a6a89..c3ad3fc0217df 100644
--- a/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
+++ b/llvm/test/CodeGen/ARM/hoist-and-by-const-from-shl-in-eqcmp-zero.ll
@@ -1010,10 +1010,8 @@ define i1 @negative_scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind {
; ARM6-NEXT: uxtb r1, r1
; ARM6-NEXT: mov r2, #24
; ARM6-NEXT: and r0, r0, r2, lsl r1
-; ARM6-NEXT: sxtb r1, r0
-; ARM6-NEXT: mov r0, #0
-; ARM6-NEXT: cmp r1, #0
-; ARM6-NEXT: movmi r0, #1
+; ARM6-NEXT: mov r1, #1
+; ARM6-NEXT: and r0, r1, r0, lsr #7
; ARM6-NEXT: bx lr
;
; ARM78-LABEL: negative_scalar_i8_bitsinmiddle_slt:
@@ -1021,10 +1019,7 @@ define i1 @negative_scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind {
; ARM78-NEXT: uxtb r1, r1
; ARM78-NEXT: mov r2, #24
; ARM78-NEXT: and r0, r0, r2, lsl r1
-; ARM78-NEXT: sxtb r1, r0
-; ARM78-NEXT: mov r0, #0
-; ARM78-NEXT: cmp r1, #0
-; ARM78-NEXT: movwmi r0, #1
+; ARM78-NEXT: ubfx r0, r0, #7, #1
; ARM78-NEXT: bx lr
;
; THUMB6-LABEL: negative_scalar_i8_bitsinmiddle_slt:
@@ -1033,14 +1028,8 @@ define i1 @negative_scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind {
; THUMB6-NEXT: movs r2, #24
; THUMB6-NEXT: lsls r2, r1
; THUMB6-NEXT: ands r2, r0
-; THUMB6-NEXT: sxtb r0, r2
-; THUMB6-NEXT: cmp r0, #0
-; THUMB6-NEXT: bmi .LBB20_2
-; THUMB6-NEXT: @ %bb.1:
-; THUMB6-NEXT: movs r0, #0
-; THUMB6-NEXT: bx lr
-; THUMB6-NEXT: .LBB20_2:
-; THUMB6-NEXT: movs r0, #1
+; THUMB6-NEXT: lsls r0, r2, #24
+; THUMB6-NEXT: lsrs r0, r0, #31
; THUMB6-NEXT: bx lr
;
; THUMB78-LABEL: negative_scalar_i8_bitsinmiddle_slt:
@@ -1049,11 +1038,7 @@ define i1 @negative_scalar_i8_bitsinmiddle_slt(i8 %x, i8 %y) nounwind {
; THUMB78-NEXT: movs r2, #24
; THUMB78-NEXT: lsl.w r1, r2, r1
; THUMB78-NEXT: ands r0, r1
-; THUMB78-NEXT: sxtb r1, r0
-; THUMB78-NEXT: movs r0, #0
-; THUMB78-NEXT: cmp r1, #0
-; THUMB78-NEXT: it mi
-; THUMB78-NEXT: movmi r0, #1
+; THUMB78-NEXT: ubfx r0, r0, #7, #1
; THUMB78-NEXT: bx lr
%t0 = shl i8 24, %y
%t1 = and i8 %t0, %x
diff --git a/llvm/test/CodeGen/AVR/calling-conv/c/tiny.ll b/llvm/test/CodeGen/AVR/calling-conv/c/tiny.ll
index ab1035d086021..7257eab8e12a7 100644
--- a/llvm/test/CodeGen/AVR/calling-conv/c/tiny.ll
+++ b/llvm/test/CodeGen/AVR/calling-conv/c/tiny.ll
@@ -46,7 +46,7 @@ define i16 @foo3(i16 %a, i16 %b, i16 %c, i16 %d) {
; CHECK-NEXT: push r28
; CHECK-NEXT: push r29
; CHECK-NEXT: in r28, 61
-; CHECK-NEXT: in r29, 62
+; CHECK-NEXT: clr r29
; CHECK-NEXT: in r16, 63
; CHECK-NEXT: subi r28, 251
; CHECK-NEXT: sbci r29, 255
@@ -79,7 +79,7 @@ define i32 @foo4(i32 %a, i32 %b) {
; CHECK-NEXT: push r28
; CHECK-NEXT: push r29
; CHECK-NEXT: in r28, 61
-; CHECK-NEXT: in r29, 62
+; CHECK-NEXT: clr r29
; CHECK-NEXT: in r16, 63
; CHECK-NEXT: subi r28, 251
; CHECK-NEXT: sbci r29, 255
@@ -122,7 +122,7 @@ define i8 @foo5([5 x i8] %0, i8 %1) {
; CHECK-NEXT: push r28
; CHECK-NEXT: push r29
; CHECK-NEXT: in r28, 61
-; CHECK-NEXT: in r29, 62
+; CHECK-NEXT: clr r29
; CHECK-NEXT: mov r26, r28
; CHECK-NEXT: mov r27, r29
; CHECK-NEXT: subi r26, 251
@@ -157,7 +157,7 @@ define i8 @foo7([3 x i8] %0, [3 x i8] %1) {
; CHECK-NEXT: push r28
; CHECK-NEXT: push r29
; CHECK-NEXT: in r28, 61
-; CHECK-NEXT: in r29, 62
+; CHECK-NEXT: clr r29
; CHECK-NEXT: mov r26, r28
; CHECK-NEXT: mov r27, r29
; CHECK-NEXT: subi r26, 251
@@ -181,7 +181,7 @@ define i8 @foo8([3 x i8] %0, i8 %1, i8 %2) {
; CHECK-NEXT: push r28
; CHECK-NEXT: push r29
; CHECK-NEXT: in r28, 61
-; CHECK-NEXT: in r29, 62
+; CHECK-NEXT: clr r29
; CHECK-NEXT: add r22, r20
; CHECK-NEXT: mov r26, r28
; CHECK-NEXT: mov r27, r29
@@ -205,7 +205,7 @@ define i8 @foo9([7 x i8] %0) {
; CHECK-NEXT: push r28
; CHECK-NEXT: push r29
; CHECK-NEXT: in r28, 61
-; CHECK-NEXT: in r29, 62
+; CHECK-NEXT: clr r29
; CHECK-NEXT: mov r26, r28
; CHECK-NEXT: mov r27, r29
; CHECK-NEXT: subi r26, 250
@@ -233,7 +233,7 @@ define i8 @fooa([6 x i8] %0, i8 %1, i8 %2) {
; CHECK-NEXT: push r28
; CHECK-NEXT: push r29
; CHECK-NEXT: in r28, 61
-; CHECK-NEXT: in r29, 62
+; CHECK-NEXT: clr r29
; CHECK-NEXT: mov r26, r28
; CHECK-NEXT: mov r27, r29
; CHECK-NEXT: subi r26, 251
diff --git a/llvm/test/CodeGen/AVR/pseudo/SPREAD.mir b/llvm/test/CodeGen/AVR/pseudo/SPREAD.mir
new file mode 100644
index 0000000000000..dfa6358d76679
--- /dev/null
+++ b/llvm/test/CodeGen/AVR/pseudo/SPREAD.mir
@@ -0,0 +1,28 @@
+# RUN: llc -O0 -run-pass=avr-expand-pseudo -mtriple=avr -mcpu=attiny13 %s -o - \
+# RUN: | FileCheck --check-prefix=NOSPH %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo -mtriple=avr -mcpu=atmega328 %s -o - \
+# RUN: | FileCheck %s
+
+--- |
+ target triple = "avr--"
+ define void @test() {
+ entry:
+ ret void
+ }
+...
+
+---
+name: test
+body: |
+ bb.0.entry:
+
+ ; CHECK-LABEL: test
+ ; CHECK: $r14 = INRdA 61
+ ; CHECK: $r15 = INRdA 62
+
+ ; NOSPH-LABEL: test
+ ; NOSPH: $r14 = INRdA 61
+ ; NOSPH: $r15 = EORRdRr killed $r15, $r15, implicit-def dead $sreg
+
+ $r15r14 = SPREAD $sp, implicit $sp
+...
diff --git a/llvm/test/CodeGen/AVR/pseudo/SPWRITE.mir b/llvm/test/CodeGen/AVR/pseudo/SPWRITE.mir
index ed6e39c641b11..8be3589b564bb 100644
--- a/llvm/test/CodeGen/AVR/pseudo/SPWRITE.mir
+++ b/llvm/test/CodeGen/AVR/pseudo/SPWRITE.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo -mtriple=avr -mcpu=attiny11 %s -o - \
+# RUN: llc -O0 -run-pass=avr-expand-pseudo -mtriple=avr -mcpu=attiny13 %s -o - \
# RUN: | FileCheck --check-prefix=NOSPH %s
# RUN: llc -O0 -run-pass=avr-expand-pseudo -mtriple=avr -mcpu=atmega328 %s -o - \
# RUN: | FileCheck %s
@@ -34,14 +34,11 @@ body: |
; CHECK: OUTARr 61, $r14
; NOSPH-LABEL: test
- ; NOSPH: $r0 = INRdA 63
- ; NOSPH: BCLRs 7, implicit-def $sreg
- ; NOSPH: OUTARr 63, killed $r0
; NOSPH: OUTARr 61, $r14
; XMEGA-LABEL: test
; XMEGA-LABEL: OUTARr 61, $r14
; XMEGA-LABEL: OUTARr 62, $r15
- $sp = SPWRITE implicit-def $sp, implicit $sp, $r15r14
+ $sp = SPWRITE implicit-def $sp, $r15r14, implicit $sp
...
diff --git a/llvm/test/CodeGen/AVR/return.ll b/llvm/test/CodeGen/AVR/return.ll
index 93dfa257c4b33..455cf75a7159c 100644
--- a/llvm/test/CodeGen/AVR/return.ll
+++ b/llvm/test/CodeGen/AVR/return.ll
@@ -135,7 +135,7 @@ define i32 @return32_arg2(i32 %x, i32 %y, i32 %z) {
; TINY-NEXT: push r28
; TINY-NEXT: push r29
; TINY-NEXT: in r28, 61
-; TINY-NEXT: in r29, 62
+; TINY-NEXT: clr r29
; TINY-NEXT: in r16, 63
; TINY-NEXT: subi r28, 247
; TINY-NEXT: sbci r29, 255
@@ -221,7 +221,7 @@ define i64 @return64_arg(i64 %x) {
; TINY-NEXT: push r28
; TINY-NEXT: push r29
; TINY-NEXT: in r28, 61
-; TINY-NEXT: in r29, 62
+; TINY-NEXT: clr r29
; TINY-NEXT: in r16, 63
; TINY-NEXT: subi r28, 245
; TINY-NEXT: sbci r29, 255
@@ -314,7 +314,7 @@ define i64 @return64_arg2(i64 %x, i64 %y, i64 %z) {
; TINY-NEXT: push r28
; TINY-NEXT: push r29
; TINY-NEXT: in r28, 61
-; TINY-NEXT: in r29, 62
+; TINY-NEXT: clr r29
; TINY-NEXT: in r16, 63
; TINY-NEXT: subi r28, 229
; TINY-NEXT: sbci r29, 255
@@ -403,7 +403,7 @@ define i32 @return64_trunc(i32 %a, i32 %b, i32 %c, i64 %d) {
; TINY-NEXT: push r28
; TINY-NEXT: push r29
; TINY-NEXT: in r28, 61
-; TINY-NEXT: in r29, 62
+; TINY-NEXT: clr r29
; TINY-NEXT: in r16, 63
; TINY-NEXT: subi r28, 243
; TINY-NEXT: sbci r29, 255
diff --git a/llvm/test/CodeGen/BPF/cttz-ctlz.ll b/llvm/test/CodeGen/BPF/cttz-ctlz.ll
index 865833590a399..0c0561c28ef7b 100644
--- a/llvm/test/CodeGen/BPF/cttz-ctlz.ll
+++ b/llvm/test/CodeGen/BPF/cttz-ctlz.ll
@@ -31,7 +31,7 @@ define i32 @cttz_i32(i32 %a) {
; CHECK-NEXT: r2 = r1
; CHECK-NEXT: r2 <<= 32
; CHECK-NEXT: r2 >>= 32
-; CHECK-NEXT: if r2 == 0 goto LBB1_2
+; CHECK-NEXT: if r2 == 0 goto .LBB1_2
; CHECK-NEXT: # %bb.1: # %cond.false
; CHECK-NEXT: r2 = r1
; CHECK-NEXT: r2 = -r2
@@ -43,7 +43,7 @@ define i32 @cttz_i32(i32 %a) {
; CHECK-NEXT: r2 = {{\.?LCPI[0-9]+_[0-9]+}} ll
; CHECK-NEXT: r2 += r1
; CHECK-NEXT: r0 = *(u8 *)(r2 + 0)
-; CHECK-NEXT: LBB1_2: # %cond.end
+; CHECK-NEXT: .LBB1_2: # %cond.end
; CHECK-NEXT: exit
%ret = call i32 @llvm.cttz.i32(i32 %a, i1 0)
ret i32 %ret
@@ -73,7 +73,7 @@ define i64 @cttz_i64(i64 %a) {
; CHECK-LABEL: cttz_i64:
; CHECK: # %bb.0:
; CHECK-NEXT: r0 = 64
-; CHECK-NEXT: if r1 == 0 goto LBB3_2
+; CHECK-NEXT: if r1 == 0 goto .LBB3_2
; CHECK-NEXT: # %bb.1: # %cond.false
; CHECK-NEXT: r2 = r1
; CHECK-NEXT: r2 = -r2
@@ -84,7 +84,7 @@ define i64 @cttz_i64(i64 %a) {
; CHECK-NEXT: r2 = {{\.?LCPI[0-9]+_[0-9]+}} ll
; CHECK-NEXT: r2 += r1
; CHECK-NEXT: r0 = *(u8 *)(r2 + 0)
-; CHECK-NEXT: LBB3_2: # %cond.end
+; CHECK-NEXT: .LBB3_2: # %cond.end
; CHECK-NEXT: exit
%ret = call i64 @llvm.cttz.i64(i64 %a, i1 0)
ret i64 %ret
@@ -152,7 +152,7 @@ define i32 @ctlz_i32(i32 %a) {
; CHECK-NEXT: r2 = r1
; CHECK-NEXT: r2 <<= 32
; CHECK-NEXT: r2 >>= 32
-; CHECK-NEXT: if r2 == 0 goto LBB5_2
+; CHECK-NEXT: if r2 == 0 goto .LBB5_2
; CHECK-NEXT: # %bb.1: # %cond.false
; CHECK-NEXT: r2 = 4294967294 ll
; CHECK-NEXT: r3 = r1
@@ -197,7 +197,7 @@ define i32 @ctlz_i32(i32 %a) {
; CHECK-NEXT: r1 = 4278190080 ll
; CHECK-NEXT: r0 &= r1
; CHECK-NEXT: r0 >>= 24
-; CHECK-NEXT: LBB5_2: # %cond.end
+; CHECK-NEXT: .LBB5_2: # %cond.end
; CHECK-NEXT: exit
%ret = call i32 @llvm.ctlz.i32(i32 %a, i1 0)
ret i32 %ret
@@ -256,7 +256,7 @@ define i64 @ctlz_i64(i64 %a) {
; CHECK-LABEL: ctlz_i64:
; CHECK: # %bb.0:
; CHECK-NEXT: r0 = 64
-; CHECK-NEXT: if r1 == 0 goto LBB7_2
+; CHECK-NEXT: if r1 == 0 goto .LBB7_2
; CHECK-NEXT: # %bb.1: # %cond.false
; CHECK-NEXT: r2 = r1
; CHECK-NEXT: r2 >>= 1
@@ -296,7 +296,7 @@ define i64 @ctlz_i64(i64 %a) {
; CHECK-NEXT: r1 = 72340172838076673 ll
; CHECK-NEXT: r0 *= r1
; CHECK-NEXT: r0 >>= 56
-; CHECK-NEXT: LBB7_2: # %cond.end
+; CHECK-NEXT: .LBB7_2: # %cond.end
; CHECK-NEXT: exit
%ret = call i64 @llvm.ctlz.i64(i64 %a, i1 0)
ret i64 %ret
diff --git a/llvm/test/CodeGen/BPF/gotol.ll b/llvm/test/CodeGen/BPF/gotol.ll
index 4df6192d0c8ca..eceb6671207a4 100644
--- a/llvm/test/CodeGen/BPF/gotol.ll
+++ b/llvm/test/CodeGen/BPF/gotol.ll
@@ -38,8 +38,8 @@ entry:
; case (3): conditional jmp followed by an unconditional jmp
; CHECK: w0 = w2
; CHECK-NEXT: if w1 < w2 goto
-; CHECK: gotol LBB0_4 # encoding: [0x06'A',A,A,A,0x00,0x00,0x00,0x00]
-; CHECK-NEXT: # fixup A - offset: 0, value: LBB0_4, kind: FK_BPF_PCRel_4
+; CHECK: gotol .LBB0_4 # encoding: [0x06'A',A,A,A,0x00,0x00,0x00,0x00]
+; CHECK-NEXT: # fixup A - offset: 0, value: .LBB0_4, kind: FK_BPF_PCRel_4
begin: ; preds = %next2, %next
%s.0 = phi i32 [ %mul3, %next ], [ %mul7, %next2 ]
@@ -49,12 +49,12 @@ begin: ; preds = %next2, %next
; case (2): conditional jmp
; CHECK: w0 *= w1
-; CHECK-NEXT: if w0 > w2 goto LBB0_7
-; CHECK: goto LBB0_4
-; CHECK-LABEL: LBB0_7:
+; CHECK-NEXT: if w0 > w2 goto .LBB0_7
+; CHECK: goto .LBB0_4
+; CHECK-LABEL: .LBB0_7:
; CHECK: gotol
-; CHECK-LABEL: LBB0_4:
+; CHECK-LABEL: .LBB0_4:
if.then2: ; preds = %begin
%mul = mul i32 %div, %div
diff --git a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll
index b7d518639d70e..fabbad13b28c3 100644
--- a/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll
+++ b/llvm/test/CodeGen/BPF/jump_table_blockaddr.ll
@@ -62,21 +62,21 @@ llc -march=bpf -mcpu=v4 < test.ll \
; CHECK: r2 = *(u64 *)(r2 + 0)
; CHECK: r3 = BPF.JT.0.1 ll
; CHECK: r3 = *(u64 *)(r3 + 0)
-; CHECK: if w1 == 0 goto LBB0_2
+; CHECK: if w1 == 0 goto .LBB0_2
; CHECK: # %bb.1: # %entry
; CHECK: r3 = r2
-; CHECK: LBB0_2: # %entry
+; CHECK: .LBB0_2: # %entry
; CHECK: *(u64 *)(r10 - 8) = r3
; CHECK: r1 = *(u64 *)(r10 - 8)
; CHECK: gotox r1
; CHECK: .Ltmp0: # Block address taken
-; CHECK: LBB0_3: # %l1
+; CHECK: .LBB0_3: # %l1
; CHECK: w0 = 3
-; CHECK: goto LBB0_5
+; CHECK: goto .LBB0_5
; CHECK: .Ltmp1: # Block address taken
-; CHECK: LBB0_4: # %l2
+; CHECK: .LBB0_4: # %l2
; CHECK: w0 = 2
-; CHECK: LBB0_5: # %.split
+; CHECK: .LBB0_5: # %.split
; CHECK: exit
; CHECK: .Lfunc_end0:
; CHECK: .size bar, .Lfunc_end0-bar
@@ -84,8 +84,8 @@ llc -march=bpf -mcpu=v4 < test.ll \
; CHECK: .cfi_endproc
; CHECK: .section .jumptables,"",@progbits
; CHECK: BPF.JT.0.0:
-; CHECK: .quad LBB0_3-.text
+; CHECK: .quad .LBB0_3-.text
; CHECK: .size BPF.JT.0.0, 8
; CHECK: BPF.JT.0.1:
-; CHECK: .quad LBB0_4-.text
+; CHECK: .quad .LBB0_4-.text
; CHECK: .size BPF.JT.0.1, 8
diff --git a/llvm/test/CodeGen/BPF/jump_table_global_var.ll b/llvm/test/CodeGen/BPF/jump_table_global_var.ll
index 71c682f5530ed..2f60b343bef51 100644
--- a/llvm/test/CodeGen/BPF/jump_table_global_var.ll
+++ b/llvm/test/CodeGen/BPF/jump_table_global_var.ll
@@ -64,13 +64,13 @@ llc -march=bpf -mcpu=v4 < test.ll \
; CHECK: r1 = *(u64 *)(r2 + 0)
; CHECK: gotox r1
; CHECK: .Ltmp0: # Block address taken
-; CHECK: LBB0_1: # %l1
+; CHECK: .LBB0_1: # %l1
; CHECK: w0 = 4
-; CHECK: goto LBB0_3
+; CHECK: goto .LBB0_3
; CHECK: .Ltmp1: # Block address taken
-; CHECK: LBB0_2: # %l2
+; CHECK: .LBB0_2: # %l2
; CHECK: w0 = 3
-; CHECK: LBB0_3: # %.split
+; CHECK: .LBB0_3: # %.split
; CHECK: exit
; CHECK: .Lfunc_end0:
; CHECK: .size foo, .Lfunc_end0-foo
@@ -78,6 +78,6 @@ llc -march=bpf -mcpu=v4 < test.ll \
; CHECK: .cfi_endproc
; CHECK: .section .jumptables,"",@progbits
; CHECK: BPF.JT.0.0:
-; CHECK: .quad LBB0_1-.text
-; CHECK: .quad LBB0_2-.text
+; CHECK: .quad .LBB0_1-.text
+; CHECK: .quad .LBB0_2-.text
; CHECK: .size BPF.JT.0.0, 16
diff --git a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll
index eb1e5bff11013..9f8cdb492aba0 100644
--- a/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll
+++ b/llvm/test/CodeGen/BPF/jump_table_switch_stmt.ll
@@ -66,7 +66,7 @@ llc -march=bpf -mcpu=v4 -bpf-min-jump-table-entries=3 < test.ll \
; CHECK: # %bb.0: # %entry
; CHECK: # kill: def $w1 killed $w1 def $r1
; CHECK: w1 += -1
-; CHECK: if w1 > 29 goto LBB0_5
+; CHECK: if w1 > 29 goto .LBB0_5
; CHECK: # %bb.1: # %entry
; CHECK: w2 = 18
; CHECK: r1 <<= 3
@@ -76,15 +76,15 @@ llc -march=bpf -mcpu=v4 -bpf-min-jump-table-entries=3 < test.ll \
; CHECK: r1 = *(u64 *)(r4 + 0)
; CHECK: r3 += r1
; CHECK: gotox r3
-; CHECK: LBB0_2: # %sw.bb1
+; CHECK: .LBB0_2: # %sw.bb1
; CHECK: w2 = 6
-; CHECK: goto LBB0_4
-; CHECK: LBB0_3: # %sw.bb2
+; CHECK: goto .LBB0_4
+; CHECK: .LBB0_3: # %sw.bb2
; CHECK: w2 = 2
-; CHECK: LBB0_4: # %sw.epilog.sink.split
+; CHECK: .LBB0_4: # %sw.epilog.sink.split
; CHECK: r1 = ret_user ll
; CHECK: *(u32 *)(r1 + 0) = w2
-; CHECK: LBB0_5: # %sw.epilog
+; CHECK: .LBB0_5: # %sw.epilog
; CHECK: w0 = 0
; CHECK: exit
; CHECK: .Lfunc_end0:
@@ -93,34 +93,34 @@ llc -march=bpf -mcpu=v4 -bpf-min-jump-table-entries=3 < test.ll \
; CHECK: .cfi_endproc
; CHECK: .section .jumptables,"",@progbits
; CHECK: BPF.JT.0.0:
-; CHECK: .quad LBB0_4-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_2-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_5-.text
-; CHECK: .quad LBB0_3-.text
+; CHECK: .quad .LBB0_4-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_2-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_5-.text
+; CHECK: .quad .LBB0_3-.text
; CHECK: .size BPF.JT.0.0, 240
diff --git a/llvm/test/CodeGen/BPF/remove_truncate_9.ll b/llvm/test/CodeGen/BPF/remove_truncate_9.ll
index 5ea55ef81d650..2662aa66c7d4a 100644
--- a/llvm/test/CodeGen/BPF/remove_truncate_9.ll
+++ b/llvm/test/CodeGen/BPF/remove_truncate_9.ll
@@ -15,10 +15,10 @@ define void @shl_lshr_same_bb(ptr %p) {
; CHECK-V2: # %bb.0: # %entry
; CHECK-V2-NEXT: r1 = *(u8 *)(r1 + 0)
; CHECK-V2-NEXT: r5 = 1
-; CHECK-V2-NEXT: if r1 == 0 goto LBB0_2
+; CHECK-V2-NEXT: if r1 == 0 goto .LBB0_2
; CHECK-V2-NEXT: # %bb.1: # %entry
; CHECK-V2-NEXT: r5 = 0
-; CHECK-V2-NEXT: LBB0_2: # %entry
+; CHECK-V2-NEXT: .LBB0_2: # %entry
; CHECK-V2-NEXT: r3 = r1
; CHECK-V2-NEXT: r3 <<= 56
; CHECK-V2-NEXT: r2 = r1
@@ -30,10 +30,10 @@ define void @shl_lshr_same_bb(ptr %p) {
; CHECK-V4: # %bb.0: # %entry
; CHECK-V4-NEXT: w1 = *(u8 *)(r1 + 0)
; CHECK-V4-NEXT: w5 = 1
-; CHECK-V4-NEXT: if w1 == 0 goto LBB0_2
+; CHECK-V4-NEXT: if w1 == 0 goto .LBB0_2
; CHECK-V4-NEXT: # %bb.1: # %entry
; CHECK-V4-NEXT: w5 = 0
-; CHECK-V4-NEXT: LBB0_2: # %entry
+; CHECK-V4-NEXT: .LBB0_2: # %entry
; CHECK-V4-NEXT: r3 = r1
; CHECK-V4-NEXT: r3 <<= 56
; CHECK-V4-NEXT: r2 = r1
@@ -57,10 +57,10 @@ define void @shl_lshr_diff_bb(ptr %p) {
; CHECK-V2: # %bb.0: # %entry
; CHECK-V2-NEXT: r1 = *(u16 *)(r1 + 0)
; CHECK-V2-NEXT: r5 = 1
-; CHECK-V2-NEXT: if r1 == 0 goto LBB1_2
+; CHECK-V2-NEXT: if r1 == 0 goto .LBB1_2
; CHECK-V2-NEXT: # %bb.1: # %entry
; CHECK-V2-NEXT: r5 = 0
-; CHECK-V2-NEXT: LBB1_2: # %entry
+; CHECK-V2-NEXT: .LBB1_2: # %entry
; CHECK-V2-NEXT: r3 = r1
; CHECK-V2-NEXT: r3 <<= 48
; CHECK-V2-NEXT: r2 = r1
@@ -72,10 +72,10 @@ define void @shl_lshr_diff_bb(ptr %p) {
; CHECK-V4: # %bb.0: # %entry
; CHECK-V4-NEXT: w1 = *(u16 *)(r1 + 0)
; CHECK-V4-NEXT: w5 = 1
-; CHECK-V4-NEXT: if w1 == 0 goto LBB1_2
+; CHECK-V4-NEXT: if w1 == 0 goto .LBB1_2
; CHECK-V4-NEXT: # %bb.1: # %entry
; CHECK-V4-NEXT: w5 = 0
-; CHECK-V4-NEXT: LBB1_2: # %entry
+; CHECK-V4-NEXT: .LBB1_2: # %entry
; CHECK-V4-NEXT: r3 = r1
; CHECK-V4-NEXT: r3 <<= 48
; CHECK-V4-NEXT: r2 = r1
@@ -105,10 +105,10 @@ define void @load_zext_same_bb(ptr %p) {
; CHECK-V2: # %bb.0: # %entry
; CHECK-V2-NEXT: r1 = *(u8 *)(r1 + 0)
; CHECK-V2-NEXT: r2 = 1
-; CHECK-V2-NEXT: if r1 == 0 goto LBB2_2
+; CHECK-V2-NEXT: if r1 == 0 goto .LBB2_2
; CHECK-V2-NEXT: # %bb.1: # %entry
; CHECK-V2-NEXT: r2 = 0
-; CHECK-V2-NEXT: LBB2_2: # %entry
+; CHECK-V2-NEXT: .LBB2_2: # %entry
; CHECK-V2-NEXT: call sink3
; CHECK-V2-NEXT: exit
;
@@ -116,10 +116,10 @@ define void @load_zext_same_bb(ptr %p) {
; CHECK-V4: # %bb.0: # %entry
; CHECK-V4-NEXT: w1 = *(u8 *)(r1 + 0)
; CHECK-V4-NEXT: w2 = 1
-; CHECK-V4-NEXT: if w1 == 0 goto LBB2_2
+; CHECK-V4-NEXT: if w1 == 0 goto .LBB2_2
; CHECK-V4-NEXT: # %bb.1: # %entry
; CHECK-V4-NEXT: w2 = 0
-; CHECK-V4-NEXT: LBB2_2: # %entry
+; CHECK-V4-NEXT: .LBB2_2: # %entry
; CHECK-V4-NEXT: call sink3
; CHECK-V4-NEXT: exit
entry:
@@ -135,10 +135,10 @@ define void @load_zext_diff_bb(ptr %p) {
; CHECK-V2: # %bb.0: # %entry
; CHECK-V2-NEXT: r1 = *(u8 *)(r1 + 0)
; CHECK-V2-NEXT: r2 = 1
-; CHECK-V2-NEXT: if r1 == 0 goto LBB3_2
+; CHECK-V2-NEXT: if r1 == 0 goto .LBB3_2
; CHECK-V2-NEXT: # %bb.1: # %next
; CHECK-V2-NEXT: r2 = 0
-; CHECK-V2-NEXT: LBB3_2: # %next
+; CHECK-V2-NEXT: .LBB3_2: # %next
; CHECK-V2-NEXT: call sink3
; CHECK-V2-NEXT: exit
;
@@ -146,10 +146,10 @@ define void @load_zext_diff_bb(ptr %p) {
; CHECK-V4: # %bb.0: # %entry
; CHECK-V4-NEXT: w1 = *(u8 *)(r1 + 0)
; CHECK-V4-NEXT: w2 = 1
-; CHECK-V4-NEXT: if w1 == 0 goto LBB3_2
+; CHECK-V4-NEXT: if w1 == 0 goto .LBB3_2
; CHECK-V4-NEXT: # %bb.1: # %next
; CHECK-V4-NEXT: w2 = 0
-; CHECK-V4-NEXT: LBB3_2: # %next
+; CHECK-V4-NEXT: .LBB3_2: # %next
; CHECK-V4-NEXT: call sink3
; CHECK-V4-NEXT: exit
entry:
@@ -167,10 +167,10 @@ define void @load_zext_diff_bb_2(ptr %p) {
; CHECK-V2: # %bb.0: # %entry
; CHECK-V2-NEXT: r1 = *(u32 *)(r1 + 0)
; CHECK-V2-NEXT: r2 = 1
-; CHECK-V2-NEXT: if r1 == 0 goto LBB4_2
+; CHECK-V2-NEXT: if r1 == 0 goto .LBB4_2
; CHECK-V2-NEXT: # %bb.1: # %next
; CHECK-V2-NEXT: r2 = 0
-; CHECK-V2-NEXT: LBB4_2: # %next
+; CHECK-V2-NEXT: .LBB4_2: # %next
; CHECK-V2-NEXT: call sink4
; CHECK-V2-NEXT: exit
;
@@ -178,10 +178,10 @@ define void @load_zext_diff_bb_2(ptr %p) {
; CHECK-V4: # %bb.0: # %entry
; CHECK-V4-NEXT: w1 = *(u32 *)(r1 + 0)
; CHECK-V4-NEXT: w2 = 1
-; CHECK-V4-NEXT: if w1 == 0 goto LBB4_2
+; CHECK-V4-NEXT: if w1 == 0 goto .LBB4_2
; CHECK-V4-NEXT: # %bb.1: # %next
; CHECK-V4-NEXT: w2 = 0
-; CHECK-V4-NEXT: LBB4_2: # %next
+; CHECK-V4-NEXT: .LBB4_2: # %next
; CHECK-V4-NEXT: call sink4
; CHECK-V4-NEXT: exit
entry:
diff --git a/llvm/test/CodeGen/BPF/sanity.ll b/llvm/test/CodeGen/BPF/sanity.ll
index 0a97a812bbb3f..6d12062705e04 100644
--- a/llvm/test/CodeGen/BPF/sanity.ll
+++ b/llvm/test/CodeGen/BPF/sanity.ll
@@ -76,10 +76,10 @@ define signext i8 @foo_cmp(i8 signext %a, i8 signext %b) #0 {
; CHECK-LABEL: foo_cmp:
; CHECK: # %bb.0:
; CHECK-NEXT: r0 = r1
-; CHECK-NEXT: if r2 s> r0 goto LBB5_2
+; CHECK-NEXT: if r2 s> r0 goto .LBB5_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: r0 = r2
-; CHECK-NEXT: LBB5_2:
+; CHECK-NEXT: .LBB5_2:
; CHECK-NEXT: exit
%1 = icmp slt i8 %a, %b
%a.b = select i1 %1, i8 %a, i8 %b
@@ -91,18 +91,18 @@ define i32 @foo_muldiv(i8 signext %a, i16 signext %b, i32 %c, i64 %d) #0 {
; CHECK-LABEL: foo_muldiv:
; CHECK: # %bb.0:
; CHECK-NEXT: r0 = r2
-; CHECK-NEXT: if r1 == 0 goto LBB6_2
+; CHECK-NEXT: if r1 == 0 goto .LBB6_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: r0 *= r3
-; CHECK-NEXT: goto LBB6_3
-; CHECK-NEXT: LBB6_2:
+; CHECK-NEXT: goto .LBB6_3
+; CHECK-NEXT: .LBB6_2:
; CHECK-NEXT: r3 <<= 32
; CHECK-NEXT: r3 >>= 32
; CHECK-NEXT: r4 <<= 32
; CHECK-NEXT: r4 >>= 32
; CHECK-NEXT: r4 /= r3
; CHECK-NEXT: r0 = r4
-; CHECK-NEXT: LBB6_3:
+; CHECK-NEXT: .LBB6_3:
; CHECK-NEXT: exit
%1 = icmp eq i8 %a, 0
br i1 %1, label %5, label %2
diff --git a/llvm/test/CodeGen/DirectX/Binding/binding-overlap-6.ll b/llvm/test/CodeGen/DirectX/Binding/binding-overlap-6.ll
index a58e85b4159f1..dcca4feb31dae 100644
--- a/llvm/test/CodeGen/DirectX/Binding/binding-overlap-6.ll
+++ b/llvm/test/CodeGen/DirectX/Binding/binding-overlap-6.ll
@@ -18,7 +18,7 @@ target triple = "dxil-pc-shadermodel6.3-library"
define void @test_overlapping() {
entry:
%h1 = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 0, i32 3, i32 0, ptr @A.str)
- %h2 = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 4, i32 -1, i32 0, ptr @B.str)
+ %h2 = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 4, i32 0, i32 0, ptr @B.str)
%h3 = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.resource.handlefrombinding(i32 0, i32 17, i32 1, i32 0, ptr @C.str)
ret void
}
diff --git a/llvm/test/CodeGen/DirectX/Binding/binding-overlap-7.ll b/llvm/test/CodeGen/DirectX/Binding/binding-overlap-7.ll
index 9c52d6ed3486a..151c770489826 100644
--- a/llvm/test/CodeGen/DirectX/Binding/binding-overlap-7.ll
+++ b/llvm/test/CodeGen/DirectX/Binding/binding-overlap-7.ll
@@ -29,7 +29,7 @@ entry:
; Buffer<double> C[] : register(t2, space4);
%h2 = call target("dx.TypedBuffer", double, 0, 0, 0)
- @llvm.dx.resource.handlefrombinding(i32 4, i32 2, i32 -1, i32 10, ptr @C.str)
+ @llvm.dx.resource.handlefrombinding(i32 4, i32 2, i32 0, i32 10, ptr @C.str)
ret void
}
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll
index d792078b8cbb7..ea3a02245c1cc 100644
--- a/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll
+++ b/llvm/test/CodeGen/DirectX/ContainerData/PSVResources.ll
@@ -105,7 +105,7 @@ define void @main() #0 {
; CHECK: UsedByAtomic64: false
; RWBuffer<float4> Buf = BufferArray[100];
%uav3 = call target("dx.TypedBuffer", <4 x float>, 1, 0, 0)
- @llvm.dx.resource.handlefrombinding(i32 5, i32 10, i32 -1, i32 100, ptr null)
+ @llvm.dx.resource.handlefrombinding(i32 5, i32 10, i32 0, i32 100, ptr null)
ret void
}
diff --git a/llvm/test/CodeGen/DirectX/CreateHandle.ll b/llvm/test/CodeGen/DirectX/CreateHandle.ll
index 6cca501bb2568..d92f4d369ad94 100644
--- a/llvm/test/CodeGen/DirectX/CreateHandle.ll
+++ b/llvm/test/CodeGen/DirectX/CreateHandle.ll
@@ -61,7 +61,7 @@ define void @test_buffers() {
%typed3_ix = call i32 @some_val()
%typed3 = call target("dx.TypedBuffer", <4 x float>, 0, 0, 0)
@llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v4f32_0_0_0t(
- i32 0, i32 7, i32 -1, i32 %typed3_ix, ptr null)
+ i32 0, i32 7, i32 0, i32 %typed3_ix, ptr null)
; CHECK: %[[IX:.*]] = add i32 %typed3_ix, 7
; CHECK: call %dx.types.Handle @dx.op.createHandle(i32 57, i8 0, i32 0, i32 %[[IX]], i1 false) #[[#ATTR]]
diff --git a/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll
index 78f6a9a8049f3..9a6fea7a85b92 100644
--- a/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll
+++ b/llvm/test/CodeGen/DirectX/CreateHandleFromBinding.ll
@@ -66,7 +66,7 @@ define void @test_bindings() {
%typed3_ix = call i32 @some_val()
%typed3 = call target("dx.TypedBuffer", <4 x float>, 0, 0, 0)
@llvm.dx.resource.handlefrombinding.tdx.TypedBuffer_v4f32_0_0_0t(
- i32 0, i32 7, i32 -1, i32 %typed3_ix, ptr null)
+ i32 0, i32 7, i32 0, i32 %typed3_ix, ptr null)
; CHECK: %[[IX:.*]] = add i32 %typed3_ix, 7
; CHECK: [[BUF5:%.*]] = call %dx.types.Handle @dx.op.createHandleFromBinding(i32 217, %dx.types.ResBind { i32 7, i32 -1, i32 0, i8 0 }, i32 %[[IX]], i1 false) #[[#ATTR]]
; CHECK: call %dx.types.Handle @dx.op.annotateHandle(i32 216, %dx.types.Handle [[BUF5]], %dx.types.ResourceProperties { i32 10, i32 1033 }) #[[#ATTR]]
diff --git a/llvm/test/CodeGen/DirectX/Metadata/srv_metadata.ll b/llvm/test/CodeGen/DirectX/Metadata/srv_metadata.ll
index 0062f90326490..ea8f418d4dada 100644
--- a/llvm/test/CodeGen/DirectX/Metadata/srv_metadata.ll
+++ b/llvm/test/CodeGen/DirectX/Metadata/srv_metadata.ll
@@ -80,9 +80,9 @@ define void @test() #0 {
; Buffer<double> C1 = Array[10];
; Buffer<double> C2 = Array[20];
%Array2_10_h = call target("dx.TypedBuffer", double, 0, 0, 0)
- @llvm.dx.resource.handlefrombinding(i32 4, i32 2, i32 -1, i32 10, ptr @Array2.str)
+ @llvm.dx.resource.handlefrombinding(i32 4, i32 2, i32 0, i32 10, ptr @Array2.str)
%Array2_20_h = call target("dx.TypedBuffer", double, 0, 0, 0)
- @llvm.dx.resource.handlefrombinding(i32 4, i32 2, i32 -1, i32 20, ptr @Array2.str)
+ @llvm.dx.resource.handlefrombinding(i32 4, i32 2, i32 0, i32 20, ptr @Array2.str)
ret void
}
diff --git a/llvm/test/CodeGen/DirectX/Metadata/uav_metadata.ll b/llvm/test/CodeGen/DirectX/Metadata/uav_metadata.ll
index d377a528abca1..0f7d56fb1261e 100644
--- a/llvm/test/CodeGen/DirectX/Metadata/uav_metadata.ll
+++ b/llvm/test/CodeGen/DirectX/Metadata/uav_metadata.ll
@@ -92,9 +92,9 @@ define void @test() #0 {
; RWBuffer<double> C1 = Array[10];
; RWBuffer<double> C2 = Array[20];
%Array2_10_h = call target("dx.TypedBuffer", double, 1, 0, 0)
- @llvm.dx.resource.handlefrombinding(i32 4, i32 2, i32 -1, i32 10, ptr @Array2.str)
+ @llvm.dx.resource.handlefrombinding(i32 4, i32 2, i32 0, i32 10, ptr @Array2.str)
%Array2_20_h = call target("dx.TypedBuffer", double, 1, 0, 0)
- @llvm.dx.resource.handlefrombinding(i32 4, i32 2, i32 -1, i32 20, ptr @Array2.str)
+ @llvm.dx.resource.handlefrombinding(i32 4, i32 2, i32 0, i32 20, ptr @Array2.str)
; Same buffer type as Nine - should have the same type in metadata
; RWBuffer<double> Ten : register(u2);
diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
index 262ee06c6f732..969826be6fff7 100644
--- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
@@ -70,6 +70,9 @@
; LAXX-NEXT: Expand reduction intrinsics
; LAXX-NEXT: Natural Loop Information
; LAXX-NEXT: Type Promotion
+; LAXX-NEXT: Post-Dominator Tree Construction
+; LAXX-NEXT: Branch Probability Analysis
+; LAXX-NEXT: Block Frequency Analysis
; LAXX-NEXT: CodeGen Prepare
; LAXX-NEXT: Dominator Tree Construction
; LAXX-NEXT: Exception handling preparation
diff --git a/llvm/test/CodeGen/MIR/Generic/inline-asm-extra-info.mir b/llvm/test/CodeGen/MIR/Generic/inline-asm-extra-info.mir
new file mode 100644
index 0000000000000..3857198769f68
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/Generic/inline-asm-extra-info.mir
@@ -0,0 +1,82 @@
+# RUN: llc -run-pass=none -print-symbolic-inline-asm-ops -o - %s | FileCheck --match-full-lines %s
+
+---
+name: test_attdialect
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_attdialect
+ ; CHECK: INLINEASM &"", attdialect
+ INLINEASM &"", attdialect
+...
+
+---
+name: test_sideeffect
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_sideeffect
+ ; CHECK: INLINEASM &"", sideeffect attdialect
+ INLINEASM &"", sideeffect
+...
+
+---
+name: test_alignstack
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_alignstack
+ ; CHECK: INLINEASM &"", alignstack attdialect
+ INLINEASM &"", alignstack
+...
+
+---
+name: test_inteldialect
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_inteldialect
+ ; CHECK: INLINEASM &"", inteldialect
+ INLINEASM &"", inteldialect
+...
+
+---
+name: test_mayload
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_mayload
+ ; CHECK: INLINEASM &"", mayload attdialect
+ INLINEASM &"", mayload
+...
+
+---
+name: test_maystore
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_maystore
+ ; CHECK: INLINEASM &"", maystore attdialect
+ INLINEASM &"", maystore
+...
+
+---
+name: test_isconvergent
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_isconvergent
+ ; CHECK: INLINEASM &"", isconvergent attdialect
+ INLINEASM &"", isconvergent
+...
+
+---
+name: test_unwind
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_unwind
+ ; CHECK: INLINEASM &"", unwind attdialect
+ INLINEASM &"", unwind
+...
+
+---
+name: test_combined
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_combined
+ ; CHECK: INLINEASM &"", sideeffect mayload inteldialect
+ INLINEASM &"", sideeffect mayload inteldialect
+...
diff --git a/llvm/test/CodeGen/MIR/Generic/inline-asm-unknown-kind.mir b/llvm/test/CodeGen/MIR/Generic/inline-asm-unknown-kind.mir
new file mode 100644
index 0000000000000..7603dd23c44ad
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/Generic/inline-asm-unknown-kind.mir
@@ -0,0 +1,11 @@
+# RUN: not llc -run-pass=none -o - %s 2>&1 | FileCheck %s
+
+# Test error handling for unknown inline asm operand kind.
+
+---
+name: test_unknown_operand_kind
+body: |
+ bb.0:
+ ; CHECK: [[@LINE+1]]:32: unknown inline asm operand kind 'badkind'
+ INLINEASM &"", attdialect, badkind
+...
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-i8.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-i8.ll
index 327f7f7308057..d8c0860426b16 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-mma-i8.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-i8.ll
@@ -26,7 +26,7 @@ define void @tcgen05_mma_i8_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %aten
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1;
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_i8_cta1_param_1];
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1;
-; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1;
+; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.ashift.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::i8.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1;
@@ -66,7 +66,7 @@ define void @tcgen05_mma_sp_i8_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %a
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_i8_cta1_param_1];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1;
-; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1;
+; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.ashift.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::i8.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1;
@@ -105,7 +105,7 @@ define void @tcgen05_mma_i8_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %aten
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1;
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_i8_cta2_param_1];
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1;
-; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1;
+; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.ashift.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::i8.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1;
@@ -145,7 +145,7 @@ define void @tcgen05_mma_sp_i8_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %a
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_i8_cta2_param_1];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1;
-; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1;
+; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.ashift.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::i8.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1;
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma-scale-d.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma-scale-d.ll
index c22b718a19a0d..cc30a26c75c50 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-mma-scale-d.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma-scale-d.ll
@@ -23,8 +23,8 @@ define void @tcgen05_mma_fp16_cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %ate
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1, 0;
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_fp16_cg1_param_1];
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1, 0;
-; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1, 0;
-; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse.ashift [%r1], [%r3], %rd2, %r2, %p1, 0;
+; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.ashift.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1, 0;
+; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.ashift.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1, 0;
@@ -73,8 +73,8 @@ define void @tcgen05_mma_fp16_cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %ate
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1, 0;
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_fp16_cg2_param_1];
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1, 0;
-; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1, 0;
-; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse.ashift [%r1], [%r3], %rd2, %r2, %p1, 0;
+; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.ashift.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1, 0;
+; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.ashift.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1, 0;
@@ -125,8 +125,8 @@ define void @tcgen05_mma_sp_fp16_cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0;
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_fp16_cg1_param_1];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
-; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
-; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
+; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.ashift.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
+; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.ashift.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0;
@@ -176,8 +176,8 @@ define void @tcgen05_mma_sp_fp16_cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0;
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_fp16_cg2_param_1];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
-; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
-; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
+; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.ashift.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
+; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.ashift.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0;
@@ -227,8 +227,8 @@ define void @tcgen05_mma_tf32_cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %ate
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1, 0;
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_tf32_cg1_param_1];
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1, 0;
-; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1, 0;
-; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse.ashift [%r1], [%r3], %rd2, %r2, %p1, 0;
+; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.ashift.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1, 0;
+; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.ashift.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1, 0;
@@ -277,8 +277,8 @@ define void @tcgen05_mma_tf32_cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %ate
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1, 0;
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_tf32_cg2_param_1];
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1, 0;
-; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1, 0;
-; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse.ashift [%r1], [%r3], %rd2, %r2, %p1, 0;
+; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.ashift.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1, 0;
+; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.ashift.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1, 0;
@@ -329,8 +329,8 @@ define void @tcgen05_mma_sp_tf32_cg1(ptr addrspace(6) %dtmem, ptr addrspace(6) %
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0;
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_tf32_cg1_param_1];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
-; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
-; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
+; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.ashift.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
+; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.ashift.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0;
@@ -380,8 +380,8 @@ define void @tcgen05_mma_sp_tf32_cg2(ptr addrspace(6) %dtmem, ptr addrspace(6) %
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0;
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_tf32_cg2_param_1];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
-; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
-; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
+; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.ashift.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
+; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.ashift.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1, 0;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1, 0;
diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-mma.ll b/llvm/test/CodeGen/NVPTX/tcgen05-mma.ll
index fcde161bf1ab1..25bda658ce26b 100644
--- a/llvm/test/CodeGen/NVPTX/tcgen05-mma.ll
+++ b/llvm/test/CodeGen/NVPTX/tcgen05-mma.ll
@@ -29,7 +29,7 @@ define void @tcgen05_mma_fp16_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %at
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1;
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_fp16_cta1_param_1];
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1;
-; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1;
+; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.ashift.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f16.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1;
@@ -69,7 +69,7 @@ define void @tcgen05_mma_sp_fp16_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6)
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_fp16_cta1_param_1];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1;
-; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1;
+; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.ashift.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f16.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1;
@@ -108,7 +108,7 @@ define void @tcgen05_mma_tf32_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %at
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1;
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_tf32_cta1_param_1];
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1;
-; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1;
+; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.ashift.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1;
@@ -148,7 +148,7 @@ define void @tcgen05_mma_sp_tf32_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6)
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_tf32_cta1_param_1];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1;
-; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1;
+; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.ashift.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1;
@@ -187,7 +187,7 @@ define void @tcgen05_mma_f8f6f4_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6) %
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1;
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_f8f6f4_cta1_param_1];
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1;
-; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1;
+; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.ashift.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1;
@@ -227,7 +227,7 @@ define void @tcgen05_mma_sp_f8f6fr_cta1(ptr addrspace(6) %dtmem, ptr addrspace(6
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_f8f6fr_cta1_param_1];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1;
-; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1;
+; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.ashift.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::1.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1;
@@ -266,7 +266,7 @@ define void @tcgen05_mma_fp16_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %at
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1;
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_fp16_cta2_param_1];
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1;
-; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1;
+; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.ashift.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f16.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1;
@@ -306,7 +306,7 @@ define void @tcgen05_mma_sp_fp16_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6)
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_fp16_cta2_param_1];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1;
-; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1;
+; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.ashift.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f16.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1;
@@ -345,7 +345,7 @@ define void @tcgen05_mma_tf32_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %at
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1;
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_tf32_cta2_param_1];
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1;
-; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1;
+; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.ashift.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1;
@@ -385,7 +385,7 @@ define void @tcgen05_mma_sp_tf32_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6)
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_tf32_cta2_param_1];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1;
-; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1;
+; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.ashift.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::tf32.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1;
@@ -424,7 +424,7 @@ define void @tcgen05_mma_f8f6f4_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6) %
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, %r2, %p1;
; CHECK-NEXT: ld.param.b32 %r3, [tcgen05_mma_f8f6f4_cta2_param_1];
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1;
-; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::discard.ashift [%r1], [%r3], %rd2, %r2, %p1;
+; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.ashift.collector::a::discard [%r1], [%r3], %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], [%r3], %rd2, %r2, %p1;
; CHECK-NEXT: tcgen05.mma.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, %r2, %p1;
@@ -464,7 +464,7 @@ define void @tcgen05_mma_sp_f8f6fr_cta2(ptr addrspace(6) %dtmem, ptr addrspace(6
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], %rd1, %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: ld.param.b32 %r4, [tcgen05_mma_sp_f8f6fr_cta2_param_1];
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1;
-; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::discard.ashift [%r1], [%r4], %rd2, [%r3], %r2, %p1;
+; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.ashift.collector::a::discard [%r1], [%r4], %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], %rd1, %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::lastuse [%r1], [%r4], %rd2, [%r3], %r2, %p1;
; CHECK-NEXT: tcgen05.mma.sp.cta_group::2.kind::f8f6f4.collector::a::fill [%r1], %rd1, %rd2, [%r3], %r2, %p1;
diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
index f771b5728e5b5..9fad0145ccda4 100644
--- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
@@ -70,6 +70,9 @@
; CHECK-NEXT: Scalarize Masked Memory Intrinsics
; CHECK-NEXT: Expand reduction intrinsics
; CHECK-NEXT: Natural Loop Information
+; CHECK-NEXT: Post-Dominator Tree Construction
+; CHECK-NEXT: Branch Probability Analysis
+; CHECK-NEXT: Block Frequency Analysis
; CHECK-NEXT: CodeGen Prepare
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Exception handling preparation
diff --git a/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll b/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll
index ebdc4390bac28..8a829887678c4 100644
--- a/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll
+++ b/llvm/test/CodeGen/PowerPC/dmf-outer-product.ll
@@ -11,9 +11,8 @@ declare <1024 x i1> @llvm.ppc.mma.dmxvi8gerx4(<256 x i1>, <16 x i8>)
define void @test_dmxvi8gerx4(ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-LABEL: test_dmxvi8gerx4:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lxv v2, 16(r3)
+; CHECK-NEXT: lxvp vsp34, 0(r3)
; CHECK-NEXT: lxv vs0, 0(r4)
-; CHECK-NEXT: lxv v3, 0(r3)
; CHECK-NEXT: dmxvi8gerx4 dmr0, vsp34, vs0
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
; CHECK-NEXT: stxvp vsp34, 96(r5)
@@ -25,9 +24,8 @@ define void @test_dmxvi8gerx4(ptr %vpp, ptr %vcp, ptr %resp) {
;
; CHECK-BE-LABEL: test_dmxvi8gerx4:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: lxv v2, 0(r3)
+; CHECK-BE-NEXT: lxvp vsp34, 0(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r4)
-; CHECK-BE-NEXT: lxv v3, 16(r3)
; CHECK-BE-NEXT: dmxvi8gerx4 dmr0, vsp34, vs0
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
; CHECK-BE-NEXT: stxvp vsp36, 96(r5)
@@ -53,8 +51,7 @@ define void @test_dmxvi8gerx4pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -73,8 +70,7 @@ define void @test_dmxvi8gerx4pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -104,8 +100,7 @@ define void @test_dmxvi8gerx4spp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -124,8 +119,7 @@ define void @test_dmxvi8gerx4spp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -155,8 +149,7 @@ define void @test_pmdmxvi8gerx4pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -175,8 +168,7 @@ define void @test_pmdmxvi8gerx4pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -202,9 +194,8 @@ declare <1024 x i1> @llvm.ppc.mma.pmdmxvi8gerx4(<256 x i1>, <16 x i8>, i32, i32,
define void @test_pmdmxvi8gerx4(ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-LABEL: test_pmdmxvi8gerx4:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lxv v2, 16(r3)
+; CHECK-NEXT: lxvp vsp34, 0(r3)
; CHECK-NEXT: lxv vs0, 0(r4)
-; CHECK-NEXT: lxv v3, 0(r3)
; CHECK-NEXT: pmdmxvi8gerx4 dmr0, vsp34, vs0, 55, 5, 10
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
; CHECK-NEXT: stxvp vsp34, 96(r5)
@@ -216,9 +207,8 @@ define void @test_pmdmxvi8gerx4(ptr %vpp, ptr %vcp, ptr %resp) {
;
; CHECK-BE-LABEL: test_pmdmxvi8gerx4:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: lxv v2, 0(r3)
+; CHECK-BE-NEXT: lxvp vsp34, 0(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r4)
-; CHECK-BE-NEXT: lxv v3, 16(r3)
; CHECK-BE-NEXT: pmdmxvi8gerx4 dmr0, vsp34, vs0, 55, 5, 10
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
; CHECK-BE-NEXT: stxvp vsp36, 96(r5)
@@ -244,8 +234,7 @@ define dso_local void @test_pmdmxvi8gerx4spp(ptr %vop, ptr %vpp, ptr %vcp, ptr %
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -264,8 +253,7 @@ define dso_local void @test_pmdmxvi8gerx4spp(ptr %vop, ptr %vpp, ptr %vcp, ptr %
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -291,9 +279,8 @@ declare <1024 x i1> @llvm.ppc.mma.dmxvbf16gerx2(<256 x i1>, <16 x i8>)
define void @test_dmxvbf16gerx2(ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-LABEL: test_dmxvbf16gerx2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lxv v2, 16(r3)
+; CHECK-NEXT: lxvp vsp34, 0(r3)
; CHECK-NEXT: lxv vs0, 0(r4)
-; CHECK-NEXT: lxv v3, 0(r3)
; CHECK-NEXT: dmxvbf16gerx2 dmr0, vsp34, vs0
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
; CHECK-NEXT: stxvp vsp34, 96(r5)
@@ -305,9 +292,8 @@ define void @test_dmxvbf16gerx2(ptr %vpp, ptr %vcp, ptr %resp) {
;
; CHECK-BE-LABEL: test_dmxvbf16gerx2:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: lxv v2, 0(r3)
+; CHECK-BE-NEXT: lxvp vsp34, 0(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r4)
-; CHECK-BE-NEXT: lxv v3, 16(r3)
; CHECK-BE-NEXT: dmxvbf16gerx2 dmr0, vsp34, vs0
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
; CHECK-BE-NEXT: stxvp vsp36, 96(r5)
@@ -333,8 +319,7 @@ define void @test_dmxvbf16gerx2pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -353,8 +338,7 @@ define void @test_dmxvbf16gerx2pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -384,8 +368,7 @@ define void @test_dmxvbf16gerx2pn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -404,8 +387,7 @@ define void @test_dmxvbf16gerx2pn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -435,8 +417,7 @@ define void @test_dmxvbf16gerx2np(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -455,8 +436,7 @@ define void @test_dmxvbf16gerx2np(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -486,8 +466,7 @@ define void @test_dmxvbf16gerx2nn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -506,8 +485,7 @@ define void @test_dmxvbf16gerx2nn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -533,9 +511,8 @@ declare <1024 x i1> @llvm.ppc.mma.pmdmxvbf16gerx2(<256 x i1>, <16 x i8>, i32, i3
define void @test_pmdmxvbf16gerx2(ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-LABEL: test_pmdmxvbf16gerx2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lxv v2, 16(r3)
+; CHECK-NEXT: lxvp vsp34, 0(r3)
; CHECK-NEXT: lxv vs0, 0(r4)
-; CHECK-NEXT: lxv v3, 0(r3)
; CHECK-NEXT: pmdmxvbf16gerx2 dmr0, vsp34, vs0, 33, 5, 2
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
; CHECK-NEXT: stxvp vsp34, 96(r5)
@@ -547,9 +524,8 @@ define void @test_pmdmxvbf16gerx2(ptr %vpp, ptr %vcp, ptr %resp) {
;
; CHECK-BE-LABEL: test_pmdmxvbf16gerx2:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: lxv v2, 0(r3)
+; CHECK-BE-NEXT: lxvp vsp34, 0(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r4)
-; CHECK-BE-NEXT: lxv v3, 16(r3)
; CHECK-BE-NEXT: pmdmxvbf16gerx2 dmr0, vsp34, vs0, 33, 5, 2
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
; CHECK-BE-NEXT: stxvp vsp36, 96(r5)
@@ -575,8 +551,7 @@ define void @test_pmdmxvbf16gerx2pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -595,8 +570,7 @@ define void @test_pmdmxvbf16gerx2pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -626,8 +600,7 @@ define void @test_pmdmxvbf16gerx2pn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -646,8 +619,7 @@ define void @test_pmdmxvbf16gerx2pn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -677,8 +649,7 @@ define void @test_pmdmxvbf16gerx2np(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -697,8 +668,7 @@ define void @test_pmdmxvbf16gerx2np(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -728,8 +698,7 @@ define void @test_pmdmxvbf16gerx2nn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -748,8 +717,7 @@ define void @test_pmdmxvbf16gerx2nn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -774,9 +742,8 @@ declare <1024 x i1> @llvm.ppc.mma.dmxvf16gerx2(<256 x i1>, <16 x i8>)
define void @test_dmxvf16gerx2(ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-LABEL: test_dmxvf16gerx2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lxv v2, 16(r3)
+; CHECK-NEXT: lxvp vsp34, 0(r3)
; CHECK-NEXT: lxv vs0, 0(r4)
-; CHECK-NEXT: lxv v3, 0(r3)
; CHECK-NEXT: dmxvf16gerx2 dmr0, vsp34, vs0
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
; CHECK-NEXT: stxvp vsp34, 96(r5)
@@ -788,9 +755,8 @@ define void @test_dmxvf16gerx2(ptr %vpp, ptr %vcp, ptr %resp) {
;
; CHECK-BE-LABEL: test_dmxvf16gerx2:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: lxv v2, 0(r3)
+; CHECK-BE-NEXT: lxvp vsp34, 0(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r4)
-; CHECK-BE-NEXT: lxv v3, 16(r3)
; CHECK-BE-NEXT: dmxvf16gerx2 dmr0, vsp34, vs0
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
; CHECK-BE-NEXT: stxvp vsp36, 96(r5)
@@ -816,8 +782,7 @@ define void @test_dmxvf16gerx2pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -836,8 +801,7 @@ define void @test_dmxvf16gerx2pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -867,8 +831,7 @@ define void @test_dmxvf16gerx2pn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -887,8 +850,7 @@ define void @test_dmxvf16gerx2pn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -918,8 +880,7 @@ define void @test_dmxvf16gerx2np(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -938,8 +899,7 @@ define void @test_dmxvf16gerx2np(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -969,8 +929,7 @@ define void @test_dmxvf16gerx2nn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -989,8 +948,7 @@ define void @test_dmxvf16gerx2nn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -1016,9 +974,8 @@ declare <1024 x i1> @llvm.ppc.mma.pmdmxvf16gerx2(<256 x i1>, <16 x i8>, i32, i32
define void @test_pmdmxvf16gerx2(ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-LABEL: test_pmdmxvf16gerx2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lxv v2, 16(r3)
+; CHECK-NEXT: lxvp vsp34, 0(r3)
; CHECK-NEXT: lxv vs0, 0(r4)
-; CHECK-NEXT: lxv v3, 0(r3)
; CHECK-NEXT: pmdmxvf16gerx2 dmr0, vsp34, vs0, 33, 5, 2
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
; CHECK-NEXT: stxvp vsp34, 96(r5)
@@ -1030,9 +987,8 @@ define void @test_pmdmxvf16gerx2(ptr %vpp, ptr %vcp, ptr %resp) {
;
; CHECK-BE-LABEL: test_pmdmxvf16gerx2:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: lxv v2, 0(r3)
+; CHECK-BE-NEXT: lxvp vsp34, 0(r3)
; CHECK-BE-NEXT: lxv vs0, 0(r4)
-; CHECK-BE-NEXT: lxv v3, 16(r3)
; CHECK-BE-NEXT: pmdmxvf16gerx2 dmr0, vsp34, vs0, 33, 5, 2
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
; CHECK-BE-NEXT: stxvp vsp36, 96(r5)
@@ -1058,8 +1014,7 @@ define void @test_pmdmxvf16gerx2pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -1078,8 +1033,7 @@ define void @test_pmdmxvf16gerx2pp(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -1109,8 +1063,7 @@ define void @test_pmdmxvf16gerx2pn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -1129,8 +1082,7 @@ define void @test_pmdmxvf16gerx2pn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -1160,8 +1112,7 @@ define void @test_pmdmxvf16gerx2np(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -1180,8 +1131,7 @@ define void @test_pmdmxvf16gerx2np(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -1211,8 +1161,7 @@ define void @test_pmdmxvf16gerx2nn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -1231,8 +1180,7 @@ define void @test_pmdmxvf16gerx2nn(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) {
; CHECK-BE-NEXT: lxvp vsp36, 64(r3)
; CHECK-BE-NEXT: lxvp vsp32, 32(r3)
; CHECK-BE-NEXT: lxvp vsp38, 0(r3)
-; CHECK-BE-NEXT: lxv v8, 0(r4)
-; CHECK-BE-NEXT: lxv v9, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp40, 0(r4)
; CHECK-BE-NEXT: lxv vs0, 0(r5)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
diff --git a/llvm/test/CodeGen/PowerPC/dmr-copy.ll b/llvm/test/CodeGen/PowerPC/dmr-copy.ll
index 925e130f457eb..babcd8f99f694 100644
--- a/llvm/test/CodeGen/PowerPC/dmr-copy.ll
+++ b/llvm/test/CodeGen/PowerPC/dmr-copy.ll
@@ -37,12 +37,9 @@ define void @test_wacc_copy(ptr noundef %vdmrp, ptr noundef %vpp, <16 x i8> noun
; CHECK-NEXT: stxvp vsp34, 160(r31)
; CHECK-NEXT: stxvp vsp36, 128(r31)
; CHECK-NEXT: ld r3, 352(r31)
-; CHECK-NEXT: lxv v2, 16(r3)
-; CHECK-NEXT: lxv v3, 0(r3)
-; CHECK-NEXT: stxv v2, 112(r31)
-; CHECK-NEXT: stxv v3, 96(r31)
-; CHECK-NEXT: lxv v2, 112(r31)
-; CHECK-NEXT: lxv v3, 96(r31)
+; CHECK-NEXT: lxvp vsp34, 0(r3)
+; CHECK-NEXT: stxvp vsp34, 96(r31)
+; CHECK-NEXT: lxvp vsp34, 96(r31)
; CHECK-NEXT: lxv vs0, 336(r31)
; CHECK-NEXT: dmxvi8gerx4 dmr0, vsp34, vs0
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -96,12 +93,9 @@ define void @test_wacc_copy(ptr noundef %vdmrp, ptr noundef %vpp, <16 x i8> noun
; CHECK-BE-NEXT: stxvp vsp36, 160(r31)
; CHECK-BE-NEXT: stxvp vsp34, 128(r31)
; CHECK-BE-NEXT: ld r3, 352(r31)
-; CHECK-BE-NEXT: lxv v2, 0(r3)
-; CHECK-BE-NEXT: lxv v3, 16(r3)
-; CHECK-BE-NEXT: stxv v3, 112(r31)
-; CHECK-BE-NEXT: stxv v2, 96(r31)
-; CHECK-BE-NEXT: lxv v2, 96(r31)
-; CHECK-BE-NEXT: lxv v3, 112(r31)
+; CHECK-BE-NEXT: lxvp vsp34, 0(r3)
+; CHECK-BE-NEXT: stxvp vsp34, 96(r31)
+; CHECK-BE-NEXT: lxvp vsp34, 96(r31)
; CHECK-BE-NEXT: lxv vs0, 336(r31)
; CHECK-BE-NEXT: dmxvi8gerx4 dmr0, vsp34, vs0
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
diff --git a/llvm/test/CodeGen/PowerPC/dmr-enable.ll b/llvm/test/CodeGen/PowerPC/dmr-enable.ll
index 14842e8f71089..43145ae4afcc3 100644
--- a/llvm/test/CodeGen/PowerPC/dmr-enable.ll
+++ b/llvm/test/CodeGen/PowerPC/dmr-enable.ll
@@ -134,22 +134,18 @@ define void @text512(ptr %vp1, ptr %rp1, ptr %rp2, ptr %rp3, ptr %rp4) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: dmsetdmrz dmr0
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
-; CHECK-NEXT: stxv v2, 16(r4)
-; CHECK-NEXT: stxv v3, 0(r4)
+; CHECK-NEXT: stxvp vsp34, 0(r4)
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
-; CHECK-NEXT: stxv v2, 16(r6)
-; CHECK-NEXT: stxv v3, 0(r6)
+; CHECK-NEXT: stxvp vsp34, 0(r6)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: text512:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: dmsetdmrz dmr0
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
-; CHECK-BE-NEXT: stxv v3, 16(r4)
-; CHECK-BE-NEXT: stxv v2, 0(r4)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r4)
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
-; CHECK-BE-NEXT: stxv v3, 16(r6)
-; CHECK-BE-NEXT: stxv v2, 0(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r6)
; CHECK-BE-NEXT: blr
entry:
%z = call <1024 x i1> @llvm.ppc.dmsetdmrz()
@@ -167,34 +163,26 @@ define void @text256(ptr %vp1, ptr %rp1, ptr %rp2, ptr %rp3, ptr %rp4) {
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: dmsetdmrz dmr0
; CHECK-NEXT: dmxxextfdmr256 vsp34, dmr0, 0
-; CHECK-NEXT: stxv v2, 16(r4)
-; CHECK-NEXT: stxv v3, 0(r4)
+; CHECK-NEXT: stxvp vsp34, 0(r4)
; CHECK-NEXT: dmxxextfdmr256 vsp34, dmr0, 1
-; CHECK-NEXT: stxv v2, 16(r5)
-; CHECK-NEXT: stxv v3, 0(r5)
+; CHECK-NEXT: stxvp vsp34, 0(r5)
; CHECK-NEXT: dmxxextfdmr256 vsp34, dmr0, 2
-; CHECK-NEXT: stxv v2, 16(r6)
-; CHECK-NEXT: stxv v3, 0(r6)
+; CHECK-NEXT: stxvp vsp34, 0(r6)
; CHECK-NEXT: dmxxextfdmr256 vsp34, dmr0, 3
-; CHECK-NEXT: stxv v2, 16(r7)
-; CHECK-NEXT: stxv v3, 0(r7)
+; CHECK-NEXT: stxvp vsp34, 0(r7)
; CHECK-NEXT: blr
;
; CHECK-BE-LABEL: text256:
; CHECK-BE: # %bb.0: # %entry
; CHECK-BE-NEXT: dmsetdmrz dmr0
; CHECK-BE-NEXT: dmxxextfdmr256 vsp34, dmr0, 0
-; CHECK-BE-NEXT: stxv v3, 16(r4)
-; CHECK-BE-NEXT: stxv v2, 0(r4)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r4)
; CHECK-BE-NEXT: dmxxextfdmr256 vsp34, dmr0, 1
-; CHECK-BE-NEXT: stxv v3, 16(r5)
-; CHECK-BE-NEXT: stxv v2, 0(r5)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r5)
; CHECK-BE-NEXT: dmxxextfdmr256 vsp34, dmr0, 2
-; CHECK-BE-NEXT: stxv v3, 16(r6)
-; CHECK-BE-NEXT: stxv v2, 0(r6)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r6)
; CHECK-BE-NEXT: dmxxextfdmr256 vsp34, dmr0, 3
-; CHECK-BE-NEXT: stxv v3, 16(r7)
-; CHECK-BE-NEXT: stxv v2, 0(r7)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r7)
; CHECK-BE-NEXT: blr
entry:
%z = call <1024 x i1> @llvm.ppc.dmsetdmrz()
@@ -212,10 +200,8 @@ entry:
define void @tins512(ptr %vp1, ptr %vp2, ptr %vp3, ptr %vp4, ptr %rp1, ptr %rp2) {
; CHECK-LABEL: tins512:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lxv v2, 16(r3)
-; CHECK-NEXT: lxv v3, 0(r3)
-; CHECK-NEXT: lxv v4, 16(r4)
-; CHECK-NEXT: lxv v5, 0(r4)
+; CHECK-NEXT: lxvp vsp34, 0(r3)
+; CHECK-NEXT: lxvp vsp36, 0(r4)
; CHECK-NEXT: dmsetdmrz dmr0
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -224,10 +210,8 @@ define void @tins512(ptr %vp1, ptr %vp2, ptr %vp3, ptr %vp4, ptr %rp1, ptr %rp2)
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
; CHECK-NEXT: stxvp vsp34, 32(r7)
; CHECK-NEXT: stxvp vsp36, 0(r7)
-; CHECK-NEXT: lxv v2, 16(r5)
-; CHECK-NEXT: lxv v4, 16(r6)
-; CHECK-NEXT: lxv v3, 0(r5)
-; CHECK-NEXT: lxv v5, 0(r6)
+; CHECK-NEXT: lxvp vsp34, 0(r5)
+; CHECK-NEXT: lxvp vsp36, 0(r6)
; CHECK-NEXT: dmxxextfdmr512 vsp32, vsp38, wacc0, 0
; CHECK-NEXT: stxvp vsp32, 96(r8)
; CHECK-NEXT: stxvp vsp38, 64(r8)
@@ -239,10 +223,8 @@ define void @tins512(ptr %vp1, ptr %vp2, ptr %vp3, ptr %vp4, ptr %rp1, ptr %rp2)
;
; CHECK-BE-LABEL: tins512:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: lxv v2, 0(r3)
-; CHECK-BE-NEXT: lxv v4, 0(r4)
-; CHECK-BE-NEXT: lxv v3, 16(r3)
-; CHECK-BE-NEXT: lxv v5, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp34, 0(r3)
+; CHECK-BE-NEXT: lxvp vsp36, 0(r4)
; CHECK-BE-NEXT: dmsetdmrz dmr0
; CHECK-BE-NEXT: dmxxextfdmr512 vsp32, vsp38, wacc_hi0, 1
; CHECK-BE-NEXT: stxvp vsp38, 96(r7)
@@ -251,10 +233,8 @@ define void @tins512(ptr %vp1, ptr %vp2, ptr %vp3, ptr %vp4, ptr %rp1, ptr %rp2)
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
; CHECK-BE-NEXT: stxvp vsp36, 32(r7)
; CHECK-BE-NEXT: stxvp vsp34, 0(r7)
-; CHECK-BE-NEXT: lxv v2, 0(r5)
-; CHECK-BE-NEXT: lxv v4, 0(r6)
-; CHECK-BE-NEXT: lxv v3, 16(r5)
-; CHECK-BE-NEXT: lxv v5, 16(r6)
+; CHECK-BE-NEXT: lxvp vsp34, 0(r5)
+; CHECK-BE-NEXT: lxvp vsp36, 0(r6)
; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp34, vsp36, 1
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
; CHECK-BE-NEXT: stxvp vsp36, 96(r8)
@@ -279,8 +259,7 @@ entry:
define void @tins256(ptr %vp1, ptr %vp2, ptr %vp3, ptr %vp4, ptr %rp1, ptr %rp2, ptr %rp3, ptr %rp4) {
; CHECK-LABEL: tins256:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lxv v2, 16(r3)
-; CHECK-NEXT: lxv v3, 0(r3)
+; CHECK-NEXT: lxvp vsp34, 0(r3)
; CHECK-NEXT: dmsetdmrz dmr0
; CHECK-NEXT: dmxxinstdmr256 dmr0, vsp34, 0
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -289,8 +268,7 @@ define void @tins256(ptr %vp1, ptr %vp2, ptr %vp3, ptr %vp4, ptr %rp1, ptr %rp2,
; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
; CHECK-NEXT: stxvp vsp34, 32(r7)
; CHECK-NEXT: stxvp vsp36, 0(r7)
-; CHECK-NEXT: lxv v2, 16(r4)
-; CHECK-NEXT: lxv v3, 0(r4)
+; CHECK-NEXT: lxvp vsp34, 0(r4)
; CHECK-NEXT: dmxxinstdmr256 dmr0, vsp34, 1
; CHECK-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0
; CHECK-NEXT: stxvp vsp36, 96(r8)
@@ -316,8 +294,7 @@ define void @tins256(ptr %vp1, ptr %vp2, ptr %vp3, ptr %vp4, ptr %rp1, ptr %rp2,
;
; CHECK-BE-LABEL: tins256:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: lxv v2, 0(r3)
-; CHECK-BE-NEXT: lxv v3, 16(r3)
+; CHECK-BE-NEXT: lxvp vsp34, 0(r3)
; CHECK-BE-NEXT: dmsetdmrz dmr0
; CHECK-BE-NEXT: dmxxinstdmr256 dmr0, vsp34, 0
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
@@ -326,8 +303,7 @@ define void @tins256(ptr %vp1, ptr %vp2, ptr %vp3, ptr %vp4, ptr %rp1, ptr %rp2,
; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
; CHECK-BE-NEXT: stxvp vsp36, 32(r7)
; CHECK-BE-NEXT: stxvp vsp34, 0(r7)
-; CHECK-BE-NEXT: lxv v2, 0(r4)
-; CHECK-BE-NEXT: lxv v3, 16(r4)
+; CHECK-BE-NEXT: lxvp vsp34, 0(r4)
; CHECK-BE-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc_hi0, 1
; CHECK-BE-NEXT: stxvp vsp32, 96(r8)
; CHECK-BE-NEXT: stxvp vsp36, 64(r8)
diff --git a/llvm/test/CodeGen/PowerPC/dmr-spill.ll b/llvm/test/CodeGen/PowerPC/dmr-spill.ll
index e1d388354e198..af275070e07fa 100644
--- a/llvm/test/CodeGen/PowerPC/dmr-spill.ll
+++ b/llvm/test/CodeGen/PowerPC/dmr-spill.ll
@@ -23,8 +23,7 @@ define void @spillDMRreg(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) nounwind {
; CHECK-NEXT: lxvp vsp36, 32(r3)
; CHECK-NEXT: lxvp vsp32, 64(r3)
; CHECK-NEXT: lxvp vsp38, 96(r3)
-; CHECK-NEXT: lxv v8, 16(r4)
-; CHECK-NEXT: lxv v9, 0(r4)
+; CHECK-NEXT: lxvp vsp40, 0(r4)
; CHECK-NEXT: lxv vs0, 0(r5)
; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -65,8 +64,7 @@ define void @spillDMRreg(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) nounwind {
; AIX-NEXT: lxvp vsp36, 64(r3)
; AIX-NEXT: lxvp vsp32, 32(r3)
; AIX-NEXT: lxvp vsp38, 0(r3)
-; AIX-NEXT: lxv v8, 0(r4)
-; AIX-NEXT: lxv v9, 16(r4)
+; AIX-NEXT: lxvp vsp40, 0(r4)
; AIX-NEXT: lxv vs0, 0(r5)
; AIX-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; AIX-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
@@ -108,8 +106,7 @@ define void @spillDMRreg(ptr %vop, ptr %vpp, ptr %vcp, ptr %resp) nounwind {
; AIX32-NEXT: lxvp vsp36, 64(r3)
; AIX32-NEXT: lxvp vsp32, 32(r3)
; AIX32-NEXT: lxvp vsp38, 0(r3)
-; AIX32-NEXT: lxv v8, 0(r4)
-; AIX32-NEXT: lxv v9, 16(r4)
+; AIX32-NEXT: lxvp vsp40, 0(r4)
; AIX32-NEXT: lxv vs0, 0(r5)
; AIX32-NEXT: dmxxinstdmr512 wacc_hi0, vsp36, vsp34, 1
; AIX32-NEXT: dmxxinstdmr512 wacc0, vsp38, vsp32, 0
diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll b/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll
index 94121f09e36be..651759f4fb45a 100644
--- a/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-acc-copy-hints.ll
@@ -119,9 +119,8 @@ define void @testMultiply(ptr nocapture noundef readonly %a, ptr nocapture nound
; CHECK-LE-WACC-NEXT: bl _Z15buildVectorPairPu13__vector_pairDv16_hS0_@notoc
; CHECK-LE-WACC-NEXT: dmxxsetaccz wacc0
; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, v31, v30
-; CHECK-LE-WACC-NEXT: lxv vs0, 48(r1)
-; CHECK-LE-WACC-NEXT: lxv vs1, 32(r1)
-; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, vs1, vs0
+; CHECK-LE-WACC-NEXT: lxvp vsp34, 32(r1)
+; CHECK-LE-WACC-NEXT: xvf32gerpp wacc0, v3, v2
; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp36, vsp34, wacc0, 0
; CHECK-LE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp36, vsp34, 0
; CHECK-LE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -160,9 +159,8 @@ define void @testMultiply(ptr nocapture noundef readonly %a, ptr nocapture nound
; CHECK-BE-WACC-NEXT: nop
; CHECK-BE-WACC-NEXT: dmxxsetaccz wacc0
; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v31, v30
-; CHECK-BE-WACC-NEXT: lxv vs0, 128(r1)
-; CHECK-BE-WACC-NEXT: lxv vs1, 144(r1)
-; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, vs0, vs1
+; CHECK-BE-WACC-NEXT: lxvp vsp34, 128(r1)
+; CHECK-BE-WACC-NEXT: xvf32gerpp wacc0, v2, v3
; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
; CHECK-BE-WACC-NEXT: vmr v1, v2
; CHECK-BE-WACC-NEXT: vmr v7, v4
diff --git a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
index bc5d5bed36e9b..8d600ea209f19 100644
--- a/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-acc-memops.ll
@@ -490,10 +490,10 @@ define dso_local void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) {
;
; LE-PAIRED-WACC-LABEL: testLdStPair:
; LE-PAIRED-WACC: # %bb.0: # %entry
-; LE-PAIRED-WACC-NEXT: plxv vs0, g@PCREL+48(0), 1
-; LE-PAIRED-WACC-NEXT: plxv vs1, g@PCREL+32(0), 1
-; LE-PAIRED-WACC-NEXT: pstxv vs0, g@PCREL+80(0), 1
-; LE-PAIRED-WACC-NEXT: pstxv vs1, g@PCREL+64(0), 1
+; LE-PAIRED-WACC-NEXT: paddi r3, 0, g@PCREL+32, 1
+; LE-PAIRED-WACC-NEXT: lxvpx vsp34, 0, r3
+; LE-PAIRED-WACC-NEXT: paddi r3, 0, g@PCREL+64, 1
+; LE-PAIRED-WACC-NEXT: stxvpx vsp34, 0, r3
; LE-PAIRED-WACC-NEXT: blr
;
; BE-PAIRED-LABEL: testLdStPair:
@@ -510,10 +510,8 @@ define dso_local void @testLdStPair(i64 %SrcIdx, i64 %DstIdx) {
; BE-PAIRED-WACC: # %bb.0: # %entry
; BE-PAIRED-WACC-NEXT: addis r3, r2, g@toc@ha
; BE-PAIRED-WACC-NEXT: addi r3, r3, g@toc@l
-; BE-PAIRED-WACC-NEXT: lxv vs0, 48(r3)
-; BE-PAIRED-WACC-NEXT: lxv vs1, 32(r3)
-; BE-PAIRED-WACC-NEXT: stxv vs0, 80(r3)
-; BE-PAIRED-WACC-NEXT: stxv vs1, 64(r3)
+; BE-PAIRED-WACC-NEXT: lxvp vsp34, 32(r3)
+; BE-PAIRED-WACC-NEXT: stxvp vsp34, 64(r3)
; BE-PAIRED-WACC-NEXT: blr
;
; LE-PWR9-LABEL: testLdStPair:
@@ -589,13 +587,9 @@ define dso_local void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) {
; LE-PAIRED-WACC: # %bb.0: # %entry
; LE-PAIRED-WACC-NEXT: sldi r3, r3, 5
; LE-PAIRED-WACC-NEXT: paddi r5, 0, g@PCREL, 1
-; LE-PAIRED-WACC-NEXT: add r6, r5, r3
-; LE-PAIRED-WACC-NEXT: lxvx vs0, r5, r3
-; LE-PAIRED-WACC-NEXT: lxv vs1, 16(r6)
+; LE-PAIRED-WACC-NEXT: lxvpx vsp34, r5, r3
; LE-PAIRED-WACC-NEXT: sldi r3, r4, 5
-; LE-PAIRED-WACC-NEXT: add r4, r5, r3
-; LE-PAIRED-WACC-NEXT: stxvx vs0, r5, r3
-; LE-PAIRED-WACC-NEXT: stxv vs1, 16(r4)
+; LE-PAIRED-WACC-NEXT: stxvpx vsp34, r5, r3
; LE-PAIRED-WACC-NEXT: blr
;
; BE-PAIRED-LABEL: testXLdStPair:
@@ -617,13 +611,9 @@ define dso_local void @testXLdStPair(i64 %SrcIdx, i64 %DstIdx) {
; BE-PAIRED-WACC-NEXT: addis r5, r2, g@toc@ha
; BE-PAIRED-WACC-NEXT: sldi r3, r3, 5
; BE-PAIRED-WACC-NEXT: addi r5, r5, g@toc@l
-; BE-PAIRED-WACC-NEXT: add r6, r5, r3
-; BE-PAIRED-WACC-NEXT: lxvx vs0, r5, r3
-; BE-PAIRED-WACC-NEXT: lxv vs1, 16(r6)
+; BE-PAIRED-WACC-NEXT: lxvpx vsp34, r5, r3
; BE-PAIRED-WACC-NEXT: sldi r3, r4, 5
-; BE-PAIRED-WACC-NEXT: add r4, r5, r3
-; BE-PAIRED-WACC-NEXT: stxvx vs0, r5, r3
-; BE-PAIRED-WACC-NEXT: stxv vs1, 16(r4)
+; BE-PAIRED-WACC-NEXT: stxvpx vsp34, r5, r3
; BE-PAIRED-WACC-NEXT: blr
;
; LE-PWR9-LABEL: testXLdStPair:
@@ -702,10 +692,10 @@ define dso_local void @testUnalignedLdStPair() {
;
; LE-PAIRED-WACC-LABEL: testUnalignedLdStPair:
; LE-PAIRED-WACC: # %bb.0: # %entry
-; LE-PAIRED-WACC-NEXT: plxv vs0, g@PCREL+27(0), 1
-; LE-PAIRED-WACC-NEXT: plxv vs1, g@PCREL+11(0), 1
-; LE-PAIRED-WACC-NEXT: pstxv vs0, g@PCREL+35(0), 1
-; LE-PAIRED-WACC-NEXT: pstxv vs1, g@PCREL+19(0), 1
+; LE-PAIRED-WACC-NEXT: paddi r3, 0, g@PCREL+11, 1
+; LE-PAIRED-WACC-NEXT: lxvpx vsp34, 0, r3
+; LE-PAIRED-WACC-NEXT: paddi r3, 0, g@PCREL+19, 1
+; LE-PAIRED-WACC-NEXT: stxvpx vsp34, 0, r3
; LE-PAIRED-WACC-NEXT: blr
;
; BE-PAIRED-LABEL: testUnalignedLdStPair:
@@ -722,10 +712,8 @@ define dso_local void @testUnalignedLdStPair() {
; BE-PAIRED-WACC: # %bb.0: # %entry
; BE-PAIRED-WACC-NEXT: addis r3, r2, g@toc@ha
; BE-PAIRED-WACC-NEXT: addi r3, r3, g@toc@l
-; BE-PAIRED-WACC-NEXT: plxv vs0, 27(r3), 0
-; BE-PAIRED-WACC-NEXT: plxv vs1, 11(r3), 0
-; BE-PAIRED-WACC-NEXT: pstxv vs0, 35(r3), 0
-; BE-PAIRED-WACC-NEXT: pstxv vs1, 19(r3), 0
+; BE-PAIRED-WACC-NEXT: plxvp vsp34, 11(r3), 0
+; BE-PAIRED-WACC-NEXT: pstxvp vsp34, 19(r3), 0
; BE-PAIRED-WACC-NEXT: blr
;
; LE-PWR9-LABEL: testUnalignedLdStPair:
diff --git a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll
index 35f35706b8690..d0a8a1c58c80e 100644
--- a/llvm/test/CodeGen/PowerPC/mma-outer-product.ll
+++ b/llvm/test/CodeGen/PowerPC/mma-outer-product.ll
@@ -2280,8 +2280,7 @@ define void @test33(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
;
; CHECK-WACC-LABEL: test33:
; CHECK-WACC: # %bb.0: # %entry
-; CHECK-WACC-NEXT: lxv v4, 16(r4)
-; CHECK-WACC-NEXT: lxv v5, 0(r4)
+; CHECK-WACC-NEXT: lxvp vsp36, 0(r4)
; CHECK-WACC-NEXT: xvf64ger wacc0, vsp36, v2
; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
; CHECK-WACC-NEXT: stxv v4, 48(r7)
@@ -2292,8 +2291,7 @@ define void @test33(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
;
; CHECK-BE-WACC-LABEL: test33:
; CHECK-BE-WACC: # %bb.0: # %entry
-; CHECK-BE-WACC-NEXT: lxv v4, 0(r4)
-; CHECK-BE-WACC-NEXT: lxv v5, 16(r4)
+; CHECK-BE-WACC-NEXT: lxvp vsp36, 0(r4)
; CHECK-BE-WACC-NEXT: xvf64ger wacc0, vsp36, v2
; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
@@ -2352,8 +2350,7 @@ define void @test34(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-WACC-NEXT: lxv v1, 32(r3)
; CHECK-WACC-NEXT: lxv v4, 16(r3)
; CHECK-WACC-NEXT: lxv v0, 48(r3)
-; CHECK-WACC-NEXT: lxv v6, 16(r4)
-; CHECK-WACC-NEXT: lxv v7, 0(r4)
+; CHECK-WACC-NEXT: lxvp vsp38, 0(r4)
; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
; CHECK-WACC-NEXT: xvf64gerpp wacc0, vsp38, v2
; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -2369,8 +2366,7 @@ define void @test34(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
-; CHECK-BE-WACC-NEXT: lxv v6, 0(r4)
-; CHECK-BE-WACC-NEXT: lxv v7, 16(r4)
+; CHECK-BE-WACC-NEXT: lxvp vsp38, 0(r4)
; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
; CHECK-BE-WACC-NEXT: xvf64gerpp wacc0, vsp38, v2
; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -2431,8 +2427,7 @@ define void @test35(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-WACC-NEXT: lxv v1, 32(r3)
; CHECK-WACC-NEXT: lxv v4, 16(r3)
; CHECK-WACC-NEXT: lxv v0, 48(r3)
-; CHECK-WACC-NEXT: lxv v6, 16(r4)
-; CHECK-WACC-NEXT: lxv v7, 0(r4)
+; CHECK-WACC-NEXT: lxvp vsp38, 0(r4)
; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
; CHECK-WACC-NEXT: xvf64gerpn wacc0, vsp38, v2
; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -2448,8 +2443,7 @@ define void @test35(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
-; CHECK-BE-WACC-NEXT: lxv v6, 0(r4)
-; CHECK-BE-WACC-NEXT: lxv v7, 16(r4)
+; CHECK-BE-WACC-NEXT: lxvp vsp38, 0(r4)
; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
; CHECK-BE-WACC-NEXT: xvf64gerpn wacc0, vsp38, v2
; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -2510,8 +2504,7 @@ define void @test36(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-WACC-NEXT: lxv v1, 32(r3)
; CHECK-WACC-NEXT: lxv v4, 16(r3)
; CHECK-WACC-NEXT: lxv v0, 48(r3)
-; CHECK-WACC-NEXT: lxv v6, 16(r4)
-; CHECK-WACC-NEXT: lxv v7, 0(r4)
+; CHECK-WACC-NEXT: lxvp vsp38, 0(r4)
; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
; CHECK-WACC-NEXT: xvf64gernp wacc0, vsp38, v2
; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -2527,8 +2520,7 @@ define void @test36(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
-; CHECK-BE-WACC-NEXT: lxv v6, 0(r4)
-; CHECK-BE-WACC-NEXT: lxv v7, 16(r4)
+; CHECK-BE-WACC-NEXT: lxvp vsp38, 0(r4)
; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
; CHECK-BE-WACC-NEXT: xvf64gernp wacc0, vsp38, v2
; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -2589,8 +2581,7 @@ define void @test37(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-WACC-NEXT: lxv v1, 32(r3)
; CHECK-WACC-NEXT: lxv v4, 16(r3)
; CHECK-WACC-NEXT: lxv v0, 48(r3)
-; CHECK-WACC-NEXT: lxv v6, 16(r4)
-; CHECK-WACC-NEXT: lxv v7, 0(r4)
+; CHECK-WACC-NEXT: lxvp vsp38, 0(r4)
; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
; CHECK-WACC-NEXT: xvf64gernn wacc0, vsp38, v2
; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -2606,8 +2597,7 @@ define void @test37(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
-; CHECK-BE-WACC-NEXT: lxv v6, 0(r4)
-; CHECK-BE-WACC-NEXT: lxv v7, 16(r4)
+; CHECK-BE-WACC-NEXT: lxvp vsp38, 0(r4)
; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
; CHECK-BE-WACC-NEXT: xvf64gernn wacc0, vsp38, v2
; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -2654,8 +2644,7 @@ define void @test38(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
;
; CHECK-WACC-LABEL: test38:
; CHECK-WACC: # %bb.0: # %entry
-; CHECK-WACC-NEXT: lxv v4, 16(r4)
-; CHECK-WACC-NEXT: lxv v5, 0(r4)
+; CHECK-WACC-NEXT: lxvp vsp36, 0(r4)
; CHECK-WACC-NEXT: pmxvf64ger wacc0, vsp36, v2, 0, 0
; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
; CHECK-WACC-NEXT: stxv v4, 48(r7)
@@ -2666,8 +2655,7 @@ define void @test38(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
;
; CHECK-BE-WACC-LABEL: test38:
; CHECK-BE-WACC: # %bb.0: # %entry
-; CHECK-BE-WACC-NEXT: lxv v4, 0(r4)
-; CHECK-BE-WACC-NEXT: lxv v5, 16(r4)
+; CHECK-BE-WACC-NEXT: lxvp vsp36, 0(r4)
; CHECK-BE-WACC-NEXT: pmxvf64ger wacc0, vsp36, v2, 0, 0
; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
; CHECK-BE-WACC-NEXT: stxv v5, 48(r7)
@@ -2726,8 +2714,7 @@ define void @test39(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-WACC-NEXT: lxv v1, 32(r3)
; CHECK-WACC-NEXT: lxv v4, 16(r3)
; CHECK-WACC-NEXT: lxv v0, 48(r3)
-; CHECK-WACC-NEXT: lxv v6, 16(r4)
-; CHECK-WACC-NEXT: lxv v7, 0(r4)
+; CHECK-WACC-NEXT: lxvp vsp38, 0(r4)
; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
; CHECK-WACC-NEXT: pmxvf64gerpp wacc0, vsp38, v2, 0, 0
; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -2743,8 +2730,7 @@ define void @test39(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
-; CHECK-BE-WACC-NEXT: lxv v6, 0(r4)
-; CHECK-BE-WACC-NEXT: lxv v7, 16(r4)
+; CHECK-BE-WACC-NEXT: lxvp vsp38, 0(r4)
; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
; CHECK-BE-WACC-NEXT: pmxvf64gerpp wacc0, vsp38, v2, 0, 0
; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -2805,8 +2791,7 @@ define void @test40(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-WACC-NEXT: lxv v1, 32(r3)
; CHECK-WACC-NEXT: lxv v4, 16(r3)
; CHECK-WACC-NEXT: lxv v0, 48(r3)
-; CHECK-WACC-NEXT: lxv v6, 16(r4)
-; CHECK-WACC-NEXT: lxv v7, 0(r4)
+; CHECK-WACC-NEXT: lxvp vsp38, 0(r4)
; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
; CHECK-WACC-NEXT: pmxvf64gerpn wacc0, vsp38, v2, 0, 0
; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -2822,8 +2807,7 @@ define void @test40(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
-; CHECK-BE-WACC-NEXT: lxv v6, 0(r4)
-; CHECK-BE-WACC-NEXT: lxv v7, 16(r4)
+; CHECK-BE-WACC-NEXT: lxvp vsp38, 0(r4)
; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
; CHECK-BE-WACC-NEXT: pmxvf64gerpn wacc0, vsp38, v2, 0, 0
; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -2884,8 +2868,7 @@ define void @test41(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-WACC-NEXT: lxv v1, 32(r3)
; CHECK-WACC-NEXT: lxv v4, 16(r3)
; CHECK-WACC-NEXT: lxv v0, 48(r3)
-; CHECK-WACC-NEXT: lxv v6, 16(r4)
-; CHECK-WACC-NEXT: lxv v7, 0(r4)
+; CHECK-WACC-NEXT: lxvp vsp38, 0(r4)
; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
; CHECK-WACC-NEXT: pmxvf64gernp wacc0, vsp38, v2, 0, 0
; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -2901,8 +2884,7 @@ define void @test41(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
-; CHECK-BE-WACC-NEXT: lxv v6, 0(r4)
-; CHECK-BE-WACC-NEXT: lxv v7, 16(r4)
+; CHECK-BE-WACC-NEXT: lxvp vsp38, 0(r4)
; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
; CHECK-BE-WACC-NEXT: pmxvf64gernp wacc0, vsp38, v2, 0, 0
; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -2963,8 +2945,7 @@ define void @test42(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-WACC-NEXT: lxv v1, 32(r3)
; CHECK-WACC-NEXT: lxv v4, 16(r3)
; CHECK-WACC-NEXT: lxv v0, 48(r3)
-; CHECK-WACC-NEXT: lxv v6, 16(r4)
-; CHECK-WACC-NEXT: lxv v7, 0(r4)
+; CHECK-WACC-NEXT: lxvp vsp38, 0(r4)
; CHECK-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
; CHECK-WACC-NEXT: pmxvf64gernn wacc0, vsp38, v2, 0, 0
; CHECK-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
@@ -2980,8 +2961,7 @@ define void @test42(ptr %vqp, ptr %vpp, <16 x i8> %vc, ptr %resp) {
; CHECK-BE-WACC-NEXT: lxv v1, 16(r3)
; CHECK-BE-WACC-NEXT: lxv v4, 32(r3)
; CHECK-BE-WACC-NEXT: lxv v0, 0(r3)
-; CHECK-BE-WACC-NEXT: lxv v6, 0(r4)
-; CHECK-BE-WACC-NEXT: lxv v7, 16(r4)
+; CHECK-BE-WACC-NEXT: lxvp vsp38, 0(r4)
; CHECK-BE-WACC-NEXT: dmxxinstdmr512 wacc0, vsp32, vsp36, 0
; CHECK-BE-WACC-NEXT: pmxvf64gernn wacc0, vsp38, v2, 0, 0
; CHECK-BE-WACC-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
diff --git a/llvm/test/CodeGen/PowerPC/vsx-ldst-with-length.ll b/llvm/test/CodeGen/PowerPC/vsx-ldst-with-length.ll
index e7bc8fbca3202..35aaadc13aba7 100644
--- a/llvm/test/CodeGen/PowerPC/vsx-ldst-with-length.ll
+++ b/llvm/test/CodeGen/PowerPC/vsx-ldst-with-length.ll
@@ -78,15 +78,13 @@ define <256 x i1> @testLXVPRL(ptr %vpp, i64 %b) {
; CHECK-LABEL: testLXVPRL:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxvprl vsp34, r4, r5
-; CHECK-NEXT: stxv v2, 16(r3)
-; CHECK-NEXT: stxv v3, 0(r3)
+; CHECK-NEXT: stxvp vsp34, 0(r3)
; CHECK-NEXT: blr
;
; AIX-LABEL: testLXVPRL:
; AIX: # %bb.0: # %entry
; AIX-NEXT: lxvprl vsp34, r4, r5
-; AIX-NEXT: stxv v3, 16(r3)
-; AIX-NEXT: stxv v2, 0(r3)
+; AIX-NEXT: stxvp vsp34, 0(r3)
; AIX-NEXT: blr
entry:
%0 = tail call <256 x i1> @llvm.ppc.vsx.lxvprl(ptr %vpp, i64 %b)
@@ -98,15 +96,13 @@ define <256 x i1> @testLXVPRLL(ptr %vpp, i64 %b) {
; CHECK-LABEL: testLXVPRLL:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: lxvprll vsp34, r4, r5
-; CHECK-NEXT: stxv v2, 16(r3)
-; CHECK-NEXT: stxv v3, 0(r3)
+; CHECK-NEXT: stxvp vsp34, 0(r3)
; CHECK-NEXT: blr
;
; AIX-LABEL: testLXVPRLL:
; AIX: # %bb.0: # %entry
; AIX-NEXT: lxvprll vsp34, r4, r5
-; AIX-NEXT: stxv v3, 16(r3)
-; AIX-NEXT: stxv v2, 0(r3)
+; AIX-NEXT: stxvp vsp34, 0(r3)
; AIX-NEXT: blr
entry:
%0 = tail call <256 x i1> @llvm.ppc.vsx.lxvprll(ptr %vpp, i64 %b)
@@ -117,15 +113,13 @@ declare <256 x i1> @llvm.ppc.vsx.lxvprll(ptr, i64)
define void @testSTXVPRL(ptr %v, ptr %vp, i64 %len) {
; CHECK-LABEL: testSTXVPRL:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lxv v2, 16(r3)
-; CHECK-NEXT: lxv v3, 0(r3)
+; CHECK-NEXT: lxvp vsp34, 0(r3)
; CHECK-NEXT: stxvprl vsp34, r4, r5
; CHECK-NEXT: blr
;
; AIX-LABEL: testSTXVPRL:
; AIX: # %bb.0: # %entry
-; AIX-NEXT: lxv v2, 0(r3)
-; AIX-NEXT: lxv v3, 16(r3)
+; AIX-NEXT: lxvp vsp34, 0(r3)
; AIX-NEXT: stxvprl vsp34, r4, r5
; AIX-NEXT: blr
entry:
@@ -138,15 +132,13 @@ declare void @llvm.ppc.vsx.stxvprl(<256 x i1>, ptr, i64)
define void @testSTXVPRLL(ptr %v, ptr %vp, i64 %len) {
; CHECK-LABEL: testSTXVPRLL:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: lxv v2, 16(r3)
-; CHECK-NEXT: lxv v3, 0(r3)
+; CHECK-NEXT: lxvp vsp34, 0(r3)
; CHECK-NEXT: stxvprll vsp34, r4, r5
; CHECK-NEXT: blr
;
; AIX-LABEL: testSTXVPRLL:
; AIX: # %bb.0: # %entry
-; AIX-NEXT: lxv v2, 0(r3)
-; AIX-NEXT: lxv v3, 16(r3)
+; AIX-NEXT: lxvp vsp34, 0(r3)
; AIX-NEXT: stxvprll vsp34, r4, r5
; AIX-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 2b9558e0c69e3..2bbf67b04dff5 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -82,6 +82,9 @@
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Type Promotion
+; CHECK-NEXT: Post-Dominator Tree Construction
+; CHECK-NEXT: Branch Probability Analysis
+; CHECK-NEXT: Block Frequency Analysis
; CHECK-NEXT: CodeGen Prepare
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Exception handling preparation
diff --git a/llvm/test/CodeGen/RISCV/rv32p.ll b/llvm/test/CodeGen/RISCV/rv32p.ll
index fdc7d98e5d833..4d009ef9ca76b 100644
--- a/llvm/test/CodeGen/RISCV/rv32p.ll
+++ b/llvm/test/CodeGen/RISCV/rv32p.ll
@@ -186,15 +186,14 @@ define i64 @cls_i64(i64 %x) {
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: xor a0, a0, a2
; CHECK-NEXT: clz a0, a0
-; CHECK-NEXT: addi a2, a0, 32
+; CHECK-NEXT: addi a0, a0, 32
; CHECK-NEXT: j .LBB15_3
; CHECK-NEXT: .LBB15_2:
; CHECK-NEXT: xor a1, a1, a2
-; CHECK-NEXT: clz a2, a1
+; CHECK-NEXT: clz a0, a1
; CHECK-NEXT: .LBB15_3:
-; CHECK-NEXT: li a0, -1
-; CHECK-NEXT: mv a1, a0
-; CHECK-NEXT: waddau a0, a2, zero
+; CHECK-NEXT: li a1, 1
+; CHECK-NEXT: wsubu a0, a0, a1
; CHECK-NEXT: ret
%a = ashr i64 %x, 63
%b = xor i64 %x, %a
@@ -1210,3 +1209,36 @@ define i64 @wsubau_zext_chain_rev(i64 %acc, i32 %a, i32 %b) nounwind {
%sum = add i64 %sub, %ext_b
ret i64 %sum
}
+
+define i64 @waddu(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: waddu:
+; CHECK: # %bb.0:
+; CHECK-NEXT: waddu a0, a0, a1
+; CHECK-NEXT: ret
+ %ext_a = zext i32 %a to i64
+ %ext_b = zext i32 %b to i64
+ %sum = add i64 %ext_a, %ext_b
+ ret i64 %sum
+}
+
+define i64 @wsubu(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: wsubu:
+; CHECK: # %bb.0:
+; CHECK-NEXT: wsubu a0, a0, a1
+; CHECK-NEXT: ret
+ %ext_a = zext i32 %a to i64
+ %ext_b = zext i32 %b to i64
+ %diff = sub i64 %ext_a, %ext_b
+ ret i64 %diff
+}
+
+define i64 @wsub_from_neg_const(i32 %a) nounwind {
+; CHECK-LABEL: wsub_from_neg_const:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 42
+; CHECK-NEXT: wsubu a0, a0, a1
+; CHECK-NEXT: ret
+ %ext_a = zext i32 %a to i64
+ %sum = add i64 %ext_a, -42
+ ret i64 %sum
+}
diff --git a/llvm/test/CodeGen/RISCV/rv32zbb.ll b/llvm/test/CodeGen/RISCV/rv32zbb.ll
index b6cfa3c741209..dbd23052318b1 100644
--- a/llvm/test/CodeGen/RISCV/rv32zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zbb.ll
@@ -934,13 +934,77 @@ define i64 @sexth_i64(i64 %a) nounwind {
ret i64 %shr
}
+define i32 @sextb_extract_i32(i32 %x) {
+; CHECK-LABEL: sextb_extract_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 15
+; CHECK-NEXT: srai a0, a0, 24
+; CHECK-NEXT: ret
+ %a = lshr i32 %x, 9
+ %b = trunc i32 %a to i8
+ %c = sext i8 %b to i32
+ ret i32 %c
+}
+
+define i64 @sextb_extract_i64(i64 %x) {
+; RV32I-LABEL: sextb_extract_i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a1, a1, 11
+; RV32I-NEXT: srai a0, a1, 24
+; RV32I-NEXT: srai a1, a1, 31
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: sextb_extract_i64:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: slli a0, a1, 11
+; RV32ZBB-NEXT: srai a0, a0, 24
+; RV32ZBB-NEXT: srai a1, a0, 31
+; RV32ZBB-NEXT: ret
+ %a = lshr i64 %x, 45
+ %b = trunc i64 %a to i8
+ %c = sext i8 %b to i64
+ ret i64 %c
+}
+
+define i32 @sexth_extract_i32(i32 %x) {
+; CHECK-LABEL: sexth_extract_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: srai a0, a0, 16
+; CHECK-NEXT: ret
+ %a = lshr i32 %x, 13
+ %b = trunc i32 %a to i16
+ %c = sext i16 %b to i32
+ ret i32 %c
+}
+
+define i64 @sexth_extract_i64(i64 %x) {
+; RV32I-LABEL: sexth_extract_i64:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a1, a1, 11
+; RV32I-NEXT: srai a0, a1, 16
+; RV32I-NEXT: srai a1, a1, 31
+; RV32I-NEXT: ret
+;
+; RV32ZBB-LABEL: sexth_extract_i64:
+; RV32ZBB: # %bb.0:
+; RV32ZBB-NEXT: slli a0, a1, 11
+; RV32ZBB-NEXT: srai a0, a0, 16
+; RV32ZBB-NEXT: srai a1, a0, 31
+; RV32ZBB-NEXT: ret
+ %a = lshr i64 %x, 37
+ %b = trunc i64 %a to i16
+ %c = sext i16 %b to i64
+ ret i64 %c
+}
+
define i32 @min_i32(i32 %a, i32 %b) nounwind {
; RV32I-LABEL: min_i32:
; RV32I: # %bb.0:
-; RV32I-NEXT: blt a0, a1, .LBB28_2
+; RV32I-NEXT: blt a0, a1, .LBB32_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: mv a0, a1
-; RV32I-NEXT: .LBB28_2:
+; RV32I-NEXT: .LBB32_2:
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: min_i32:
@@ -960,18 +1024,18 @@ define i32 @min_i32(i32 %a, i32 %b) nounwind {
define i64 @min_i64(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: min_i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: beq a1, a3, .LBB29_2
+; CHECK-NEXT: beq a1, a3, .LBB33_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: slt a4, a1, a3
-; CHECK-NEXT: beqz a4, .LBB29_3
-; CHECK-NEXT: j .LBB29_4
-; CHECK-NEXT: .LBB29_2:
+; CHECK-NEXT: beqz a4, .LBB33_3
+; CHECK-NEXT: j .LBB33_4
+; CHECK-NEXT: .LBB33_2:
; CHECK-NEXT: sltu a4, a0, a2
-; CHECK-NEXT: bnez a4, .LBB29_4
-; CHECK-NEXT: .LBB29_3:
+; CHECK-NEXT: bnez a4, .LBB33_4
+; CHECK-NEXT: .LBB33_3:
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: mv a1, a3
-; CHECK-NEXT: .LBB29_4:
+; CHECK-NEXT: .LBB33_4:
; CHECK-NEXT: ret
%cmp = icmp slt i64 %a, %b
%cond = select i1 %cmp, i64 %a, i64 %b
@@ -981,10 +1045,10 @@ define i64 @min_i64(i64 %a, i64 %b) nounwind {
define i32 @max_i32(i32 %a, i32 %b) nounwind {
; RV32I-LABEL: max_i32:
; RV32I: # %bb.0:
-; RV32I-NEXT: blt a1, a0, .LBB30_2
+; RV32I-NEXT: blt a1, a0, .LBB34_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: mv a0, a1
-; RV32I-NEXT: .LBB30_2:
+; RV32I-NEXT: .LBB34_2:
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: max_i32:
@@ -1004,18 +1068,18 @@ define i32 @max_i32(i32 %a, i32 %b) nounwind {
define i64 @max_i64(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: max_i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: beq a1, a3, .LBB31_2
+; CHECK-NEXT: beq a1, a3, .LBB35_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: slt a4, a3, a1
-; CHECK-NEXT: beqz a4, .LBB31_3
-; CHECK-NEXT: j .LBB31_4
-; CHECK-NEXT: .LBB31_2:
+; CHECK-NEXT: beqz a4, .LBB35_3
+; CHECK-NEXT: j .LBB35_4
+; CHECK-NEXT: .LBB35_2:
; CHECK-NEXT: sltu a4, a2, a0
-; CHECK-NEXT: bnez a4, .LBB31_4
-; CHECK-NEXT: .LBB31_3:
+; CHECK-NEXT: bnez a4, .LBB35_4
+; CHECK-NEXT: .LBB35_3:
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: mv a1, a3
-; CHECK-NEXT: .LBB31_4:
+; CHECK-NEXT: .LBB35_4:
; CHECK-NEXT: ret
%cmp = icmp sgt i64 %a, %b
%cond = select i1 %cmp, i64 %a, i64 %b
@@ -1025,10 +1089,10 @@ define i64 @max_i64(i64 %a, i64 %b) nounwind {
define i32 @minu_i32(i32 %a, i32 %b) nounwind {
; RV32I-LABEL: minu_i32:
; RV32I: # %bb.0:
-; RV32I-NEXT: bltu a0, a1, .LBB32_2
+; RV32I-NEXT: bltu a0, a1, .LBB36_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: mv a0, a1
-; RV32I-NEXT: .LBB32_2:
+; RV32I-NEXT: .LBB36_2:
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: minu_i32:
@@ -1048,18 +1112,18 @@ define i32 @minu_i32(i32 %a, i32 %b) nounwind {
define i64 @minu_i64(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: minu_i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: beq a1, a3, .LBB33_2
+; CHECK-NEXT: beq a1, a3, .LBB37_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: sltu a4, a1, a3
-; CHECK-NEXT: beqz a4, .LBB33_3
-; CHECK-NEXT: j .LBB33_4
-; CHECK-NEXT: .LBB33_2:
+; CHECK-NEXT: beqz a4, .LBB37_3
+; CHECK-NEXT: j .LBB37_4
+; CHECK-NEXT: .LBB37_2:
; CHECK-NEXT: sltu a4, a0, a2
-; CHECK-NEXT: bnez a4, .LBB33_4
-; CHECK-NEXT: .LBB33_3:
+; CHECK-NEXT: bnez a4, .LBB37_4
+; CHECK-NEXT: .LBB37_3:
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: mv a1, a3
-; CHECK-NEXT: .LBB33_4:
+; CHECK-NEXT: .LBB37_4:
; CHECK-NEXT: ret
%cmp = icmp ult i64 %a, %b
%cond = select i1 %cmp, i64 %a, i64 %b
@@ -1069,10 +1133,10 @@ define i64 @minu_i64(i64 %a, i64 %b) nounwind {
define i32 @maxu_i32(i32 %a, i32 %b) nounwind {
; RV32I-LABEL: maxu_i32:
; RV32I: # %bb.0:
-; RV32I-NEXT: bltu a1, a0, .LBB34_2
+; RV32I-NEXT: bltu a1, a0, .LBB38_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: mv a0, a1
-; RV32I-NEXT: .LBB34_2:
+; RV32I-NEXT: .LBB38_2:
; RV32I-NEXT: ret
;
; RV32ZBB-LABEL: maxu_i32:
@@ -1092,18 +1156,18 @@ define i32 @maxu_i32(i32 %a, i32 %b) nounwind {
define i64 @maxu_i64(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: maxu_i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: beq a1, a3, .LBB35_2
+; CHECK-NEXT: beq a1, a3, .LBB39_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: sltu a4, a3, a1
-; CHECK-NEXT: beqz a4, .LBB35_3
-; CHECK-NEXT: j .LBB35_4
-; CHECK-NEXT: .LBB35_2:
+; CHECK-NEXT: beqz a4, .LBB39_3
+; CHECK-NEXT: j .LBB39_4
+; CHECK-NEXT: .LBB39_2:
; CHECK-NEXT: sltu a4, a2, a0
-; CHECK-NEXT: bnez a4, .LBB35_4
-; CHECK-NEXT: .LBB35_3:
+; CHECK-NEXT: bnez a4, .LBB39_4
+; CHECK-NEXT: .LBB39_3:
; CHECK-NEXT: mv a0, a2
; CHECK-NEXT: mv a1, a3
-; CHECK-NEXT: .LBB35_4:
+; CHECK-NEXT: .LBB39_4:
; CHECK-NEXT: ret
%cmp = icmp ugt i64 %a, %b
%cond = select i1 %cmp, i64 %a, i64 %b
@@ -1130,13 +1194,13 @@ define i32 @abs_i32(i32 %x) {
define i64 @abs_i64(i64 %x) {
; CHECK-LABEL: abs_i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: bgez a1, .LBB37_2
+; CHECK-NEXT: bgez a1, .LBB41_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: snez a2, a0
; CHECK-NEXT: neg a0, a0
; CHECK-NEXT: neg a1, a1
; CHECK-NEXT: sub a1, a1, a2
-; CHECK-NEXT: .LBB37_2:
+; CHECK-NEXT: .LBB41_2:
; CHECK-NEXT: ret
%abs = tail call i64 @llvm.abs.i64(i64 %x, i1 true)
ret i64 %abs
@@ -1430,13 +1494,13 @@ define i32 @sub_if_uge_i32(i32 %x, i32 %y) {
define i64 @sub_if_uge_i64(i64 %x, i64 %y) {
; CHECK-LABEL: sub_if_uge_i64:
; CHECK: # %bb.0:
-; CHECK-NEXT: beq a1, a3, .LBB52_2
+; CHECK-NEXT: beq a1, a3, .LBB56_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: sltu a4, a1, a3
-; CHECK-NEXT: j .LBB52_3
-; CHECK-NEXT: .LBB52_2:
+; CHECK-NEXT: j .LBB56_3
+; CHECK-NEXT: .LBB56_2:
; CHECK-NEXT: sltu a4, a0, a2
-; CHECK-NEXT: .LBB52_3:
+; CHECK-NEXT: .LBB56_3:
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: and a2, a4, a2
@@ -1460,29 +1524,29 @@ define i128 @sub_if_uge_i128(i128 %x, i128 %y) {
; CHECK-NEXT: lw a6, 4(a2)
; CHECK-NEXT: lw t0, 12(a2)
; CHECK-NEXT: lw a7, 8(a2)
-; CHECK-NEXT: beq a5, t0, .LBB53_2
+; CHECK-NEXT: beq a5, t0, .LBB57_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: sltu t1, a5, t0
-; CHECK-NEXT: j .LBB53_3
-; CHECK-NEXT: .LBB53_2:
+; CHECK-NEXT: j .LBB57_3
+; CHECK-NEXT: .LBB57_2:
; CHECK-NEXT: sltu t1, a4, a7
-; CHECK-NEXT: .LBB53_3:
+; CHECK-NEXT: .LBB57_3:
; CHECK-NEXT: lw a1, 0(a1)
; CHECK-NEXT: lw a2, 0(a2)
-; CHECK-NEXT: beq a3, a6, .LBB53_5
+; CHECK-NEXT: beq a3, a6, .LBB57_5
; CHECK-NEXT: # %bb.4:
; CHECK-NEXT: sltu t2, a3, a6
-; CHECK-NEXT: j .LBB53_6
-; CHECK-NEXT: .LBB53_5:
+; CHECK-NEXT: j .LBB57_6
+; CHECK-NEXT: .LBB57_5:
; CHECK-NEXT: sltu t2, a1, a2
-; CHECK-NEXT: .LBB53_6:
+; CHECK-NEXT: .LBB57_6:
; CHECK-NEXT: xor t3, a5, t0
; CHECK-NEXT: xor t4, a4, a7
; CHECK-NEXT: or t3, t4, t3
-; CHECK-NEXT: beqz t3, .LBB53_8
+; CHECK-NEXT: beqz t3, .LBB57_8
; CHECK-NEXT: # %bb.7:
; CHECK-NEXT: mv t2, t1
-; CHECK-NEXT: .LBB53_8:
+; CHECK-NEXT: .LBB57_8:
; CHECK-NEXT: addi t3, t2, -1
; CHECK-NEXT: and t2, t3, t0
; CHECK-NEXT: and t0, t3, a2
@@ -1490,10 +1554,10 @@ define i128 @sub_if_uge_i128(i128 %x, i128 %y) {
; CHECK-NEXT: sltu a2, a1, t0
; CHECK-NEXT: and a7, t3, a7
; CHECK-NEXT: mv a6, a2
-; CHECK-NEXT: beq a3, t1, .LBB53_10
+; CHECK-NEXT: beq a3, t1, .LBB57_10
; CHECK-NEXT: # %bb.9:
; CHECK-NEXT: sltu a6, a3, t1
-; CHECK-NEXT: .LBB53_10:
+; CHECK-NEXT: .LBB57_10:
; CHECK-NEXT: sub t3, a4, a7
; CHECK-NEXT: sltu a4, a4, a7
; CHECK-NEXT: sub a5, a5, t2
@@ -1538,12 +1602,12 @@ define i32 @sub_if_uge_multiuse_cmp_i32(i32 %x, i32 %y) {
; RV32I-NEXT: addi a2, a2, -1
; RV32I-NEXT: and a2, a2, a1
; RV32I-NEXT: sub a2, a0, a2
-; RV32I-NEXT: bltu a0, a1, .LBB55_2
+; RV32I-NEXT: bltu a0, a1, .LBB59_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: li a0, 4
; RV32I-NEXT: sll a0, a2, a0
; RV32I-NEXT: ret
-; RV32I-NEXT: .LBB55_2:
+; RV32I-NEXT: .LBB59_2:
; RV32I-NEXT: li a0, 2
; RV32I-NEXT: sll a0, a2, a0
; RV32I-NEXT: ret
@@ -1552,12 +1616,12 @@ define i32 @sub_if_uge_multiuse_cmp_i32(i32 %x, i32 %y) {
; RV32ZBB: # %bb.0:
; RV32ZBB-NEXT: sub a2, a0, a1
; RV32ZBB-NEXT: minu a2, a0, a2
-; RV32ZBB-NEXT: bltu a0, a1, .LBB55_2
+; RV32ZBB-NEXT: bltu a0, a1, .LBB59_2
; RV32ZBB-NEXT: # %bb.1:
; RV32ZBB-NEXT: li a0, 4
; RV32ZBB-NEXT: sll a0, a2, a0
; RV32ZBB-NEXT: ret
-; RV32ZBB-NEXT: .LBB55_2:
+; RV32ZBB-NEXT: .LBB59_2:
; RV32ZBB-NEXT: li a0, 2
; RV32ZBB-NEXT: sll a0, a2, a0
; RV32ZBB-NEXT: ret
@@ -1668,16 +1732,16 @@ define i64 @sub_if_uge_C_i64(i64 %x) {
; CHECK-LABEL: sub_if_uge_C_i64:
; CHECK: # %bb.0:
; CHECK-NEXT: li a2, 1
-; CHECK-NEXT: beq a1, a2, .LBB60_2
+; CHECK-NEXT: beq a1, a2, .LBB64_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: sltiu a2, a1, 2
; CHECK-NEXT: xori a2, a2, 1
-; CHECK-NEXT: j .LBB60_3
-; CHECK-NEXT: .LBB60_2:
+; CHECK-NEXT: j .LBB64_3
+; CHECK-NEXT: .LBB64_2:
; CHECK-NEXT: lui a2, 172127
; CHECK-NEXT: addi a2, a2, 511
; CHECK-NEXT: sltu a2, a2, a0
-; CHECK-NEXT: .LBB60_3:
+; CHECK-NEXT: .LBB64_3:
; CHECK-NEXT: neg a2, a2
; CHECK-NEXT: andi a3, a2, -2
; CHECK-NEXT: add a1, a1, a3
@@ -1737,10 +1801,10 @@ define i32 @sub_if_uge_C_multiuse_sub_i32(i32 signext %x, ptr %z) {
; RV32I-NEXT: add a2, a0, a2
; RV32I-NEXT: addi a3, a3, -16
; RV32I-NEXT: sw a2, 0(a1)
-; RV32I-NEXT: bltu a3, a0, .LBB62_2
+; RV32I-NEXT: bltu a3, a0, .LBB66_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: mv a2, a0
-; RV32I-NEXT: .LBB62_2:
+; RV32I-NEXT: .LBB66_2:
; RV32I-NEXT: mv a0, a2
; RV32I-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/rv64zbb.ll b/llvm/test/CodeGen/RISCV/rv64zbb.ll
index b3581459c2622..e840605710f21 100644
--- a/llvm/test/CodeGen/RISCV/rv64zbb.ll
+++ b/llvm/test/CodeGen/RISCV/rv64zbb.ll
@@ -1123,13 +1123,61 @@ define i64 @sexth_i64(i64 %a) nounwind {
ret i64 %shr
}
+define i32 @sextb_extract_i32(i32 %x) {
+; CHECK-LABEL: sextb_extract_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 35
+; CHECK-NEXT: srai a0, a0, 56
+; CHECK-NEXT: ret
+ %a = lshr i32 %x, 21
+ %b = trunc i32 %a to i8
+ %c = sext i8 %b to i32
+ ret i32 %c
+}
+
+define i64 @sextb_extract_i64(i64 %x) {
+; CHECK-LABEL: sextb_extract_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 44
+; CHECK-NEXT: srai a0, a0, 56
+; CHECK-NEXT: ret
+ %a = lshr i64 %x, 12
+ %b = trunc i64 %a to i8
+ %c = sext i8 %b to i64
+ ret i64 %c
+}
+
+define i32 @sexth_extract_i32(i32 %x) {
+; CHECK-LABEL: sexth_extract_i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 40
+; CHECK-NEXT: srai a0, a0, 48
+; CHECK-NEXT: ret
+ %a = lshr i32 %x, 8
+ %b = trunc i32 %a to i16
+ %c = sext i16 %b to i32
+ ret i32 %c
+}
+
+define i64 @sexth_extract_i64(i64 %x) {
+; CHECK-LABEL: sexth_extract_i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: srai a0, a0, 48
+; CHECK-NEXT: ret
+ %a = lshr i64 %x, 47
+ %b = trunc i64 %a to i16
+ %c = sext i16 %b to i64
+ ret i64 %c
+}
+
define signext i32 @min_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: min_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: blt a0, a1, .LBB36_2
+; RV64I-NEXT: blt a0, a1, .LBB40_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: .LBB36_2:
+; RV64I-NEXT: .LBB40_2:
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: min_i32:
@@ -1144,10 +1192,10 @@ define signext i32 @min_i32(i32 signext %a, i32 signext %b) nounwind {
define i64 @min_i64(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: min_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: blt a0, a1, .LBB37_2
+; RV64I-NEXT: blt a0, a1, .LBB41_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: .LBB37_2:
+; RV64I-NEXT: .LBB41_2:
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: min_i64:
@@ -1162,10 +1210,10 @@ define i64 @min_i64(i64 %a, i64 %b) nounwind {
define signext i32 @max_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: max_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: blt a1, a0, .LBB38_2
+; RV64I-NEXT: blt a1, a0, .LBB42_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: .LBB38_2:
+; RV64I-NEXT: .LBB42_2:
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: max_i32:
@@ -1180,10 +1228,10 @@ define signext i32 @max_i32(i32 signext %a, i32 signext %b) nounwind {
define i64 @max_i64(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: max_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: blt a1, a0, .LBB39_2
+; RV64I-NEXT: blt a1, a0, .LBB43_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: .LBB39_2:
+; RV64I-NEXT: .LBB43_2:
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: max_i64:
@@ -1198,10 +1246,10 @@ define i64 @max_i64(i64 %a, i64 %b) nounwind {
define signext i32 @minu_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: minu_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: bltu a0, a1, .LBB40_2
+; RV64I-NEXT: bltu a0, a1, .LBB44_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: .LBB40_2:
+; RV64I-NEXT: .LBB44_2:
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: minu_i32:
@@ -1216,10 +1264,10 @@ define signext i32 @minu_i32(i32 signext %a, i32 signext %b) nounwind {
define i64 @minu_i64(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: minu_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: bltu a0, a1, .LBB41_2
+; RV64I-NEXT: bltu a0, a1, .LBB45_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: .LBB41_2:
+; RV64I-NEXT: .LBB45_2:
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: minu_i64:
@@ -1234,10 +1282,10 @@ define i64 @minu_i64(i64 %a, i64 %b) nounwind {
define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind {
; RV64I-LABEL: maxu_i32:
; RV64I: # %bb.0:
-; RV64I-NEXT: bltu a1, a0, .LBB42_2
+; RV64I-NEXT: bltu a1, a0, .LBB46_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: .LBB42_2:
+; RV64I-NEXT: .LBB46_2:
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: maxu_i32:
@@ -1252,10 +1300,10 @@ define signext i32 @maxu_i32(i32 signext %a, i32 signext %b) nounwind {
define i64 @maxu_i64(i64 %a, i64 %b) nounwind {
; RV64I-LABEL: maxu_i64:
; RV64I: # %bb.0:
-; RV64I-NEXT: bltu a1, a0, .LBB43_2
+; RV64I-NEXT: bltu a1, a0, .LBB47_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a0, a1
-; RV64I-NEXT: .LBB43_2:
+; RV64I-NEXT: .LBB47_2:
; RV64I-NEXT: ret
;
; RV64ZBB-LABEL: maxu_i64:
@@ -1723,13 +1771,13 @@ define i64 @sub_if_uge_i64(i64 %x, i64 %y) {
define i128 @sub_if_uge_i128(i128 %x, i128 %y) {
; CHECK-LABEL: sub_if_uge_i128:
; CHECK: # %bb.0:
-; CHECK-NEXT: beq a1, a3, .LBB66_2
+; CHECK-NEXT: beq a1, a3, .LBB70_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: sltu a4, a1, a3
-; CHECK-NEXT: j .LBB66_3
-; CHECK-NEXT: .LBB66_2:
+; CHECK-NEXT: j .LBB70_3
+; CHECK-NEXT: .LBB70_2:
; CHECK-NEXT: sltu a4, a0, a2
-; CHECK-NEXT: .LBB66_3:
+; CHECK-NEXT: .LBB70_3:
; CHECK-NEXT: addi a4, a4, -1
; CHECK-NEXT: and a3, a4, a3
; CHECK-NEXT: and a2, a4, a2
@@ -1771,12 +1819,12 @@ define i32 @sub_if_uge_multiuse_cmp_i32(i32 %x, i32 %y) {
; RV64I-NEXT: addi a4, a4, -1
; RV64I-NEXT: and a1, a4, a1
; RV64I-NEXT: sub a0, a0, a1
-; RV64I-NEXT: bltu a3, a2, .LBB68_2
+; RV64I-NEXT: bltu a3, a2, .LBB72_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: li a1, 4
; RV64I-NEXT: sllw a0, a0, a1
; RV64I-NEXT: ret
-; RV64I-NEXT: .LBB68_2:
+; RV64I-NEXT: .LBB72_2:
; RV64I-NEXT: li a1, 2
; RV64I-NEXT: sllw a0, a0, a1
; RV64I-NEXT: ret
@@ -1787,12 +1835,12 @@ define i32 @sub_if_uge_multiuse_cmp_i32(i32 %x, i32 %y) {
; RV64ZBB-NEXT: sext.w a3, a0
; RV64ZBB-NEXT: subw a0, a0, a1
; RV64ZBB-NEXT: minu a0, a3, a0
-; RV64ZBB-NEXT: bltu a3, a2, .LBB68_2
+; RV64ZBB-NEXT: bltu a3, a2, .LBB72_2
; RV64ZBB-NEXT: # %bb.1:
; RV64ZBB-NEXT: li a1, 4
; RV64ZBB-NEXT: sllw a0, a0, a1
; RV64ZBB-NEXT: ret
-; RV64ZBB-NEXT: .LBB68_2:
+; RV64ZBB-NEXT: .LBB72_2:
; RV64ZBB-NEXT: li a1, 2
; RV64ZBB-NEXT: sllw a0, a0, a1
; RV64ZBB-NEXT: ret
@@ -1971,10 +2019,10 @@ define i32 @sub_if_uge_C_multiuse_sub_i32(i32 signext %x, ptr %z) {
; RV64I-NEXT: addw a2, a0, a2
; RV64I-NEXT: addi a3, a3, -16
; RV64I-NEXT: sw a2, 0(a1)
-; RV64I-NEXT: bltu a3, a0, .LBB75_2
+; RV64I-NEXT: bltu a3, a0, .LBB79_2
; RV64I-NEXT: # %bb.1:
; RV64I-NEXT: mv a2, a0
-; RV64I-NEXT: .LBB75_2:
+; RV64I-NEXT: .LBB79_2:
; RV64I-NEXT: mv a0, a2
; RV64I-NEXT: ret
;
diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
index 4d41258e6a053..e70eea7263325 100644
--- a/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
+++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv32.ll
@@ -1115,8 +1115,8 @@ define <2 x i16> @test_psra_hs_vec_shamt(<2 x i16> %a, <2 x i16> %b) {
; CHECK-RV64-NEXT: sext.h a2, a0
; CHECK-RV64-NEXT: sra a2, a2, a1
; CHECK-RV64-NEXT: srli a1, a1, 16
-; CHECK-RV64-NEXT: srli a0, a0, 16
-; CHECK-RV64-NEXT: sext.h a0, a0
+; CHECK-RV64-NEXT: slli a0, a0, 32
+; CHECK-RV64-NEXT: srai a0, a0, 48
; CHECK-RV64-NEXT: sra a0, a0, a1
; CHECK-RV64-NEXT: ppaire.h a0, a2, a0
; CHECK-RV64-NEXT: ret
@@ -1130,15 +1130,15 @@ define <4 x i8> @test_psra_bs_vec_shamt(<4 x i8> %a, <4 x i8> %b) {
; CHECK-RV32-NEXT: srli a2, a1, 24
; CHECK-RV32-NEXT: srai a3, a0, 24
; CHECK-RV32-NEXT: srli a4, a1, 8
-; CHECK-RV32-NEXT: srli a5, a0, 8
+; CHECK-RV32-NEXT: slli a5, a0, 16
; CHECK-RV32-NEXT: sra a3, a3, a2
-; CHECK-RV32-NEXT: sext.b a2, a5
-; CHECK-RV32-NEXT: sra a2, a2, a4
+; CHECK-RV32-NEXT: srai a5, a5, 24
+; CHECK-RV32-NEXT: sra a2, a5, a4
; CHECK-RV32-NEXT: sext.b a4, a0
; CHECK-RV32-NEXT: srli a5, a1, 16
-; CHECK-RV32-NEXT: srli a0, a0, 16
+; CHECK-RV32-NEXT: slli a0, a0, 8
; CHECK-RV32-NEXT: sra a4, a4, a1
-; CHECK-RV32-NEXT: sext.b a0, a0
+; CHECK-RV32-NEXT: srai a0, a0, 24
; CHECK-RV32-NEXT: sra a5, a0, a5
; CHECK-RV32-NEXT: ppaire.db a0, a4, a2
; CHECK-RV32-NEXT: pack a0, a0, a1
@@ -1147,18 +1147,18 @@ define <4 x i8> @test_psra_bs_vec_shamt(<4 x i8> %a, <4 x i8> %b) {
; CHECK-RV64-LABEL: test_psra_bs_vec_shamt:
; CHECK-RV64: # %bb.0:
; CHECK-RV64-NEXT: srli a2, a1, 24
-; CHECK-RV64-NEXT: srli a3, a0, 24
+; CHECK-RV64-NEXT: slli a3, a0, 32
; CHECK-RV64-NEXT: srli a4, a1, 16
-; CHECK-RV64-NEXT: sext.b a3, a3
+; CHECK-RV64-NEXT: srai a3, a3, 56
; CHECK-RV64-NEXT: sra a2, a3, a2
-; CHECK-RV64-NEXT: srli a3, a0, 16
-; CHECK-RV64-NEXT: sext.b a3, a3
+; CHECK-RV64-NEXT: slli a3, a0, 40
+; CHECK-RV64-NEXT: srai a3, a3, 56
; CHECK-RV64-NEXT: sra a3, a3, a4
; CHECK-RV64-NEXT: sext.b a4, a0
; CHECK-RV64-NEXT: sra a4, a4, a1
; CHECK-RV64-NEXT: srli a1, a1, 8
-; CHECK-RV64-NEXT: srli a0, a0, 8
-; CHECK-RV64-NEXT: sext.b a0, a0
+; CHECK-RV64-NEXT: slli a0, a0, 48
+; CHECK-RV64-NEXT: srai a0, a0, 56
; CHECK-RV64-NEXT: sra a0, a0, a1
; CHECK-RV64-NEXT: ppaire.b a1, a3, a2
; CHECK-RV64-NEXT: ppaire.b a0, a4, a0
@@ -1335,11 +1335,11 @@ define <2 x i16> @test_psdiv_h(<2 x i16> %a, <2 x i16> %b) {
; CHECK-RV64: # %bb.0:
; CHECK-RV64-NEXT: sext.h a2, a1
; CHECK-RV64-NEXT: sext.h a3, a0
-; CHECK-RV64-NEXT: srli a1, a1, 16
-; CHECK-RV64-NEXT: srli a0, a0, 16
+; CHECK-RV64-NEXT: slli a1, a1, 32
+; CHECK-RV64-NEXT: slli a0, a0, 32
; CHECK-RV64-NEXT: divw a2, a3, a2
-; CHECK-RV64-NEXT: sext.h a1, a1
-; CHECK-RV64-NEXT: sext.h a0, a0
+; CHECK-RV64-NEXT: srai a1, a1, 48
+; CHECK-RV64-NEXT: srai a0, a0, 48
; CHECK-RV64-NEXT: divw a0, a0, a1
; CHECK-RV64-NEXT: ppaire.h a0, a2, a0
; CHECK-RV64-NEXT: ret
@@ -1352,19 +1352,19 @@ define <4 x i8> @test_psdiv_b(<4 x i8> %a, <4 x i8> %b) {
; CHECK-RV32: # %bb.0:
; CHECK-RV32-NEXT: srai a2, a1, 24
; CHECK-RV32-NEXT: srai a3, a0, 24
-; CHECK-RV32-NEXT: srli a4, a1, 8
-; CHECK-RV32-NEXT: srli a5, a0, 8
+; CHECK-RV32-NEXT: slli a4, a1, 16
+; CHECK-RV32-NEXT: slli a5, a0, 16
; CHECK-RV32-NEXT: div a3, a3, a2
-; CHECK-RV32-NEXT: sext.b a2, a4
-; CHECK-RV32-NEXT: sext.b a4, a5
-; CHECK-RV32-NEXT: div a2, a4, a2
+; CHECK-RV32-NEXT: srai a4, a4, 24
+; CHECK-RV32-NEXT: srai a5, a5, 24
+; CHECK-RV32-NEXT: div a2, a5, a4
; CHECK-RV32-NEXT: sext.b a4, a1
; CHECK-RV32-NEXT: sext.b a5, a0
-; CHECK-RV32-NEXT: srli a1, a1, 16
-; CHECK-RV32-NEXT: srli a0, a0, 16
+; CHECK-RV32-NEXT: slli a1, a1, 8
+; CHECK-RV32-NEXT: slli a0, a0, 8
; CHECK-RV32-NEXT: div a4, a5, a4
-; CHECK-RV32-NEXT: sext.b a1, a1
-; CHECK-RV32-NEXT: sext.b a0, a0
+; CHECK-RV32-NEXT: srai a1, a1, 24
+; CHECK-RV32-NEXT: srai a0, a0, 24
; CHECK-RV32-NEXT: div a5, a0, a1
; CHECK-RV32-NEXT: ppaire.db a0, a4, a2
; CHECK-RV32-NEXT: pack a0, a0, a1
@@ -1372,23 +1372,23 @@ define <4 x i8> @test_psdiv_b(<4 x i8> %a, <4 x i8> %b) {
;
; CHECK-RV64-LABEL: test_psdiv_b:
; CHECK-RV64: # %bb.0:
-; CHECK-RV64-NEXT: srli a2, a1, 24
-; CHECK-RV64-NEXT: srli a3, a0, 24
+; CHECK-RV64-NEXT: slli a2, a1, 32
+; CHECK-RV64-NEXT: slli a3, a0, 32
; CHECK-RV64-NEXT: sext.b a4, a1
; CHECK-RV64-NEXT: sext.b a5, a0
; CHECK-RV64-NEXT: divw a4, a5, a4
-; CHECK-RV64-NEXT: srli a5, a1, 16
-; CHECK-RV64-NEXT: sext.b a2, a2
-; CHECK-RV64-NEXT: sext.b a3, a3
+; CHECK-RV64-NEXT: slli a5, a1, 40
+; CHECK-RV64-NEXT: srai a2, a2, 56
+; CHECK-RV64-NEXT: srai a3, a3, 56
; CHECK-RV64-NEXT: divw a2, a3, a2
-; CHECK-RV64-NEXT: srli a3, a0, 16
-; CHECK-RV64-NEXT: sext.b a5, a5
-; CHECK-RV64-NEXT: sext.b a3, a3
+; CHECK-RV64-NEXT: slli a3, a0, 40
+; CHECK-RV64-NEXT: srai a5, a5, 56
+; CHECK-RV64-NEXT: srai a3, a3, 56
; CHECK-RV64-NEXT: divw a3, a3, a5
-; CHECK-RV64-NEXT: srli a1, a1, 8
-; CHECK-RV64-NEXT: srli a0, a0, 8
-; CHECK-RV64-NEXT: sext.b a1, a1
-; CHECK-RV64-NEXT: sext.b a0, a0
+; CHECK-RV64-NEXT: slli a1, a1, 48
+; CHECK-RV64-NEXT: slli a0, a0, 48
+; CHECK-RV64-NEXT: srai a1, a1, 56
+; CHECK-RV64-NEXT: srai a0, a0, 56
; CHECK-RV64-NEXT: divw a0, a0, a1
; CHECK-RV64-NEXT: ppaire.b a1, a3, a2
; CHECK-RV64-NEXT: ppaire.b a0, a4, a0
@@ -1489,11 +1489,11 @@ define <2 x i16> @test_psrem_h(<2 x i16> %a, <2 x i16> %b) {
; CHECK-RV64: # %bb.0:
; CHECK-RV64-NEXT: sext.h a2, a1
; CHECK-RV64-NEXT: sext.h a3, a0
-; CHECK-RV64-NEXT: srli a1, a1, 16
-; CHECK-RV64-NEXT: srli a0, a0, 16
+; CHECK-RV64-NEXT: slli a1, a1, 32
+; CHECK-RV64-NEXT: slli a0, a0, 32
; CHECK-RV64-NEXT: remw a2, a3, a2
-; CHECK-RV64-NEXT: sext.h a1, a1
-; CHECK-RV64-NEXT: sext.h a0, a0
+; CHECK-RV64-NEXT: srai a1, a1, 48
+; CHECK-RV64-NEXT: srai a0, a0, 48
; CHECK-RV64-NEXT: remw a0, a0, a1
; CHECK-RV64-NEXT: ppaire.h a0, a2, a0
; CHECK-RV64-NEXT: ret
@@ -1506,19 +1506,19 @@ define <4 x i8> @test_psrem_b(<4 x i8> %a, <4 x i8> %b) {
; CHECK-RV32: # %bb.0:
; CHECK-RV32-NEXT: srai a2, a1, 24
; CHECK-RV32-NEXT: srai a3, a0, 24
-; CHECK-RV32-NEXT: srli a4, a1, 8
-; CHECK-RV32-NEXT: srli a5, a0, 8
+; CHECK-RV32-NEXT: slli a4, a1, 16
+; CHECK-RV32-NEXT: slli a5, a0, 16
; CHECK-RV32-NEXT: rem a3, a3, a2
-; CHECK-RV32-NEXT: sext.b a2, a4
-; CHECK-RV32-NEXT: sext.b a4, a5
-; CHECK-RV32-NEXT: rem a2, a4, a2
+; CHECK-RV32-NEXT: srai a4, a4, 24
+; CHECK-RV32-NEXT: srai a5, a5, 24
+; CHECK-RV32-NEXT: rem a2, a5, a4
; CHECK-RV32-NEXT: sext.b a4, a1
; CHECK-RV32-NEXT: sext.b a5, a0
-; CHECK-RV32-NEXT: srli a1, a1, 16
-; CHECK-RV32-NEXT: srli a0, a0, 16
+; CHECK-RV32-NEXT: slli a1, a1, 8
+; CHECK-RV32-NEXT: slli a0, a0, 8
; CHECK-RV32-NEXT: rem a4, a5, a4
-; CHECK-RV32-NEXT: sext.b a1, a1
-; CHECK-RV32-NEXT: sext.b a0, a0
+; CHECK-RV32-NEXT: srai a1, a1, 24
+; CHECK-RV32-NEXT: srai a0, a0, 24
; CHECK-RV32-NEXT: rem a5, a0, a1
; CHECK-RV32-NEXT: ppaire.db a0, a4, a2
; CHECK-RV32-NEXT: pack a0, a0, a1
@@ -1526,23 +1526,23 @@ define <4 x i8> @test_psrem_b(<4 x i8> %a, <4 x i8> %b) {
;
; CHECK-RV64-LABEL: test_psrem_b:
; CHECK-RV64: # %bb.0:
-; CHECK-RV64-NEXT: srli a2, a1, 24
-; CHECK-RV64-NEXT: srli a3, a0, 24
+; CHECK-RV64-NEXT: slli a2, a1, 32
+; CHECK-RV64-NEXT: slli a3, a0, 32
; CHECK-RV64-NEXT: sext.b a4, a1
; CHECK-RV64-NEXT: sext.b a5, a0
; CHECK-RV64-NEXT: remw a4, a5, a4
-; CHECK-RV64-NEXT: srli a5, a1, 16
-; CHECK-RV64-NEXT: sext.b a2, a2
-; CHECK-RV64-NEXT: sext.b a3, a3
+; CHECK-RV64-NEXT: slli a5, a1, 40
+; CHECK-RV64-NEXT: srai a2, a2, 56
+; CHECK-RV64-NEXT: srai a3, a3, 56
; CHECK-RV64-NEXT: remw a2, a3, a2
-; CHECK-RV64-NEXT: srli a3, a0, 16
-; CHECK-RV64-NEXT: sext.b a5, a5
-; CHECK-RV64-NEXT: sext.b a3, a3
+; CHECK-RV64-NEXT: slli a3, a0, 40
+; CHECK-RV64-NEXT: srai a5, a5, 56
+; CHECK-RV64-NEXT: srai a3, a3, 56
; CHECK-RV64-NEXT: remw a3, a3, a5
-; CHECK-RV64-NEXT: srli a1, a1, 8
-; CHECK-RV64-NEXT: srli a0, a0, 8
-; CHECK-RV64-NEXT: sext.b a1, a1
-; CHECK-RV64-NEXT: sext.b a0, a0
+; CHECK-RV64-NEXT: slli a1, a1, 48
+; CHECK-RV64-NEXT: slli a0, a0, 48
+; CHECK-RV64-NEXT: srai a1, a1, 56
+; CHECK-RV64-NEXT: srai a0, a0, 56
; CHECK-RV64-NEXT: remw a0, a0, a1
; CHECK-RV64-NEXT: ppaire.b a1, a3, a2
; CHECK-RV64-NEXT: ppaire.b a0, a4, a0
diff --git a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll
index ea28d2d5fe7b4..2d1c6d737a640 100644
--- a/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll
+++ b/llvm/test/CodeGen/RISCV/rvp-ext-rv64.ll
@@ -1656,19 +1656,19 @@ define <4 x i16> @test_psdiv_h(<4 x i16> %a, <4 x i16> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: srai a2, a1, 48
; CHECK-NEXT: srai a3, a0, 48
-; CHECK-NEXT: srli a4, a1, 32
+; CHECK-NEXT: slli a4, a1, 16
; CHECK-NEXT: sext.h a5, a1
; CHECK-NEXT: divw a2, a3, a2
; CHECK-NEXT: sext.h a3, a0
; CHECK-NEXT: divw a3, a3, a5
-; CHECK-NEXT: srli a5, a0, 32
-; CHECK-NEXT: sext.h a4, a4
-; CHECK-NEXT: sext.h a5, a5
+; CHECK-NEXT: slli a5, a0, 16
+; CHECK-NEXT: srai a4, a4, 48
+; CHECK-NEXT: srai a5, a5, 48
; CHECK-NEXT: divw a4, a5, a4
-; CHECK-NEXT: srli a1, a1, 16
-; CHECK-NEXT: srli a0, a0, 16
-; CHECK-NEXT: sext.h a1, a1
-; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: slli a1, a1, 32
+; CHECK-NEXT: slli a0, a0, 32
+; CHECK-NEXT: srai a1, a1, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: divw a0, a0, a1
; CHECK-NEXT: ppaire.h a1, a4, a2
; CHECK-NEXT: ppaire.h a0, a3, a0
@@ -1683,39 +1683,39 @@ define <8 x i8> @test_psdiv_b(<8 x i8> %a, <8 x i8> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: srai a2, a1, 56
; CHECK-NEXT: srai a3, a0, 56
-; CHECK-NEXT: srli a4, a1, 48
-; CHECK-NEXT: srli a5, a0, 48
-; CHECK-NEXT: srli a6, a1, 40
-; CHECK-NEXT: srli a7, a0, 40
-; CHECK-NEXT: srli t0, a1, 32
+; CHECK-NEXT: slli a4, a1, 8
+; CHECK-NEXT: slli a5, a0, 8
+; CHECK-NEXT: slli a6, a1, 16
+; CHECK-NEXT: slli a7, a0, 16
+; CHECK-NEXT: slli t0, a1, 24
; CHECK-NEXT: sext.b t1, a1
; CHECK-NEXT: divw a2, a3, a2
; CHECK-NEXT: sext.b a3, a0
; CHECK-NEXT: divw a3, a3, t1
-; CHECK-NEXT: srli t1, a0, 32
-; CHECK-NEXT: sext.b a4, a4
-; CHECK-NEXT: sext.b a5, a5
+; CHECK-NEXT: slli t1, a0, 24
+; CHECK-NEXT: srai a4, a4, 56
+; CHECK-NEXT: srai a5, a5, 56
; CHECK-NEXT: divw a4, a5, a4
-; CHECK-NEXT: srli a5, a1, 24
-; CHECK-NEXT: sext.b a6, a6
-; CHECK-NEXT: sext.b a7, a7
+; CHECK-NEXT: slli a5, a1, 32
+; CHECK-NEXT: srai a6, a6, 56
+; CHECK-NEXT: srai a7, a7, 56
; CHECK-NEXT: divw a6, a7, a6
-; CHECK-NEXT: srli a7, a0, 24
-; CHECK-NEXT: sext.b t0, t0
-; CHECK-NEXT: sext.b t1, t1
+; CHECK-NEXT: slli a7, a0, 32
+; CHECK-NEXT: srai t0, t0, 56
+; CHECK-NEXT: srai t1, t1, 56
; CHECK-NEXT: divw t0, t1, t0
-; CHECK-NEXT: srli t1, a1, 16
-; CHECK-NEXT: sext.b a5, a5
-; CHECK-NEXT: sext.b a7, a7
+; CHECK-NEXT: slli t1, a1, 40
+; CHECK-NEXT: srai a5, a5, 56
+; CHECK-NEXT: srai a7, a7, 56
; CHECK-NEXT: divw a5, a7, a5
-; CHECK-NEXT: srli a7, a0, 16
-; CHECK-NEXT: sext.b t1, t1
-; CHECK-NEXT: sext.b a7, a7
+; CHECK-NEXT: slli a7, a0, 40
+; CHECK-NEXT: srai t1, t1, 56
+; CHECK-NEXT: srai a7, a7, 56
; CHECK-NEXT: divw a7, a7, t1
-; CHECK-NEXT: srli a1, a1, 8
-; CHECK-NEXT: srli a0, a0, 8
-; CHECK-NEXT: sext.b a1, a1
-; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: slli a1, a1, 48
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a1, a1, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: divw a0, a0, a1
; CHECK-NEXT: ppaire.b a1, a4, a2
; CHECK-NEXT: ppaire.b a2, t0, a6
@@ -1834,19 +1834,19 @@ define <4 x i16> @test_psrem_h(<4 x i16> %a, <4 x i16> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: srai a2, a1, 48
; CHECK-NEXT: srai a3, a0, 48
-; CHECK-NEXT: srli a4, a1, 32
+; CHECK-NEXT: slli a4, a1, 16
; CHECK-NEXT: sext.h a5, a1
; CHECK-NEXT: remw a2, a3, a2
; CHECK-NEXT: sext.h a3, a0
; CHECK-NEXT: remw a3, a3, a5
-; CHECK-NEXT: srli a5, a0, 32
-; CHECK-NEXT: sext.h a4, a4
-; CHECK-NEXT: sext.h a5, a5
+; CHECK-NEXT: slli a5, a0, 16
+; CHECK-NEXT: srai a4, a4, 48
+; CHECK-NEXT: srai a5, a5, 48
; CHECK-NEXT: remw a4, a5, a4
-; CHECK-NEXT: srli a1, a1, 16
-; CHECK-NEXT: srli a0, a0, 16
-; CHECK-NEXT: sext.h a1, a1
-; CHECK-NEXT: sext.h a0, a0
+; CHECK-NEXT: slli a1, a1, 32
+; CHECK-NEXT: slli a0, a0, 32
+; CHECK-NEXT: srai a1, a1, 48
+; CHECK-NEXT: srai a0, a0, 48
; CHECK-NEXT: remw a0, a0, a1
; CHECK-NEXT: ppaire.h a1, a4, a2
; CHECK-NEXT: ppaire.h a0, a3, a0
@@ -1861,39 +1861,39 @@ define <8 x i8> @test_psrem_b(<8 x i8> %a, <8 x i8> %b) {
; CHECK: # %bb.0:
; CHECK-NEXT: srai a2, a1, 56
; CHECK-NEXT: srai a3, a0, 56
-; CHECK-NEXT: srli a4, a1, 48
-; CHECK-NEXT: srli a5, a0, 48
-; CHECK-NEXT: srli a6, a1, 40
-; CHECK-NEXT: srli a7, a0, 40
-; CHECK-NEXT: srli t0, a1, 32
+; CHECK-NEXT: slli a4, a1, 8
+; CHECK-NEXT: slli a5, a0, 8
+; CHECK-NEXT: slli a6, a1, 16
+; CHECK-NEXT: slli a7, a0, 16
+; CHECK-NEXT: slli t0, a1, 24
; CHECK-NEXT: sext.b t1, a1
; CHECK-NEXT: remw a2, a3, a2
; CHECK-NEXT: sext.b a3, a0
; CHECK-NEXT: remw a3, a3, t1
-; CHECK-NEXT: srli t1, a0, 32
-; CHECK-NEXT: sext.b a4, a4
-; CHECK-NEXT: sext.b a5, a5
+; CHECK-NEXT: slli t1, a0, 24
+; CHECK-NEXT: srai a4, a4, 56
+; CHECK-NEXT: srai a5, a5, 56
; CHECK-NEXT: remw a4, a5, a4
-; CHECK-NEXT: srli a5, a1, 24
-; CHECK-NEXT: sext.b a6, a6
-; CHECK-NEXT: sext.b a7, a7
+; CHECK-NEXT: slli a5, a1, 32
+; CHECK-NEXT: srai a6, a6, 56
+; CHECK-NEXT: srai a7, a7, 56
; CHECK-NEXT: remw a6, a7, a6
-; CHECK-NEXT: srli a7, a0, 24
-; CHECK-NEXT: sext.b t0, t0
-; CHECK-NEXT: sext.b t1, t1
+; CHECK-NEXT: slli a7, a0, 32
+; CHECK-NEXT: srai t0, t0, 56
+; CHECK-NEXT: srai t1, t1, 56
; CHECK-NEXT: remw t0, t1, t0
-; CHECK-NEXT: srli t1, a1, 16
-; CHECK-NEXT: sext.b a5, a5
-; CHECK-NEXT: sext.b a7, a7
+; CHECK-NEXT: slli t1, a1, 40
+; CHECK-NEXT: srai a5, a5, 56
+; CHECK-NEXT: srai a7, a7, 56
; CHECK-NEXT: remw a5, a7, a5
-; CHECK-NEXT: srli a7, a0, 16
-; CHECK-NEXT: sext.b t1, t1
-; CHECK-NEXT: sext.b a7, a7
+; CHECK-NEXT: slli a7, a0, 40
+; CHECK-NEXT: srai t1, t1, 56
+; CHECK-NEXT: srai a7, a7, 56
; CHECK-NEXT: remw a7, a7, t1
-; CHECK-NEXT: srli a1, a1, 8
-; CHECK-NEXT: srli a0, a0, 8
-; CHECK-NEXT: sext.b a1, a1
-; CHECK-NEXT: sext.b a0, a0
+; CHECK-NEXT: slli a1, a1, 48
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srai a1, a1, 56
+; CHECK-NEXT: srai a0, a0, 56
; CHECK-NEXT: remw a0, a0, a1
; CHECK-NEXT: ppaire.b a1, a4, a2
; CHECK-NEXT: ppaire.b a2, t0, a6
diff --git a/llvm/test/CodeGen/RISCV/rvv/cttz-elts.ll b/llvm/test/CodeGen/RISCV/rvv/cttz-elts.ll
index 28ab573f59ac0..cdaed030e274c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/cttz-elts.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/cttz-elts.ll
@@ -171,6 +171,23 @@ define i64 @i64_ctz_nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
ret i64 %res
}
+define i64 @i64_ctz_nxv16i1_range(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) vscale_range(2, 1024) {
+; RV32-LABEL: i64_ctz_nxv16i1_range:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; RV32-NEXT: vfirst.m a0, v8
+; RV32-NEXT: li a1, 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: i64_ctz_nxv16i1_range:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; RV64-NEXT: vfirst.m a0, v8
+; RV64-NEXT: ret
+ %res = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> %a, i1 1)
+ ret i64 %res
+}
+
define i32 @ctz_nxv16i1_poison(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a) {
; RV32-LABEL: ctz_nxv16i1_poison:
; RV32: # %bb.0:
@@ -192,20 +209,20 @@ define i32 @ctz_v16i1(<16 x i1> %pg, <16 x i1> %a) {
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV32-NEXT: vfirst.m a0, v8
-; RV32-NEXT: bgez a0, .LBB5_2
+; RV32-NEXT: bgez a0, .LBB6_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a0, 16
-; RV32-NEXT: .LBB5_2:
+; RV32-NEXT: .LBB6_2:
; RV32-NEXT: ret
;
; RV64-LABEL: ctz_v16i1:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
; RV64-NEXT: vfirst.m a0, v8
-; RV64-NEXT: bgez a0, .LBB5_2
+; RV64-NEXT: bgez a0, .LBB6_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a0, 16
-; RV64-NEXT: .LBB5_2:
+; RV64-NEXT: .LBB6_2:
; RV64-NEXT: ret
%res = call i32 @llvm.experimental.cttz.elts.i32.v16i1(<16 x i1> %a, i1 0)
ret i32 %res
@@ -232,20 +249,20 @@ define i16 @ctz_v8i1_i16_ret(<8 x i1> %a) {
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; RV32-NEXT: vfirst.m a0, v0
-; RV32-NEXT: bgez a0, .LBB7_2
+; RV32-NEXT: bgez a0, .LBB8_2
; RV32-NEXT: # %bb.1:
; RV32-NEXT: li a0, 8
-; RV32-NEXT: .LBB7_2:
+; RV32-NEXT: .LBB8_2:
; RV32-NEXT: ret
;
; RV64-LABEL: ctz_v8i1_i16_ret:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
; RV64-NEXT: vfirst.m a0, v0
-; RV64-NEXT: bgez a0, .LBB7_2
+; RV64-NEXT: bgez a0, .LBB8_2
; RV64-NEXT: # %bb.1:
; RV64-NEXT: li a0, 8
-; RV64-NEXT: .LBB7_2:
+; RV64-NEXT: .LBB8_2:
; RV64-NEXT: ret
%res = call i16 @llvm.experimental.cttz.elts.i16.v8i1(<8 x i1> %a, i1 0)
ret i16 %res
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/SPV_ALTERA_arbitrary_precision_integers.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/SPV_ALTERA_arbitrary_precision_integers.ll
index 9ea8a5709154c..a0451278f82cd 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/SPV_ALTERA_arbitrary_precision_integers.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/SPV_ALTERA_arbitrary_precision_integers.ll
@@ -8,6 +8,14 @@ define i13 @getConstantI13() {
ret i13 42
}
+define i96 @getConstantI96() {
+ ret i96 18446744073709551620
+}
+
+define i160 @getConstantI160() {
+ ret i160 3363637389930338837376336738763689377839373638
+}
+
;; Capabilities:
; CHECK-DAG: OpExtension "SPV_ALTERA_arbitrary_precision_integers"
; CHECK-DAG: OpCapability ArbitraryPrecisionIntegersALTERA
@@ -17,14 +25,20 @@ define i13 @getConstantI13() {
;; Names:
; CHECK-DAG: OpName %[[#GET_I6:]] "getConstantI6"
; CHECK-DAG: OpName %[[#GET_I13:]] "getConstantI13"
+; CHECK-DAG: OpName %[[#GET_I96:]] "getConstantI96"
+; CHECK-DAG: OpName %[[#GET_I160:]] "getConstantI160"
; CHECK-NOT: DAG-FENCE
;; Types and Constants:
; CHECK-DAG: %[[#I6:]] = OpTypeInt 6 0
; CHECK-DAG: %[[#I13:]] = OpTypeInt 13 0
+; CHECK-DAG: %[[#I96:]] = OpTypeInt 96 0
+; CHECK-DAG: %[[#I160:]] = OpTypeInt 160 0
; CHECK-DAG: %[[#CST_I6:]] = OpConstant %[[#I6]] 2
; CHECK-DAG: %[[#CST_I13:]] = OpConstant %[[#I13]] 42
+; CHECK-DAG: %[[#CST_I96:]] = OpConstant %[[#I96]] 18446744073709551620
+; CHECK-DAG: %[[#CST_I160:]] = OpConstant %[[#I160]] 3363637389930338837376336738763689377839373638
; CHECK: %[[#GET_I6]] = OpFunction %[[#I6]]
; CHECK: OpReturnValue %[[#CST_I6]]
@@ -33,3 +47,11 @@ define i13 @getConstantI13() {
; CHECK: %[[#GET_I13]] = OpFunction %[[#I13]]
; CHECK: OpReturnValue %[[#CST_I13]]
; CHECK: OpFunctionEnd
+
+; CHECK: %[[#GET_I96]] = OpFunction %[[#I96]]
+; CHECK: OpReturnValue %[[#CST_I96]]
+; CHECK: OpFunctionEnd
+
+; CHECK: %[[#GET_I160]] = OpFunction %[[#I160]]
+; CHECK: OpReturnValue %[[#CST_I160]]
+; CHECK: OpFunctionEnd
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/apint-constant.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/apint-constant.ll
index 6ed6228653ff5..342ae8d39e3ed 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/apint-constant.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/apint-constant.ll
@@ -6,15 +6,15 @@
; CHECK-DAG: %[[#INT128:]] = OpTypeInt 128 0
; CHECK-DAG: %[[#INT96:]] = OpTypeInt 96 0
; CHECK-DAG: %[[#INT97:]] = OpTypeInt 97 0
-; CHECK-DAG: %[[#NEG128:]] = OpConstant %[[#INT128]] 4294965247 4294967295 4294967295 4294967295
-; CHECK-DAG: %[[#ONE128:]] = OpConstant %[[#INT128]] 1 0 0 0
-; CHECK-DAG: %[[#BOUNDARY:]] = OpConstant %[[#INT128]] 4294967295 4294967295 0 0
+; CHECK-DAG: %[[#BOUNDARY:]] = OpConstant %[[#INT128]] 18446744073709551615
+; CHECK-DAG: %[[#ONE128:]] = OpConstant %[[#INT128]] 1
+; CHECK-DAG: %[[#NEG128:]] = OpConstant %[[#INT128]] 340282366920938463463374607431768209407
; CHECK-DAG: %[[#ZERO128:]] = OpConstantNull %[[#INT128]]
-; CHECK-DAG: %[[#NEG96:]] = OpConstant %[[#INT96]] 4294967295 4294967295 4294967295
-; CHECK-DAG: %[[#OVER64:]] = OpConstant %[[#INT96]] 1 0 1
-; CHECK-DAG: %[[#NEG97:]] = OpConstant %[[#INT97]] 4294967295 4294967295 4294967295 1
-; CHECK-DAG: %[[#OVER64_I97:]] = OpConstant %[[#INT97]] 1 0 1 0
-; CHECK-DAG: %[[#I97_MAX:]] = OpConstant %[[#INT97]] 0 0 0 1
+; CHECK-DAG: %[[#OVER64:]] = OpConstant %[[#INT96]] 18446744073709551617
+; CHECK-DAG: %[[#NEG96:]] = OpConstant %[[#INT96]] 79228162514264337593543950335
+; CHECK-DAG: %[[#I97_MAX:]] = OpConstant %[[#INT97]] 79228162514264337593543950336
+; CHECK-DAG: %[[#OVER64_I97:]] = OpConstant %[[#INT97]] 18446744073709551617
+; CHECK-DAG: %[[#NEG97:]] = OpConstant %[[#INT97]] 158456325028528675187087900671
; CHECK: OpStore %[[#]] %[[#NEG128]] Aligned 16
; CHECK: OpStore %[[#]] %[[#ONE128]] Aligned 16
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-addsub.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-addsub.ll
index c90ffdd17996c..f733c8f9ce09b 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-addsub.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-addsub.ll
@@ -10,7 +10,7 @@
; CHECK: OpName %[[#TestAdd:]] "test_add"
; CHECK: OpName %[[#TestSub:]] "test_sub"
; CHECK: %[[#Int128Ty:]] = OpTypeInt 128 0
-; CHECK: %[[#Const64Int128:]] = OpConstant %[[#Int128Ty]] 64 0 0 0
+; CHECK: %[[#Const64Int128:]] = OpConstant %[[#Int128Ty]] 64
; CHECK: %[[#TestAdd]] = OpFunction
define spir_func void @test_add(i64 %AL, i64 %AH, i64 %BL, i64 %BH, ptr %RL, ptr %RH) {
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/masked-gather-no-extension.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/masked-gather-no-extension.ll
index f3e940f1a5ff2..2270d384834fb 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/masked-gather-no-extension.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/masked-gather-no-extension.ll
@@ -1,3 +1,7 @@
+; Test that llvm.masked.gather produces an error when the
+; SPV_INTEL_masked_gather_scatter extension is not enabled, since vectors of
+; pointers are not supported in SPIR-V without this extension.
+
; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s
declare <4 x i32> @llvm.masked.gather.v4i32.v4p1(<4 x ptr addrspace(1)>, i32, <4 x i1>, <4 x i32>)
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/masked-gather-scatter.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/masked-gather-scatter.ll
index add08059d0255..91c8713645dbf 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/masked-gather-scatter.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/masked-gather-scatter.ll
@@ -1,3 +1,7 @@
+; Test that llvm.masked.gather and llvm.masked.scatter intrinsics are correctly
+; lowered to OpMaskedGatherINTEL and OpMaskedScatterINTEL SPIR-V instructions
+; when the SPV_INTEL_masked_gather_scatter extension is enabled.
+
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_masked_gather_scatter %s -o - | FileCheck %s
; TODO: spirv-val does not support vector operands in OpConvertPtrToU and OpConvertUToPtr with SPV_INTEL_masked_gather_scatter
; RUNx: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_masked_gather_scatter %s -o - -filetype=obj | spirv-val %}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/masked-scatter-no-extension.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/masked-scatter-no-extension.ll
index 4cbef7f905047..b76b41dd5b62b 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/masked-scatter-no-extension.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/masked-scatter-no-extension.ll
@@ -1,3 +1,7 @@
+; Test that llvm.masked.scatter produces an error when the
+; SPV_INTEL_masked_gather_scatter extension is not enabled, since vectors of
+; pointers are not supported in SPIR-V without this extension.
+
; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s
declare void @llvm.masked.scatter.v4i32.v4p1(<4 x i32>, <4 x ptr addrspace(1)>, i32, <4 x i1>)
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/vector-of-pointers-no-extension.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/vector-of-pointers-no-extension.ll
index d892b3487b725..73b801688857a 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/vector-of-pointers-no-extension.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/vector-of-pointers-no-extension.ll
@@ -1,3 +1,6 @@
+; Test that ptrtoint on a vector of pointers without the
+; SPV_INTEL_masked_gather_scatter extension produces an error.
+
; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s
; CHECK: error:{{.*}}Vector of pointers requires SPV_INTEL_masked_gather_scatter extension
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/vector-of-pointers-ptrtoint.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/vector-of-pointers-ptrtoint.ll
index 74988e07b537b..5a5d0b4057ca3 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/vector-of-pointers-ptrtoint.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_masked_gather_scatter/vector-of-pointers-ptrtoint.ll
@@ -1,3 +1,7 @@
+; Test that ptrtoint and inttoptr on vectors of pointers are correctly lowered
+; to OpConvertPtrToU and OpConvertUToPtr when the
+; SPV_INTEL_masked_gather_scatter extension is enabled.
+
; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_masked_gather_scatter %s -o - | FileCheck %s
; TODO: spirv-val does not support vector operands in OpConvertPtrToU and OpConvertUToPtr with SPV_INTEL_masked_gather_scatter
; RUNx: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_INTEL_masked_gather_scatter %s -o - -filetype=obj | spirv-val %}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/unbounded-arr.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/unbounded-arr.ll
new file mode 100644
index 0000000000000..2b0394c356aa3
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/unbounded-arr.ll
@@ -0,0 +1,29 @@
+; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv1.6-vulkan1.3-library %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv1.6-vulkan1.3-library %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: OpCapability RuntimeDescriptorArrayEXT
+; CHECK-DAG: %[[int32:[0-9]+]] = OpTypeInt 32 0
+; CHECK-DAG: %[[rwbuffer:[0-9]+]] = OpTypeImage %[[int32]] Buffer 2 0 0 2 R32i
+; CHECK-DAG: OpTypeRuntimeArray %[[rwbuffer]]
+
+; This IR was emitted from the following HLSL code:
+; [[vk::binding(0)]]
+; RWBuffer<int> Buf[] : register(u0);
+;
+; [numthreads(4,2,1)]
+; void main(uint GI : SV_GroupIndex) {
+; Buf[0][0] = 0;
+; }
+
+ at Buf.str = private unnamed_addr constant [4 x i8] c"Buf\00", align 1
+
+; Function Attrs: convergent noinline norecurse
+define void @main() #0 {
+entry:
+ %binding = call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 24) @llvm.spv.resource.handlefrombinding(i32 0, i32 0, i32 0, i32 0, ptr @Buf.str)
+ %pointer = call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 24) %binding, i32 0)
+ store i32 0, ptr addrspace(11) %pointer, align 4
+ ret void
+}
+
+attributes #0 = { convergent noinline norecurse "hlsl.numthreads"="4,2,1" "hlsl.shader"="compute" }
diff --git a/llvm/test/CodeGen/SPIRV/instructions/icmp.ll b/llvm/test/CodeGen/SPIRV/instructions/icmp.ll
index 0d8778af9e5b0..e1f2e246dabe8 100644
--- a/llvm/test/CodeGen/SPIRV/instructions/icmp.ll
+++ b/llvm/test/CodeGen/SPIRV/instructions/icmp.ll
@@ -22,7 +22,13 @@
; CHECK-DAG: OpName [[v3SGT:%.*]] "test_v3_sgt"
; CHECK-DAG: OpName [[v3UGE:%.*]] "test_v3_uge"
; CHECK-DAG: OpName [[v3SGE:%.*]] "test_v3_sge"
-; CHECK-DAG: OpName [[v16NE:%.*]] "test_v16_ne"
+
+; CHECK-DAG: OpName [[v16NE:%.*]] "test_boolean_v16_ne"
+; CHECK-DAG: OpName [[v16EQ:%.*]] "test_boolean_v16_eq"
+
+; CHECK-DAG: [[Bool:%.*]] = OpTypeBool
+; CHECK-DAG: [[v16:%.*]] = OpTypeVector [[Bool]] 16
+; CHECK-DAG: [[Null:%.*]] = OpConstantNull [[v16]]
; CHECK: [[EQ]] = OpFunction
; CHECK-NEXT: [[A:%.*]] = OpFunctionParameter
@@ -266,11 +272,20 @@ define <3 x i1> @test_v3_sge(<3 x i32> %a, <3 x i32> %b) {
; CHECK: [[v16NE]] = OpFunction
; CHECK-NEXT: OpLabel
-; CHECK-NEXT: [[R:%.*]] = OpLogicalNotEqual {{%.+}} {{%.*}} {{%.*}}
+; CHECK-NEXT: [[R:%.*]] = OpLogicalNotEqual {{%.+}} [[Null]] [[Null]]
; CHECK-NEXT: OpReturnValue [[R]]
; CHECK-NEXT: OpFunctionEnd
-
-define spir_func <16 x i1> @test_v16_ne() {
+define spir_func <16 x i1> @test_boolean_v16_ne() {
%A = icmp ne <16 x i1> zeroinitializer, zeroinitializer
ret <16 x i1> %A
}
+
+; CHECK: [[v16EQ]] = OpFunction
+; CHECK-NEXT: OpLabel
+; CHECK-NEXT: [[R:%.*]] = OpLogicalEqual {{%.+}} [[Null]] [[Null]]
+; CHECK-NEXT: OpReturnValue [[R]]
+; CHECK-NEXT: OpFunctionEnd
+define spir_func <16 x i1> @test_boolean_v16_eq() {
+ %A = icmp eq <16 x i1> zeroinitializer, zeroinitializer
+ ret <16 x i1> %A
+}
diff --git a/llvm/test/CodeGen/SPIRV/llc-pipeline.ll b/llvm/test/CodeGen/SPIRV/llc-pipeline.ll
index eb1128ac5417a..cad07e2557614 100644
--- a/llvm/test/CodeGen/SPIRV/llc-pipeline.ll
+++ b/llvm/test/CodeGen/SPIRV/llc-pipeline.ll
@@ -145,6 +145,9 @@
; SPIRV-Opt-NEXT: FunctionPass Manager
; SPIRV-Opt-NEXT: Dominator Tree Construction
; SPIRV-Opt-NEXT: Natural Loop Information
+; SPIRV-Opt-NEXT: Post-Dominator Tree Construction
+; SPIRV-Opt-NEXT: Branch Probability Analysis
+; SPIRV-Opt-NEXT: Block Frequency Analysis
; SPIRV-Opt-NEXT: CodeGen Prepare
; SPIRV-Opt-NEXT: Lower invoke and unwind, for unwindless code generators
; SPIRV-Opt-NEXT: Remove unreachable blocks from the CFG
diff --git a/llvm/test/CodeGen/SPIRV/memory-model-md-glsl450.ll b/llvm/test/CodeGen/SPIRV/memory-model-md-glsl450.ll
new file mode 100644
index 0000000000000..649c179539cd9
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/memory-model-md-glsl450.ll
@@ -0,0 +1,16 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: llc -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpMemoryModel Logical GLSL450
+define void @main() {
+entry:
+ ret void
+}
+
+; AddressingModel=Logical (0), MemoryModel=GLSL450 (1)
+!spirv.MemoryModel = !{!0}
+!0 = !{i32 0, i32 1}
diff --git a/llvm/test/CodeGen/SPIRV/memory-model-md-opencl.ll b/llvm/test/CodeGen/SPIRV/memory-model-md-opencl.ll
new file mode 100644
index 0000000000000..3bf83a6248e44
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/memory-model-md-opencl.ll
@@ -0,0 +1,14 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpMemoryModel Logical OpenCL
+define void @main() {
+entry:
+ ret void
+}
+
+; AddressingModel=Logical (0), MemoryModel=OpenCL (2)
+!spirv.MemoryModel = !{!0}
+!0 = !{i32 0, i32 2}
diff --git a/llvm/test/CodeGen/SPIRV/memory-model-md-shader.ll b/llvm/test/CodeGen/SPIRV/memory-model-md-shader.ll
new file mode 100644
index 0000000000000..1ed3d28e38477
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/memory-model-md-shader.ll
@@ -0,0 +1,16 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: llc -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan-compute %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpMemoryModel Physical32 Simple
+define void @main() {
+entry:
+ ret void
+}
+
+; AddressingModel=Physical32 (1), MemoryModel=Simple (0)
+!spirv.MemoryModel = !{!0}
+!0 = !{i32 1, i32 0}
diff --git a/llvm/test/CodeGen/SPIRV/memory-model-md-unknown.ll b/llvm/test/CodeGen/SPIRV/memory-model-md-unknown.ll
new file mode 100644
index 0000000000000..5b0e9667aec89
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/memory-model-md-unknown.ll
@@ -0,0 +1,12 @@
+; RUN: not --crash llc -O0 -mtriple=spirv32-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s
+; RUN: not --crash llc -O0 -mtriple=spirv64-unknown-unknown %s -o /dev/null 2>&1 | FileCheck %s
+
+; CHECK: LLVM ERROR: Unknown memory model in spirv.MemoryModel metadata
+define void @main() {
+entry:
+ ret void
+}
+
+; AddressingModel=Logical (0), MemoryModel=Unknown (99)
+!spirv.MemoryModel = !{!0}
+!0 = !{i32 0, i32 99}
diff --git a/llvm/test/CodeGen/SPIRV/memory-model-md-vulkan.ll b/llvm/test/CodeGen/SPIRV/memory-model-md-vulkan.ll
new file mode 100644
index 0000000000000..cbee31481ea42
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/memory-model-md-vulkan.ll
@@ -0,0 +1,14 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpMemoryModel Logical Vulkan
+define void @main() {
+entry:
+ ret void
+}
+
+; AddressingModel=Logical (0), MemoryModel=VulkanKHR (3)
+!spirv.MemoryModel = !{!0}
+!0 = !{i32 0, i32 3}
diff --git a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-vector-index.ll b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-vector-index.ll
new file mode 100644
index 0000000000000..0f710f9827679
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-vector-index.ll
@@ -0,0 +1,23 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: %[[#INT32:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#PTR_INT32:]] = OpTypePointer CrossWorkgroup %[[#INT32]]
+; CHECK-DAG: %[[#INT8:]] = OpTypeInt 8 0
+; CHECK-DAG: %[[#PTR_INT8:]] = OpTypePointer CrossWorkgroup %[[#INT8]]
+; CHECK-DAG: %[[#INT64:]] = OpTypeInt 64 0
+; CHECK-DAG: %[[#CONST_0:]] = OpConstantNull %[[#INT64]]
+; CHECK-LABEL: Begin function test_vector_gep_with_load
+; CHECK: %[[#BC1:]] = OpBitcast %[[#PTR_INT8]] %[[#]]
+; CHECK: %[[#GEP:]] = OpPtrAccessChain %[[#PTR_INT8]] %[[#BC1]] %[[#CONST_0]]
+; CHECK: %[[#BC2:]] = OpBitcast %[[#PTR_INT32]] %[[#GEP]]
+; CHECK: %[[#VAL:]] = OpLoad %[[#INT32]] %[[#BC2]]
+; CHECK: OpStore %[[#]] %[[#VAL]]
+; CHECK: OpFunctionEnd
+define spir_kernel void @test_vector_gep_with_load(ptr addrspace(1) %p, ptr addrspace(1) %out) {
+ %gep = getelementptr i32, ptr addrspace(1) %p, <1 x i64> zeroinitializer
+ %elem = extractelement <1 x ptr addrspace(1)> %gep, i32 0
+ %val = load i32, ptr addrspace(1) %elem
+ store i32 %val, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/pointers/load-vector-from-array-of-vectors.ll b/llvm/test/CodeGen/SPIRV/pointers/load-vector-from-array-of-vectors.ll
new file mode 100644
index 0000000000000..5a9a1cb4a4fb7
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/load-vector-from-array-of-vectors.ll
@@ -0,0 +1,47 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-vulkan %s -stop-after=spirv-legalize-bitcast -o - | FileCheck %s --check-prefix=IRCHECK
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-vulkan %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-vulkan1.3-compute %s -o - -filetype=obj | spirv-val --target-env vulkan1.3 %}
+
+; CHECK-DAG: [[FLOAT:%[0-9]+]] = OpTypeFloat 32
+; CHECK-DAG: [[VEC2FLOAT:%[0-9]+]] = OpTypeVector [[FLOAT]] 2
+; CHECK: OpLoad [[VEC2FLOAT]]
+; CHECK: OpCompositeExtract [[FLOAT]]
+; CHECK: OpCompositeConstruct [[VEC2FLOAT]]
+
+; // M1[0][0]
+; IRCHECK: [[M1ROWZero:%[0-9]+]] = call ptr addrspace(10) (i1, ptr addrspace(10), ...) @llvm.spv.gep.p10.p10(i1 false, ptr addrspace(10) @M1, i32 0, i32 0)
+; IRCHECK: [[M1ROWZeroVec:%[0-9]+]] = load <2 x float>, ptr addrspace(10) [[M1ROWZero]], align 8
+; IRCHECK: [[M1Elem_00:%[0-9]+]] = call float @llvm.spv.extractelt.f32.v2f32.i32(<2 x float> [[M1ROWZeroVec]], i32 0)
+
+;// M1[0][1]
+; IRCHECK: [[M1ROWZero_2:%[0-9]+]] = call ptr addrspace(10) (i1, ptr addrspace(10), ...) @llvm.spv.gep.p10.p10(i1 false, ptr addrspace(10) @M1, i32 0, i32 0)
+; IRCHECK: [[M1ROWZeroVec_2:%[0-9]+]] = load <2 x float>, ptr addrspace(10) [[M1ROWZero_2]], align 8
+; IRCHECK: [[M1Elem_01:%[0-9]+]] = call float @llvm.spv.extractelt.f32.v2f32.i32(<2 x float> [[M1ROWZeroVec_2]], i32 1)
+
+; // M1[1][0]
+; IRCHECK: [[M1ROWOne:%[0-9]+]] = call ptr addrspace(10) (i1, ptr addrspace(10), ...) @llvm.spv.gep.p10.p10(i1 false, ptr addrspace(10) @M1, i32 0, i32 1)
+; IRCHECK: [[M1ROWOneVec:%[0-9]+]] = load <2 x float>, ptr addrspace(10) [[M1ROWOne]], align 8
+; IRCHECK: [[M1Elem_10:%[0-9]+]] = call float @llvm.spv.extractelt.f32.v2f32.i32(<2 x float> [[M1ROWOneVec]], i32 0)
+
+; // M1[1][1]
+; IRCHECK: [[M1ROWOne_2:%[0-9]+]] = call ptr addrspace(10) (i1, ptr addrspace(10), ...) @llvm.spv.gep.p10.p10(i1 false, ptr addrspace(10) @M1, i32 0, i32 1)
+; IRCHECK: [[M1ROWOneVec_2:%[0-9]+]] = load <2 x float>, ptr addrspace(10) [[M1ROWOne_2]], align 8
+; IRCHECK: [[M1Elem_11:%[0-9]+]] = call float @llvm.spv.extractelt.f32.v2f32.i32(<2 x float> [[M1ROWOneVec_2]], i32 1)
+
+; // M1[2][0]
+; IRCHECK: [[M1ROWTwo:%[0-9]+]] = call ptr addrspace(10) (i1, ptr addrspace(10), ...) @llvm.spv.gep.p10.p10(i1 false, ptr addrspace(10) @M1, i32 0, i32 2)
+; IRCHECK: [[M1ROWTwoVec:%[0-9]+]] = load <2 x float>, ptr addrspace(10) [[M1ROWTwo]], align 8
+; IRCHECK: [[M1Elem_20:%[0-9]+]] = call float @llvm.spv.extractelt.f32.v2f32.i32(<2 x float> [[M1ROWTwoVec]], i32 0)
+
+ at M1 = internal addrspace(10) global [4 x <2 x float>] zeroinitializer, align 4
+ at OUT = internal addrspace(10) global <2 x float> zeroinitializer, align 4
+
+define spir_func void @main() #1 {
+entry:
+ %0 = load <5 x float>, ptr addrspace(10) @M1, align 4
+ %1 = shufflevector <5 x float> %0, <5 x float> poison, <2 x i32> <i32 0, i32 4>
+ store <2 x float> %1, ptr addrspace(10) @OUT, align 4
+ ret void
+}
+
+attributes #1 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
diff --git a/llvm/test/CodeGen/SPIRV/single-element-vector-nested-aggregate.ll b/llvm/test/CodeGen/SPIRV/single-element-vector-nested-aggregate.ll
new file mode 100644
index 0000000000000..48a59ad27f5ef
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/single-element-vector-nested-aggregate.ll
@@ -0,0 +1,129 @@
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; Verify that <1 x T> nested inside aggregate types is scalarized to T.
+
+; CHECK-NOT: OpTypeVector
+; CHECK-DAG: %[[#FloatTy:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#IntTy:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#PtrFloat:]] = OpTypePointer Function %[[#FloatTy]]
+; CHECK-DAG: %[[#Const8:]] = OpConstant %[[#IntTy]] 8
+; CHECK-DAG: %[[#Const4:]] = OpConstant %[[#IntTy]] 4
+; CHECK-DAG: %[[#Const2:]] = OpConstant %[[#IntTy]] 2
+; CHECK-DAG: %[[#Float1:]] = OpConstant %[[#FloatTy]] 1
+; CHECK-DAG: %[[#Float2:]] = OpConstant %[[#FloatTy]] 2
+; CHECK-DAG: %[[#Float3:]] = OpConstant %[[#FloatTy]] 3
+; CHECK-DAG: %[[#Int42:]] = OpConstant %[[#IntTy]] 42
+; CHECK-DAG: %[[#Int7:]] = OpConstant %[[#IntTy]] 7
+; CHECK-DAG: %[[#Arr8Float:]] = OpTypeArray %[[#FloatTy]] %[[#Const8]]
+; CHECK-DAG: %[[#PtrArr8Float:]] = OpTypePointer Function %[[#Arr8Float]]
+; CHECK-DAG: %[[#Arr4x8Float:]] = OpTypeArray %[[#Arr8Float]] %[[#Const4]]
+; CHECK-DAG: %[[#Arr4x4x8Float:]] = OpTypeArray %[[#Arr4x8Float]] %[[#Const4]]
+; CHECK-DAG: %[[#PtrArr4x4x8Float:]] = OpTypePointer Function %[[#Arr4x4x8Float]]
+; CHECK-DAG: %[[#StructFloatInt:]] = OpTypeStruct %[[#FloatTy]] %[[#IntTy]]
+; CHECK-DAG: %[[#PtrStructFloatInt:]] = OpTypePointer Function %[[#StructFloatInt]]
+; CHECK-DAG: %[[#Arr4Float:]] = OpTypeArray %[[#FloatTy]] %[[#Const4]]
+; CHECK-DAG: %[[#Arr2Int:]] = OpTypeArray %[[#IntTy]] %[[#Const2]]
+; CHECK-DAG: %[[#StructFloatArr2Int:]] = OpTypeStruct %[[#FloatTy]] %[[#Arr2Int]]
+
+; CHECK: OpFunction
+; CHECK: %[[#ArrVar:]] = OpVariable %[[#PtrArr8Float]] Function
+; CHECK: %[[#ArrGep:]] = OpPtrAccessChain %[[#PtrFloat]] %[[#ArrVar]]
+; CHECK: OpStore %[[#ArrGep]] %[[#Float1]] Aligned 4
+; CHECK: %[[#ArrLoad:]] = OpLoad %[[#FloatTy]] %[[#ArrGep]] Aligned 4
+; CHECK: OpStore %[[#]] %[[#ArrLoad]] Aligned 4
+; CHECK: OpFunctionEnd
+define spir_kernel void @vec1_in_array(ptr addrspace(1) %out) {
+entry:
+ %v = alloca [8 x <1 x float>], align 4, addrspace(0)
+ %p = getelementptr [8 x <1 x float>], ptr addrspace(0) %v, i32 0, i32 0
+ store <1 x float> <float 1.0>, ptr addrspace(0) %p, align 4
+ %r = load <1 x float>, ptr addrspace(0) %p, align 4
+ %s = extractelement <1 x float> %r, i32 0
+ store float %s, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; CHECK: OpFunction
+; CHECK: %[[#NestedVar:]] = OpVariable %[[#PtrArr4x4x8Float]] Function
+; CHECK: %[[#NestedGep:]] = OpPtrAccessChain %[[#PtrFloat]] %[[#NestedVar]]
+; CHECK: OpStore %[[#NestedGep]] %[[#Float2]] Aligned 4
+; CHECK: %[[#NestedLoad:]] = OpLoad %[[#FloatTy]] %[[#NestedGep]] Aligned 4
+; CHECK: OpStore %[[#]] %[[#NestedLoad]] Aligned 4
+; CHECK: OpFunctionEnd
+define spir_kernel void @vec1_in_nested_array(ptr addrspace(1) %out) {
+entry:
+ %v = alloca [4 x [4 x [8 x <1 x float>]]], align 4, addrspace(0)
+ %p = getelementptr [4 x [4 x [8 x <1 x float>]]], ptr addrspace(0) %v, i32 0, i32 0, i32 0, i32 0
+ store <1 x float> <float 2.0>, ptr addrspace(0) %p, align 4
+ %r = load <1 x float>, ptr addrspace(0) %p, align 4
+ %s = extractelement <1 x float> %r, i32 0
+ store float %s, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; CHECK: OpFunction
+; CHECK: %[[#StructVar:]] = OpVariable %[[#PtrStructFloatInt]] Function
+; CHECK: %[[#StructGep:]] = OpPtrAccessChain %[[#PtrFloat]] %[[#StructVar]]
+; CHECK: OpStore %[[#StructGep]] %[[#Float1]] Aligned 4
+; CHECK: %[[#StructLoad:]] = OpLoad %[[#FloatTy]] %[[#StructGep]] Aligned 4
+; CHECK: OpStore %[[#]] %[[#StructLoad]] Aligned 4
+; CHECK: OpFunctionEnd
+define spir_kernel void @vec1_in_struct(ptr addrspace(1) %out) {
+entry:
+ %v = alloca {<1 x float>, <1 x i32>}, align 4, addrspace(0)
+ %p = getelementptr {<1 x float>, <1 x i32>}, ptr addrspace(0) %v, i32 0, i32 0
+ store <1 x float> <float 1.0>, ptr addrspace(0) %p, align 4
+ %r = load <1 x float>, ptr addrspace(0) %p, align 4
+ %s = extractelement <1 x float> %r, i32 0
+ store float %s, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; CHECK: OpFunction
+; CHECK: %[[#ArrInsert1:]] = OpCompositeInsert %[[#Arr4Float]] %[[#Float1]] %[[#]] 0
+; CHECK: %[[#ArrInsert2:]] = OpCompositeInsert %[[#Arr4Float]] %[[#Float2]] %[[#ArrInsert1]] 1
+; CHECK: %[[#ArrExtract:]] = OpCompositeExtract %[[#FloatTy]] %[[#ArrInsert2]] 1
+; CHECK: OpStore %[[#]] %[[#ArrExtract]] Aligned 4
+; CHECK: OpFunctionEnd
+define spir_kernel void @vec1_insertvalue_extractvalue_array(ptr addrspace(1) %out) {
+entry:
+ %a = insertvalue [4 x <1 x float>] poison, <1 x float> <float 1.0>, 0
+ %a2 = insertvalue [4 x <1 x float>] %a, <1 x float> <float 2.0>, 1
+ %v = extractvalue [4 x <1 x float>] %a2, 1
+ %s = extractelement <1 x float> %v, i32 0
+ store float %s, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; CHECK: OpFunction
+; CHECK: %[[#StructInsert1:]] = OpCompositeInsert %[[#StructFloatInt]] %[[#Float1]] %[[#]] 0
+; CHECK: %[[#StructInsert2:]] = OpCompositeInsert %[[#StructFloatInt]] %[[#Int42]] %[[#StructInsert1]] 1
+; CHECK: %[[#StructExtract:]] = OpCompositeExtract %[[#FloatTy]] %[[#StructInsert2]] 0
+; CHECK: OpStore %[[#]] %[[#StructExtract]] Aligned 4
+; CHECK: OpFunctionEnd
+define spir_kernel void @vec1_insertvalue_extractvalue_struct(ptr addrspace(1) %out) {
+entry:
+ %a = insertvalue {<1 x float>, <1 x i32>} poison, <1 x float> <float 1.0>, 0
+ %a2 = insertvalue {<1 x float>, <1 x i32>} %a, <1 x i32> <i32 42>, 1
+ %v = extractvalue {<1 x float>, <1 x i32>} %a2, 0
+ %s = extractelement <1 x float> %v, i32 0
+ store float %s, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; CHECK: OpFunction
+; CHECK: %[[#MixedInsert1:]] = OpCompositeInsert %[[#StructFloatArr2Int]] %[[#Float3]] %[[#]] 0
+; CHECK: %[[#MixedInsert2:]] = OpCompositeInsert %[[#StructFloatArr2Int]] %[[#Int7]] %[[#MixedInsert1]] 1 0
+; CHECK: %[[#MixedExtract:]] = OpCompositeExtract %[[#FloatTy]] %[[#MixedInsert2]] 0
+; CHECK: OpStore %[[#]] %[[#MixedExtract]] Aligned 4
+; CHECK: OpFunctionEnd
+define spir_kernel void @vec1_struct_with_nested_array(ptr addrspace(1) %out) {
+entry:
+ %s = insertvalue {<1 x float>, [2 x <1 x i32>]} poison, <1 x float> <float 3.0>, 0
+ %s2 = insertvalue {<1 x float>, [2 x <1 x i32>]} %s, <1 x i32> <i32 7>, 1, 0
+ %v = extractvalue {<1 x float>, [2 x <1 x i32>]} %s2, 0
+ %sc = extractelement <1 x float> %v, i32 0
+ store float %sc, ptr addrspace(1) %out, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/undef-global-aggregate-initializer.ll b/llvm/test/CodeGen/SPIRV/undef-global-aggregate-initializer.ll
new file mode 100644
index 0000000000000..70f04ebb0f95c
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/undef-global-aggregate-initializer.ll
@@ -0,0 +1,73 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-pc-vulkan1.3-library %s -o - -filetype=obj | spirv-val %}
+
+%struct.simple = type { i8 }
+ at g_simple = private unnamed_addr addrspace(2) constant %struct.simple poison, align 1
+
+%struct.multi = type { i32, float, i8 }
+ at g_multi = private addrspace(2) constant %struct.multi poison, align 4
+
+ at g_arr = private addrspace(2) constant [3 x i32] poison, align 4
+
+%struct.inner = type { i32 }
+%struct.outer = type { %struct.inner, float }
+ at g_nested = private addrspace(2) constant %struct.outer poison, align 4
+
+%struct.mixed = type { i32, float }
+ at g_mixed = private addrspace(2) constant %struct.mixed { i32 poison, float 1.0 }, align 4
+
+%struct.with_arr = type { [2 x i32], float }
+ at g_struct_with_arr = private addrspace(2) constant %struct.with_arr poison, align 4
+
+ at g_arr_of_struct = private addrspace(2) constant [2 x %struct.with_arr] poison, align 4
+
+define spir_func void @foo() {
+entry:
+ ret void
+}
+
+; CHECK-DAG: %[[#I32:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#F32:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#I8:]] = OpTypeInt 8 0
+; CHECK-DAG: %[[#I32_ARR2:]] = OpTypeArray %[[#I32]] %[[#]]
+; CHECK-DAG: %[[#MULTI:]] = OpTypeStruct %[[#I32]] %[[#F32]] %[[#I8]]
+; CHECK-DAG: %[[#SIMPLE:]] = OpTypeStruct %[[#I8]]{{$}}
+; CHECK-DAG: %[[#ARR:]] = OpTypeArray %[[#I32]] %[[#]]
+; CHECK-DAG: %[[#INNER:]] = OpTypeStruct %[[#I32]]{{$}}
+; CHECK-DAG: %[[#WITH_ARR:]] = OpTypeStruct %[[#I32_ARR2]] %[[#F32]]
+
+; CHECK-DAG: %[[#OUTER:]] = OpTypeStruct %[[#INNER]] %[[#F32]]
+; CHECK-DAG: %[[#MIXED:]] = OpTypeStruct %[[#I32]] %[[#F32]]{{$}}
+; CHECK-DAG: %[[#ARR_OF_STRUCT:]] = OpTypeArray %[[#WITH_ARR]] %[[#]]
+
+; CHECK-DAG: %[[#ARR_OF_STRUCT_PTR:]] = OpTypePointer UniformConstant %[[#ARR_OF_STRUCT]]
+; CHECK-DAG: %[[#WITH_ARR_PTR:]] = OpTypePointer UniformConstant %[[#WITH_ARR]]
+; CHECK-DAG: %[[#MIXED_PTR:]] = OpTypePointer UniformConstant %[[#MIXED]]
+; CHECK-DAG: %[[#OUTER_PTR:]] = OpTypePointer UniformConstant %[[#OUTER]]
+; CHECK-DAG: %[[#ARR_PTR:]] = OpTypePointer UniformConstant %[[#ARR]]
+; CHECK-DAG: %[[#MULTI_PTR:]] = OpTypePointer UniformConstant %[[#MULTI]]
+; CHECK-DAG: %[[#SIMPLE_PTR:]] = OpTypePointer UniformConstant %[[#SIMPLE]]
+
+; CHECK-DAG: %[[#CONST_F32:]] = OpConstant %[[#F32]] 1
+; CHECK-DAG: %[[#UNDEF_I8:]] = OpUndef %[[#I8]]
+; CHECK-DAG: %[[#UNDEF_I32:]] = OpUndef %[[#I32]]
+; CHECK-DAG: %[[#UNDEF_F32:]] = OpUndef %[[#F32]]
+; CHECK-DAG: %[[#UNDEF_INNER:]] = OpUndef %[[#INNER]]
+; CHECK-DAG: %[[#UNDEF_I32_ARR2:]] = OpUndef %[[#I32_ARR2]]
+; CHECK-DAG: %[[#UNDEF_WITH_ARR:]] = OpUndef %[[#WITH_ARR]]
+
+; CHECK-DAG: OpConstantComposite %[[#SIMPLE]] %[[#UNDEF_I8]]
+; CHECK-DAG: OpVariable %[[#SIMPLE_PTR]] UniformConstant
+; CHECK-DAG: OpConstantComposite %[[#MULTI]] %[[#UNDEF_I32]] %[[#UNDEF_F32]] %[[#UNDEF_I8]]
+; CHECK-DAG: OpVariable %[[#MULTI_PTR]] UniformConstant
+; CHECK-DAG: OpConstantComposite %[[#ARR]] %[[#UNDEF_I32]] %[[#UNDEF_I32]] %[[#UNDEF_I32]]
+; CHECK-DAG: OpVariable %[[#ARR_PTR]] UniformConstant
+; CHECK-DAG: OpConstantComposite %[[#OUTER]] %[[#UNDEF_INNER]] %[[#UNDEF_F32]]
+; CHECK-DAG: OpVariable %[[#OUTER_PTR]] UniformConstant
+; CHECK-DAG: OpConstantComposite %[[#MIXED]] %[[#UNDEF_I32]] %[[#CONST_F32]]
+; CHECK-DAG: OpVariable %[[#MIXED_PTR]] UniformConstant
+; CHECK-DAG: OpConstantComposite %[[#WITH_ARR]] %[[#UNDEF_I32_ARR2]] %[[#UNDEF_F32]]
+; CHECK-DAG: OpVariable %[[#WITH_ARR_PTR]] UniformConstant
+; CHECK-DAG: OpConstantComposite %[[#ARR_OF_STRUCT]] %[[#UNDEF_WITH_ARR]] %[[#UNDEF_WITH_ARR]]
+; CHECK-DAG: OpVariable %[[#ARR_OF_STRUCT_PTR]] UniformConstant
diff --git a/llvm/test/CodeGen/Thumb/smul_fix_sat.ll b/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
index 24209b45e302d..52921f0f347b9 100644
--- a/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
+++ b/llvm/test/CodeGen/Thumb/smul_fix_sat.ll
@@ -421,69 +421,68 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
; ARM-NEXT: ldr r2, [sp, #16] @ 4-byte Reload
; ARM-NEXT: mov r3, r5
; ARM-NEXT: bl __aeabi_lmul
-; ARM-NEXT: adds r0, r0, r7
+; ARM-NEXT: adds r3, r0, r7
; ARM-NEXT: adcs r1, r6
-; ARM-NEXT: rsbs r5, r1, #0
-; ARM-NEXT: adcs r5, r1
-; ARM-NEXT: movs r2, #1
-; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
-; ARM-NEXT: cmp r0, #0
-; ARM-NEXT: mov r3, r2
-; ARM-NEXT: bge .LBB6_2
+; ARM-NEXT: rsbs r0, r1, #0
+; ARM-NEXT: adcs r0, r1
+; ARM-NEXT: lsrs r2, r3, #31
+; ARM-NEXT: ands r2, r0
+; ARM-NEXT: movs r0, #1
+; ARM-NEXT: cmp r1, #0
+; ARM-NEXT: mov r5, r0
+; ARM-NEXT: bgt .LBB6_2
; ARM-NEXT: @ %bb.1:
-; ARM-NEXT: mov r3, r4
+; ARM-NEXT: mov r5, r4
; ARM-NEXT: .LBB6_2:
-; ARM-NEXT: mov r6, r2
-; ARM-NEXT: bmi .LBB6_4
+; ARM-NEXT: orrs r5, r2
+; ARM-NEXT: mvns r6, r4
+; ARM-NEXT: cmp r5, #0
+; ARM-NEXT: bne .LBB6_4
; ARM-NEXT: @ %bb.3:
-; ARM-NEXT: mov r6, r4
+; ARM-NEXT: str r3, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r5, r6
+; ARM-NEXT: beq .LBB6_5
+; ARM-NEXT: b .LBB6_6
; ARM-NEXT: .LBB6_4:
-; ARM-NEXT: ands r5, r6
-; ARM-NEXT: cmp r1, #0
-; ARM-NEXT: mov r7, r2
-; ARM-NEXT: bgt .LBB6_6
-; ARM-NEXT: @ %bb.5:
-; ARM-NEXT: mov r7, r4
+; ARM-NEXT: ldr r2, .LCPI6_0
+; ARM-NEXT: str r2, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r5, r6
+; ARM-NEXT: bne .LBB6_6
+; ARM-NEXT: .LBB6_5:
+; ARM-NEXT: ldr r5, [sp] @ 4-byte Reload
; ARM-NEXT: .LBB6_6:
-; ARM-NEXT: orrs r7, r5
-; ARM-NEXT: mvns r6, r4
-; ARM-NEXT: cmp r7, #0
-; ARM-NEXT: beq .LBB6_8
+; ARM-NEXT: adds r2, r1, #1
+; ARM-NEXT: rsbs r7, r2, #0
+; ARM-NEXT: adcs r7, r2
+; ARM-NEXT: cmp r3, #0
+; ARM-NEXT: mov r3, r0
+; ARM-NEXT: bge .LBB6_8
; ARM-NEXT: @ %bb.7:
-; ARM-NEXT: ldr r0, .LCPI6_0
-; ARM-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; ARM-NEXT: mov r3, r4
; ARM-NEXT: .LBB6_8:
-; ARM-NEXT: mov r5, r6
-; ARM-NEXT: bne .LBB6_10
-; ARM-NEXT: @ %bb.9:
-; ARM-NEXT: ldr r5, [sp] @ 4-byte Reload
-; ARM-NEXT: .LBB6_10:
-; ARM-NEXT: adds r0, r1, #1
-; ARM-NEXT: rsbs r7, r0, #0
-; ARM-NEXT: adcs r7, r0
; ARM-NEXT: ands r7, r3
; ARM-NEXT: cmp r1, r6
-; ARM-NEXT: mov r3, r2
-; ARM-NEXT: blt .LBB6_12
-; ARM-NEXT: @ %bb.11:
+; ARM-NEXT: mov r3, r0
+; ARM-NEXT: blt .LBB6_10
+; ARM-NEXT: @ %bb.9:
; ARM-NEXT: mov r3, r4
-; ARM-NEXT: .LBB6_12:
+; ARM-NEXT: .LBB6_10:
; ARM-NEXT: orrs r3, r7
-; ARM-NEXT: lsls r1, r2, #31
+; ARM-NEXT: lsls r1, r0, #31
; ARM-NEXT: cmp r3, #0
+; ARM-NEXT: bne .LBB6_12
+; ARM-NEXT: @ %bb.11:
+; ARM-NEXT: mov r4, r5
+; ARM-NEXT: .LBB6_12:
; ARM-NEXT: bne .LBB6_14
; ARM-NEXT: @ %bb.13:
-; ARM-NEXT: mov r4, r5
-; ARM-NEXT: .LBB6_14:
-; ARM-NEXT: bne .LBB6_16
-; ARM-NEXT: @ %bb.15:
; ARM-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
-; ARM-NEXT: .LBB6_16:
+; ARM-NEXT: .LBB6_14:
; ARM-NEXT: mov r0, r4
; ARM-NEXT: add sp, #20
; ARM-NEXT: pop {r4, r5, r6, r7, pc}
; ARM-NEXT: .p2align 2
-; ARM-NEXT: @ %bb.17:
+; ARM-NEXT: @ %bb.15:
; ARM-NEXT: .LCPI6_0:
; ARM-NEXT: .long 2147483647 @ 0x7fffffff
%tmp = call i64 @llvm.smul.fix.sat.i64(i64 %x, i64 %y, i32 32)
diff --git a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
index bbc0ff9bd1be5..16d450a4e3b7a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-saturating-arith.ll
@@ -34,45 +34,45 @@ entry:
define arm_aapcs_vfpcc <2 x i64> @sadd_int64_t(<2 x i64> %src1, <2 x i64> %src2) {
; CHECK-LABEL: sadd_int64_t:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: vmov r0, r1, d2
; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: vmov r4, r5, d1
; CHECK-NEXT: adds.w r12, r2, r0
-; CHECK-NEXT: vmov r0, r4, d1
-; CHECK-NEXT: adc.w lr, r3, r1
+; CHECK-NEXT: adc.w r0, r3, r1
; CHECK-NEXT: subs.w r2, r12, r2
-; CHECK-NEXT: sbcs.w r2, lr, r3
+; CHECK-NEXT: sbcs.w r2, r0, r3
+; CHECK-NEXT: mov.w r3, #0
; CHECK-NEXT: cset r2, lt
-; CHECK-NEXT: cmp r1, #0
-; CHECK-NEXT: it mi
-; CHECK-NEXT: eormi r2, r2, #1
-; CHECK-NEXT: rsbs r1, r2, #0
-; CHECK-NEXT: movs r2, #0
-; CHECK-NEXT: bfi r2, r1, #0, #8
-; CHECK-NEXT: vmov r1, r3, d3
-; CHECK-NEXT: adds r1, r1, r0
-; CHECK-NEXT: adc.w r5, r4, r3
-; CHECK-NEXT: subs r0, r1, r0
-; CHECK-NEXT: sbcs.w r0, r5, r4
-; CHECK-NEXT: vmov q0[2], q0[0], r12, r1
-; CHECK-NEXT: cset r0, lt
-; CHECK-NEXT: cmp r3, #0
-; CHECK-NEXT: it mi
-; CHECK-NEXT: eormi r0, r0, #1
-; CHECK-NEXT: asr.w r1, lr, #31
-; CHECK-NEXT: rsbs r0, r0, #0
-; CHECK-NEXT: vmov q0[3], q0[1], lr, r5
-; CHECK-NEXT: bfi r2, r0, #8, #8
-; CHECK-NEXT: asrs r0, r5, #31
-; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
-; CHECK-NEXT: vmsr p0, r2
-; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
+; CHECK-NEXT: cmp.w r3, r1, lsr #31
+; CHECK-NEXT: it ne
+; CHECK-NEXT: eorne r2, r2, #1
+; CHECK-NEXT: rsb.w lr, r2, #0
+; CHECK-NEXT: vmov r2, r1, d3
+; CHECK-NEXT: adds r2, r2, r4
+; CHECK-NEXT: adc.w r6, r5, r1
+; CHECK-NEXT: subs r4, r2, r4
+; CHECK-NEXT: sbcs.w r4, r6, r5
+; CHECK-NEXT: vmov q0[2], q0[0], r12, r2
+; CHECK-NEXT: cset r4, lt
+; CHECK-NEXT: cmp.w r3, r1, lsr #31
+; CHECK-NEXT: it ne
+; CHECK-NEXT: eorne r4, r4, #1
+; CHECK-NEXT: bfi r3, lr, #0, #8
+; CHECK-NEXT: rsbs r1, r4, #0
+; CHECK-NEXT: vmov q0[3], q0[1], r0, r6
+; CHECK-NEXT: bfi r3, r1, #8, #8
+; CHECK-NEXT: asrs r1, r6, #31
+; CHECK-NEXT: asrs r0, r0, #31
+; CHECK-NEXT: vmsr p0, r3
+; CHECK-NEXT: vmov q1[2], q1[0], r0, r1
+; CHECK-NEXT: vmov q1[3], q1[1], r0, r1
; CHECK-NEXT: adr r0, .LCPI3_0
; CHECK-NEXT: vldrw.u32 q2, [r0]
; CHECK-NEXT: veor q1, q1, q2
; CHECK-NEXT: vpsel q0, q1, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: .LCPI3_0:
diff --git a/llvm/test/CodeGen/WebAssembly/atomic-fence.ll b/llvm/test/CodeGen/WebAssembly/atomic-fence.ll
index 8fed309bcce00..5df28f2cdcd8d 100644
--- a/llvm/test/CodeGen/WebAssembly/atomic-fence.ll
+++ b/llvm/test/CodeGen/WebAssembly/atomic-fence.ll
@@ -1,36 +1,140 @@
-; RUN: llc < %s | FileCheck %s --check-prefix NOATOMIC
-; RUN: llc < %s -asm-verbose=false -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+atomics | FileCheck %s
-
-target triple = "wasm32-unknown-unknown"
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s --mtriple=wasm32 | FileCheck %s --check-prefixes=NO-ATOMICS
+; RUN: llc < %s --mtriple=wasm32 -mattr=+atomics | FileCheck %s --check-prefixes=ATOMICS
+; RUN: llc < %s --mtriple=wasm32 -mattr=+atomics,+relaxed-atomics | FileCheck %s --check-prefixes=RELAXED32
+; RUN: llc < %s --mtriple=wasm64 -mattr=+atomics,+relaxed-atomics | FileCheck %s --check-prefixes=RELAXED64
; A multithread fence is lowered to an atomic.fence instruction.
-; CHECK-LABEL: multithread_fence:
-; CHECK: atomic.fence
-; NOATOMIC-NOT: i32.atomic.rmw.or
-define void @multithread_fence() {
- fence seq_cst
+define void @fence_acquire() {
+; NO-ATOMICS-LABEL: fence_acquire:
+; NO-ATOMICS: .functype fence_acquire () -> ()
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: fence_acquire:
+; ATOMICS: .functype fence_acquire () -> ()
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: atomic.fence
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: fence_acquire:
+; RELAXED32: .functype fence_acquire () -> ()
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: atomic.fence acqrel
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: fence_acquire:
+; RELAXED64: .functype fence_acquire () -> ()
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: atomic.fence acqrel
+; RELAXED64-NEXT: # fallthrough-return
+ fence acquire
ret void
}
-; Fences with weaker memory orderings than seq_cst should be treated the same
-; because atomic memory access in wasm are sequentially consistent.
-; CHECK-LABEL: multithread_weak_fence:
-; CHECK: atomic.fence
-; CHECK-NEXT: atomic.fence
-; CHECK-NEXT: atomic.fence
-define void @multithread_weak_fence() {
- fence acquire
+define void @fence_release() {
+; NO-ATOMICS-LABEL: fence_release:
+; NO-ATOMICS: .functype fence_release () -> ()
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: fence_release:
+; ATOMICS: .functype fence_release () -> ()
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: atomic.fence
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: fence_release:
+; RELAXED32: .functype fence_release () -> ()
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: atomic.fence acqrel
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: fence_release:
+; RELAXED64: .functype fence_release () -> ()
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: atomic.fence acqrel
+; RELAXED64-NEXT: # fallthrough-return
fence release
+ ret void
+}
+
+define void @fence_acq_rel() {
+; NO-ATOMICS-LABEL: fence_acq_rel:
+; NO-ATOMICS: .functype fence_acq_rel () -> ()
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: fence_acq_rel:
+; ATOMICS: .functype fence_acq_rel () -> ()
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: atomic.fence
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: fence_acq_rel:
+; RELAXED32: .functype fence_acq_rel () -> ()
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: atomic.fence acqrel
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: fence_acq_rel:
+; RELAXED64: .functype fence_acq_rel () -> ()
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: atomic.fence acqrel
+; RELAXED64-NEXT: # fallthrough-return
fence acq_rel
ret void
}
+define void @fence_seq_cst() {
+; NO-ATOMICS-LABEL: fence_seq_cst:
+; NO-ATOMICS: .functype fence_seq_cst () -> ()
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: fence_seq_cst:
+; ATOMICS: .functype fence_seq_cst () -> ()
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: atomic.fence
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: fence_seq_cst:
+; RELAXED32: .functype fence_seq_cst () -> ()
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: atomic.fence seqcst
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: fence_seq_cst:
+; RELAXED64: .functype fence_seq_cst () -> ()
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: atomic.fence seqcst
+; RELAXED64-NEXT: # fallthrough-return
+ fence seq_cst
+ ret void
+}
+
; A singlethread fence becomes compiler_fence instruction, a pseudo instruction
; that acts as a compiler barrier. The barrier should not be emitted to .s file.
-; CHECK-LABEL: singlethread_fence:
-; CHECK-NOT: compiler_fence
-; CHECK-NOT: atomic_fence
define void @singlethread_fence() {
+; NO-ATOMICS-LABEL: singlethread_fence:
+; NO-ATOMICS: .functype singlethread_fence () -> ()
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: singlethread_fence:
+; ATOMICS: .functype singlethread_fence () -> ()
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: singlethread_fence:
+; RELAXED32: .functype singlethread_fence () -> ()
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: singlethread_fence:
+; RELAXED64: .functype singlethread_fence () -> ()
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: # fallthrough-return
fence syncscope("singlethread") seq_cst
fence syncscope("singlethread") acquire
fence syncscope("singlethread") release
diff --git a/llvm/test/CodeGen/WebAssembly/atomic-mem-consistency.ll b/llvm/test/CodeGen/WebAssembly/atomic-mem-consistency.ll
index 5e9a0060c6ece..116c5fbba34d9 100644
--- a/llvm/test/CodeGen/WebAssembly/atomic-mem-consistency.ll
+++ b/llvm/test/CodeGen/WebAssembly/atomic-mem-consistency.ll
@@ -1,9 +1,13 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+atomics,+sign-ext | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s --mtriple=wasm32 | FileCheck %s --check-prefixes=NO-ATOMICS
+; RUN: llc < %s --mtriple=wasm32 -mattr=+atomics | FileCheck %s --check-prefixes=ATOMICS
+; RUN: llc < %s --mtriple=wasm32 -mattr=+atomics,+relaxed-atomics | FileCheck %s --check-prefixes=RELAXED32
+; RUN: llc < %s --mtriple=wasm64 -mattr=+atomics,+relaxed-atomics | FileCheck %s --check-prefixes=RELAXED64
-; Currently all wasm atomic memory access instructions are sequentially
-; consistent, so even if LLVM IR specifies weaker orderings than that, we
-; should upgrade them to sequential ordering and treat them in the same way.
+; Currently Wasm supports a constrained set of atomic memory orderings.
+; Originally it supported only sequential consistency, but now it also
+; supports relaxed atomics. Weaker orderings in LLVM IR are "upgraded" to
+; the next supported ordering.
target triple = "wasm32-unknown-unknown"
@@ -13,34 +17,130 @@ target triple = "wasm32-unknown-unknown"
; The 'release' and 'acq_rel' orderings are not valid on load instructions.
-; CHECK-LABEL: load_i32_unordered:
-; CHECK: i32.atomic.load $push0=, 0($0){{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @load_i32_unordered(ptr %p) {
+; NO-ATOMICS-LABEL: load_i32_unordered:
+; NO-ATOMICS: .functype load_i32_unordered (i32) -> (i32)
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: load_i32_unordered:
+; ATOMICS: .functype load_i32_unordered (i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: i32.atomic.load 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: load_i32_unordered:
+; RELAXED32: .functype load_i32_unordered (i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: i32.atomic.load acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: load_i32_unordered:
+; RELAXED64: .functype load_i32_unordered (i64) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: i32.atomic.load acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
%v = load atomic i32, ptr %p unordered, align 4
ret i32 %v
}
-; CHECK-LABEL: load_i32_monotonic:
-; CHECK: i32.atomic.load $push0=, 0($0){{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @load_i32_monotonic(ptr %p) {
+; NO-ATOMICS-LABEL: load_i32_monotonic:
+; NO-ATOMICS: .functype load_i32_monotonic (i32) -> (i32)
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: load_i32_monotonic:
+; ATOMICS: .functype load_i32_monotonic (i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: i32.atomic.load 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: load_i32_monotonic:
+; RELAXED32: .functype load_i32_monotonic (i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: i32.atomic.load acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: load_i32_monotonic:
+; RELAXED64: .functype load_i32_monotonic (i64) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: i32.atomic.load acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
%v = load atomic i32, ptr %p monotonic, align 4
ret i32 %v
}
-; CHECK-LABEL: load_i32_acquire:
-; CHECK: i32.atomic.load $push0=, 0($0){{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @load_i32_acquire(ptr %p) {
+; NO-ATOMICS-LABEL: load_i32_acquire:
+; NO-ATOMICS: .functype load_i32_acquire (i32) -> (i32)
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: load_i32_acquire:
+; ATOMICS: .functype load_i32_acquire (i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: i32.atomic.load 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: load_i32_acquire:
+; RELAXED32: .functype load_i32_acquire (i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: i32.atomic.load acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: load_i32_acquire:
+; RELAXED64: .functype load_i32_acquire (i64) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: i32.atomic.load acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
%v = load atomic i32, ptr %p acquire, align 4
ret i32 %v
}
-; CHECK-LABEL: load_i32_seq_cst:
-; CHECK: i32.atomic.load $push0=, 0($0){{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @load_i32_seq_cst(ptr %p) {
+; NO-ATOMICS-LABEL: load_i32_seq_cst:
+; NO-ATOMICS: .functype load_i32_seq_cst (i32) -> (i32)
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: load_i32_seq_cst:
+; ATOMICS: .functype load_i32_seq_cst (i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: i32.atomic.load 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: load_i32_seq_cst:
+; RELAXED32: .functype load_i32_seq_cst (i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: i32.atomic.load seqcst 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: load_i32_seq_cst:
+; RELAXED64: .functype load_i32_seq_cst (i64) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: i32.atomic.load seqcst 0
+; RELAXED64-NEXT: # fallthrough-return
%v = load atomic i32, ptr %p seq_cst, align 4
ret i32 %v
}
@@ -51,38 +151,146 @@ define i32 @load_i32_seq_cst(ptr %p) {
; The 'acquire' and 'acq_rel' orderings aren't valid on store instructions.
-; CHECK-LABEL: store_i32_unordered:
-; CHECK-NEXT: .functype store_i32_unordered (i32, i32) -> (){{$}}
-; CHECK-NEXT: i32.atomic.store 0($0), $1{{$}}
-; CHECK-NEXT: return{{$}}
define void @store_i32_unordered(ptr %p, i32 %v) {
+; NO-ATOMICS-LABEL: store_i32_unordered:
+; NO-ATOMICS: .functype store_i32_unordered (i32, i32) -> ()
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: store_i32_unordered:
+; ATOMICS: .functype store_i32_unordered (i32, i32) -> ()
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: i32.atomic.store 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: store_i32_unordered:
+; RELAXED32: .functype store_i32_unordered (i32, i32) -> ()
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: i32.atomic.store acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: store_i32_unordered:
+; RELAXED64: .functype store_i32_unordered (i64, i32) -> ()
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: i32.atomic.store acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
store atomic i32 %v, ptr %p unordered, align 4
ret void
}
-; CHECK-LABEL: store_i32_monotonic:
-; CHECK-NEXT: .functype store_i32_monotonic (i32, i32) -> (){{$}}
-; CHECK-NEXT: i32.atomic.store 0($0), $1{{$}}
-; CHECK-NEXT: return{{$}}
define void @store_i32_monotonic(ptr %p, i32 %v) {
+; NO-ATOMICS-LABEL: store_i32_monotonic:
+; NO-ATOMICS: .functype store_i32_monotonic (i32, i32) -> ()
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: store_i32_monotonic:
+; ATOMICS: .functype store_i32_monotonic (i32, i32) -> ()
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: i32.atomic.store 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: store_i32_monotonic:
+; RELAXED32: .functype store_i32_monotonic (i32, i32) -> ()
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: i32.atomic.store acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: store_i32_monotonic:
+; RELAXED64: .functype store_i32_monotonic (i64, i32) -> ()
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: i32.atomic.store acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
store atomic i32 %v, ptr %p monotonic, align 4
ret void
}
-; CHECK-LABEL: store_i32_release:
-; CHECK-NEXT: .functype store_i32_release (i32, i32) -> (){{$}}
-; CHECK-NEXT: i32.atomic.store 0($0), $1{{$}}
-; CHECK-NEXT: return{{$}}
define void @store_i32_release(ptr %p, i32 %v) {
+; NO-ATOMICS-LABEL: store_i32_release:
+; NO-ATOMICS: .functype store_i32_release (i32, i32) -> ()
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: store_i32_release:
+; ATOMICS: .functype store_i32_release (i32, i32) -> ()
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: i32.atomic.store 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: store_i32_release:
+; RELAXED32: .functype store_i32_release (i32, i32) -> ()
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: i32.atomic.store acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: store_i32_release:
+; RELAXED64: .functype store_i32_release (i64, i32) -> ()
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: i32.atomic.store acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
store atomic i32 %v, ptr %p release, align 4
ret void
}
-; CHECK-LABEL: store_i32_seq_cst:
-; CHECK-NEXT: .functype store_i32_seq_cst (i32, i32) -> (){{$}}
-; CHECK-NEXT: i32.atomic.store 0($0), $1{{$}}
-; CHECK-NEXT: return{{$}}
define void @store_i32_seq_cst(ptr %p, i32 %v) {
+; NO-ATOMICS-LABEL: store_i32_seq_cst:
+; NO-ATOMICS: .functype store_i32_seq_cst (i32, i32) -> ()
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: store_i32_seq_cst:
+; ATOMICS: .functype store_i32_seq_cst (i32, i32) -> ()
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: i32.atomic.store 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: store_i32_seq_cst:
+; RELAXED32: .functype store_i32_seq_cst (i32, i32) -> ()
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: i32.atomic.store seqcst 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: store_i32_seq_cst:
+; RELAXED64: .functype store_i32_seq_cst (i64, i32) -> ()
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: i32.atomic.store seqcst 0
+; RELAXED64-NEXT: # fallthrough-return
store atomic i32 %v, ptr %p seq_cst, align 4
ret void
}
@@ -94,47 +302,212 @@ define void @store_i32_seq_cst(ptr %p, i32 %v) {
; Out of several binary RMW instructions, here we test 'add' as an example.
; The 'unordered' ordering is not valid on atomicrmw instructions.
-; CHECK-LABEL: add_i32_monotonic:
-; CHECK-NEXT: .functype add_i32_monotonic (i32, i32) -> (i32){{$}}
-; CHECK: i32.atomic.rmw.add $push0=, 0($0), $1{{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @add_i32_monotonic(ptr %p, i32 %v) {
+; NO-ATOMICS-LABEL: add_i32_monotonic:
+; NO-ATOMICS: .functype add_i32_monotonic (i32, i32) -> (i32)
+; NO-ATOMICS-NEXT: .local i32
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: local.tee 2
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.add
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: local.get 2
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: add_i32_monotonic:
+; ATOMICS: .functype add_i32_monotonic (i32, i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: i32.atomic.rmw.add 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: add_i32_monotonic:
+; RELAXED32: .functype add_i32_monotonic (i32, i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: i32.atomic.rmw.add acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: add_i32_monotonic:
+; RELAXED64: .functype add_i32_monotonic (i64, i32) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: i32.atomic.rmw.add acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
%old = atomicrmw add ptr %p, i32 %v monotonic
ret i32 %old
}
-; CHECK-LABEL: add_i32_acquire:
-; CHECK-NEXT: .functype add_i32_acquire (i32, i32) -> (i32){{$}}
-; CHECK: i32.atomic.rmw.add $push0=, 0($0), $1{{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @add_i32_acquire(ptr %p, i32 %v) {
+; NO-ATOMICS-LABEL: add_i32_acquire:
+; NO-ATOMICS: .functype add_i32_acquire (i32, i32) -> (i32)
+; NO-ATOMICS-NEXT: .local i32
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: local.tee 2
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.add
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: local.get 2
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: add_i32_acquire:
+; ATOMICS: .functype add_i32_acquire (i32, i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: i32.atomic.rmw.add 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: add_i32_acquire:
+; RELAXED32: .functype add_i32_acquire (i32, i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: i32.atomic.rmw.add acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: add_i32_acquire:
+; RELAXED64: .functype add_i32_acquire (i64, i32) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: i32.atomic.rmw.add acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
%old = atomicrmw add ptr %p, i32 %v acquire
ret i32 %old
}
-; CHECK-LABEL: add_i32_release:
-; CHECK-NEXT: .functype add_i32_release (i32, i32) -> (i32){{$}}
-; CHECK: i32.atomic.rmw.add $push0=, 0($0), $1{{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @add_i32_release(ptr %p, i32 %v) {
+; NO-ATOMICS-LABEL: add_i32_release:
+; NO-ATOMICS: .functype add_i32_release (i32, i32) -> (i32)
+; NO-ATOMICS-NEXT: .local i32
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: local.tee 2
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.add
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: local.get 2
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: add_i32_release:
+; ATOMICS: .functype add_i32_release (i32, i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: i32.atomic.rmw.add 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: add_i32_release:
+; RELAXED32: .functype add_i32_release (i32, i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: i32.atomic.rmw.add acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: add_i32_release:
+; RELAXED64: .functype add_i32_release (i64, i32) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: i32.atomic.rmw.add acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
%old = atomicrmw add ptr %p, i32 %v release
ret i32 %old
}
-; CHECK-LABEL: add_i32_acq_rel:
-; CHECK-NEXT: .functype add_i32_acq_rel (i32, i32) -> (i32){{$}}
-; CHECK: i32.atomic.rmw.add $push0=, 0($0), $1{{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @add_i32_acq_rel(ptr %p, i32 %v) {
+; NO-ATOMICS-LABEL: add_i32_acq_rel:
+; NO-ATOMICS: .functype add_i32_acq_rel (i32, i32) -> (i32)
+; NO-ATOMICS-NEXT: .local i32
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: local.tee 2
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.add
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: local.get 2
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: add_i32_acq_rel:
+; ATOMICS: .functype add_i32_acq_rel (i32, i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: i32.atomic.rmw.add 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: add_i32_acq_rel:
+; RELAXED32: .functype add_i32_acq_rel (i32, i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: i32.atomic.rmw.add acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: add_i32_acq_rel:
+; RELAXED64: .functype add_i32_acq_rel (i64, i32) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: i32.atomic.rmw.add acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
%old = atomicrmw add ptr %p, i32 %v acq_rel
ret i32 %old
}
-; CHECK-LABEL: add_i32_seq_cst:
-; CHECK-NEXT: .functype add_i32_seq_cst (i32, i32) -> (i32){{$}}
-; CHECK: i32.atomic.rmw.add $push0=, 0($0), $1{{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @add_i32_seq_cst(ptr %p, i32 %v) {
+; NO-ATOMICS-LABEL: add_i32_seq_cst:
+; NO-ATOMICS: .functype add_i32_seq_cst (i32, i32) -> (i32)
+; NO-ATOMICS-NEXT: .local i32
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: local.tee 2
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.add
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: local.get 2
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: add_i32_seq_cst:
+; ATOMICS: .functype add_i32_seq_cst (i32, i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: i32.atomic.rmw.add 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: add_i32_seq_cst:
+; RELAXED32: .functype add_i32_seq_cst (i32, i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: i32.atomic.rmw.add seqcst 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: add_i32_seq_cst:
+; RELAXED64: .functype add_i32_seq_cst (i64, i32) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: i32.atomic.rmw.add seqcst 0
+; RELAXED64-NEXT: # fallthrough-return
%old = atomicrmw add ptr %p, i32 %v seq_cst
ret i32 %old
}
@@ -145,81 +518,393 @@ define i32 @add_i32_seq_cst(ptr %p, i32 %v) {
; least monotonic, the ordering constraint on failure must be no stronger than
; that on success, and the failure ordering cannot be either release or acq_rel.
-; CHECK-LABEL: cmpxchg_i32_monotonic_monotonic:
-; CHECK-NEXT: .functype cmpxchg_i32_monotonic_monotonic (i32, i32, i32) -> (i32){{$}}
-; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @cmpxchg_i32_monotonic_monotonic(ptr %p, i32 %exp, i32 %new) {
+; NO-ATOMICS-LABEL: cmpxchg_i32_monotonic_monotonic:
+; NO-ATOMICS: .functype cmpxchg_i32_monotonic_monotonic (i32, i32, i32) -> (i32)
+; NO-ATOMICS-NEXT: .local i32
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 2
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: local.tee 3
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.eq
+; NO-ATOMICS-NEXT: i32.select
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: cmpxchg_i32_monotonic_monotonic:
+; ATOMICS: .functype cmpxchg_i32_monotonic_monotonic (i32, i32, i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: local.get 2
+; ATOMICS-NEXT: i32.atomic.rmw.cmpxchg 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: cmpxchg_i32_monotonic_monotonic:
+; RELAXED32: .functype cmpxchg_i32_monotonic_monotonic (i32, i32, i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: local.get 2
+; RELAXED32-NEXT: i32.atomic.rmw.cmpxchg acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: cmpxchg_i32_monotonic_monotonic:
+; RELAXED64: .functype cmpxchg_i32_monotonic_monotonic (i64, i32, i32) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: local.get 2
+; RELAXED64-NEXT: i32.atomic.rmw.cmpxchg acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
%pair = cmpxchg ptr %p, i32 %exp, i32 %new monotonic monotonic
%old = extractvalue { i32, i1 } %pair, 0
ret i32 %old
}
-; CHECK-LABEL: cmpxchg_i32_acquire_monotonic:
-; CHECK-NEXT: .functype cmpxchg_i32_acquire_monotonic (i32, i32, i32) -> (i32){{$}}
-; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @cmpxchg_i32_acquire_monotonic(ptr %p, i32 %exp, i32 %new) {
+; NO-ATOMICS-LABEL: cmpxchg_i32_acquire_monotonic:
+; NO-ATOMICS: .functype cmpxchg_i32_acquire_monotonic (i32, i32, i32) -> (i32)
+; NO-ATOMICS-NEXT: .local i32
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 2
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: local.tee 3
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.eq
+; NO-ATOMICS-NEXT: i32.select
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: cmpxchg_i32_acquire_monotonic:
+; ATOMICS: .functype cmpxchg_i32_acquire_monotonic (i32, i32, i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: local.get 2
+; ATOMICS-NEXT: i32.atomic.rmw.cmpxchg 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: cmpxchg_i32_acquire_monotonic:
+; RELAXED32: .functype cmpxchg_i32_acquire_monotonic (i32, i32, i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: local.get 2
+; RELAXED32-NEXT: i32.atomic.rmw.cmpxchg acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: cmpxchg_i32_acquire_monotonic:
+; RELAXED64: .functype cmpxchg_i32_acquire_monotonic (i64, i32, i32) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: local.get 2
+; RELAXED64-NEXT: i32.atomic.rmw.cmpxchg acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
%pair = cmpxchg ptr %p, i32 %exp, i32 %new acquire monotonic
%old = extractvalue { i32, i1 } %pair, 0
ret i32 %old
}
-; CHECK-LABEL: cmpxchg_i32_release_monotonic:
-; CHECK-NEXT: .functype cmpxchg_i32_release_monotonic (i32, i32, i32) -> (i32){{$}}
-; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @cmpxchg_i32_release_monotonic(ptr %p, i32 %exp, i32 %new) {
+; NO-ATOMICS-LABEL: cmpxchg_i32_release_monotonic:
+; NO-ATOMICS: .functype cmpxchg_i32_release_monotonic (i32, i32, i32) -> (i32)
+; NO-ATOMICS-NEXT: .local i32
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 2
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: local.tee 3
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.eq
+; NO-ATOMICS-NEXT: i32.select
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: cmpxchg_i32_release_monotonic:
+; ATOMICS: .functype cmpxchg_i32_release_monotonic (i32, i32, i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: local.get 2
+; ATOMICS-NEXT: i32.atomic.rmw.cmpxchg 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: cmpxchg_i32_release_monotonic:
+; RELAXED32: .functype cmpxchg_i32_release_monotonic (i32, i32, i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: local.get 2
+; RELAXED32-NEXT: i32.atomic.rmw.cmpxchg acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: cmpxchg_i32_release_monotonic:
+; RELAXED64: .functype cmpxchg_i32_release_monotonic (i64, i32, i32) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: local.get 2
+; RELAXED64-NEXT: i32.atomic.rmw.cmpxchg acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
%pair = cmpxchg ptr %p, i32 %exp, i32 %new release monotonic
%old = extractvalue { i32, i1 } %pair, 0
ret i32 %old
}
-; CHECK-LABEL: cmpxchg_i32_acq_rel_monotonic:
-; CHECK-NEXT: .functype cmpxchg_i32_acq_rel_monotonic (i32, i32, i32) -> (i32){{$}}
-; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @cmpxchg_i32_acq_rel_monotonic(ptr %p, i32 %exp, i32 %new) {
+; NO-ATOMICS-LABEL: cmpxchg_i32_acq_rel_monotonic:
+; NO-ATOMICS: .functype cmpxchg_i32_acq_rel_monotonic (i32, i32, i32) -> (i32)
+; NO-ATOMICS-NEXT: .local i32
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 2
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: local.tee 3
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.eq
+; NO-ATOMICS-NEXT: i32.select
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: cmpxchg_i32_acq_rel_monotonic:
+; ATOMICS: .functype cmpxchg_i32_acq_rel_monotonic (i32, i32, i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: local.get 2
+; ATOMICS-NEXT: i32.atomic.rmw.cmpxchg 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: cmpxchg_i32_acq_rel_monotonic:
+; RELAXED32: .functype cmpxchg_i32_acq_rel_monotonic (i32, i32, i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: local.get 2
+; RELAXED32-NEXT: i32.atomic.rmw.cmpxchg acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: cmpxchg_i32_acq_rel_monotonic:
+; RELAXED64: .functype cmpxchg_i32_acq_rel_monotonic (i64, i32, i32) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: local.get 2
+; RELAXED64-NEXT: i32.atomic.rmw.cmpxchg acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
%pair = cmpxchg ptr %p, i32 %exp, i32 %new acq_rel monotonic
%old = extractvalue { i32, i1 } %pair, 0
ret i32 %old
}
-; CHECK-LABEL: cmpxchg_i32_seq_cst_monotonic:
-; CHECK-NEXT: .functype cmpxchg_i32_seq_cst_monotonic (i32, i32, i32) -> (i32){{$}}
-; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @cmpxchg_i32_seq_cst_monotonic(ptr %p, i32 %exp, i32 %new) {
+; NO-ATOMICS-LABEL: cmpxchg_i32_seq_cst_monotonic:
+; NO-ATOMICS: .functype cmpxchg_i32_seq_cst_monotonic (i32, i32, i32) -> (i32)
+; NO-ATOMICS-NEXT: .local i32
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 2
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: local.tee 3
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.eq
+; NO-ATOMICS-NEXT: i32.select
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: cmpxchg_i32_seq_cst_monotonic:
+; ATOMICS: .functype cmpxchg_i32_seq_cst_monotonic (i32, i32, i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: local.get 2
+; ATOMICS-NEXT: i32.atomic.rmw.cmpxchg 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: cmpxchg_i32_seq_cst_monotonic:
+; RELAXED32: .functype cmpxchg_i32_seq_cst_monotonic (i32, i32, i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: local.get 2
+; RELAXED32-NEXT: i32.atomic.rmw.cmpxchg seqcst 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: cmpxchg_i32_seq_cst_monotonic:
+; RELAXED64: .functype cmpxchg_i32_seq_cst_monotonic (i64, i32, i32) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: local.get 2
+; RELAXED64-NEXT: i32.atomic.rmw.cmpxchg seqcst 0
+; RELAXED64-NEXT: # fallthrough-return
%pair = cmpxchg ptr %p, i32 %exp, i32 %new seq_cst monotonic
%old = extractvalue { i32, i1 } %pair, 0
ret i32 %old
}
-; CHECK-LABEL: cmpxchg_i32_acquire_acquire:
-; CHECK-NEXT: .functype cmpxchg_i32_acquire_acquire (i32, i32, i32) -> (i32){{$}}
-; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @cmpxchg_i32_acquire_acquire(ptr %p, i32 %exp, i32 %new) {
+; NO-ATOMICS-LABEL: cmpxchg_i32_acquire_acquire:
+; NO-ATOMICS: .functype cmpxchg_i32_acquire_acquire (i32, i32, i32) -> (i32)
+; NO-ATOMICS-NEXT: .local i32
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 2
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: local.tee 3
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.eq
+; NO-ATOMICS-NEXT: i32.select
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: cmpxchg_i32_acquire_acquire:
+; ATOMICS: .functype cmpxchg_i32_acquire_acquire (i32, i32, i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: local.get 2
+; ATOMICS-NEXT: i32.atomic.rmw.cmpxchg 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: cmpxchg_i32_acquire_acquire:
+; RELAXED32: .functype cmpxchg_i32_acquire_acquire (i32, i32, i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: local.get 2
+; RELAXED32-NEXT: i32.atomic.rmw.cmpxchg acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: cmpxchg_i32_acquire_acquire:
+; RELAXED64: .functype cmpxchg_i32_acquire_acquire (i64, i32, i32) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: local.get 2
+; RELAXED64-NEXT: i32.atomic.rmw.cmpxchg acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
%pair = cmpxchg ptr %p, i32 %exp, i32 %new acquire acquire
%old = extractvalue { i32, i1 } %pair, 0
ret i32 %old
}
-; CHECK-LABEL: cmpxchg_i32_release_acquire:
-; CHECK-NEXT: .functype cmpxchg_i32_release_acquire (i32, i32, i32) -> (i32){{$}}
-; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @cmpxchg_i32_release_acquire(ptr %p, i32 %exp, i32 %new) {
+; NO-ATOMICS-LABEL: cmpxchg_i32_release_acquire:
+; NO-ATOMICS: .functype cmpxchg_i32_release_acquire (i32, i32, i32) -> (i32)
+; NO-ATOMICS-NEXT: .local i32
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 2
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: local.tee 3
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.eq
+; NO-ATOMICS-NEXT: i32.select
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: cmpxchg_i32_release_acquire:
+; ATOMICS: .functype cmpxchg_i32_release_acquire (i32, i32, i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: local.get 2
+; ATOMICS-NEXT: i32.atomic.rmw.cmpxchg 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: cmpxchg_i32_release_acquire:
+; RELAXED32: .functype cmpxchg_i32_release_acquire (i32, i32, i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: local.get 2
+; RELAXED32-NEXT: i32.atomic.rmw.cmpxchg acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: cmpxchg_i32_release_acquire:
+; RELAXED64: .functype cmpxchg_i32_release_acquire (i64, i32, i32) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: local.get 2
+; RELAXED64-NEXT: i32.atomic.rmw.cmpxchg acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
%pair = cmpxchg ptr %p, i32 %exp, i32 %new release acquire
%old = extractvalue { i32, i1 } %pair, 0
ret i32 %old
}
-; CHECK-LABEL: cmpxchg_i32_acq_rel_acquire:
-; CHECK-NEXT: .functype cmpxchg_i32_acq_rel_acquire (i32, i32, i32) -> (i32){{$}}
-; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @cmpxchg_i32_acq_rel_acquire(ptr %p, i32 %exp, i32 %new) {
+; NO-ATOMICS-LABEL: cmpxchg_i32_acq_rel_acquire:
+; NO-ATOMICS: .functype cmpxchg_i32_acq_rel_acquire (i32, i32, i32) -> (i32)
+; NO-ATOMICS-NEXT: .local i32
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 2
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: local.tee 3
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.eq
+; NO-ATOMICS-NEXT: i32.select
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: cmpxchg_i32_acq_rel_acquire:
+; ATOMICS: .functype cmpxchg_i32_acq_rel_acquire (i32, i32, i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: local.get 2
+; ATOMICS-NEXT: i32.atomic.rmw.cmpxchg 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: cmpxchg_i32_acq_rel_acquire:
+; RELAXED32: .functype cmpxchg_i32_acq_rel_acquire (i32, i32, i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: local.get 2
+; RELAXED32-NEXT: i32.atomic.rmw.cmpxchg acqrel 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: cmpxchg_i32_acq_rel_acquire:
+; RELAXED64: .functype cmpxchg_i32_acq_rel_acquire (i64, i32, i32) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: local.get 2
+; RELAXED64-NEXT: i32.atomic.rmw.cmpxchg acqrel 0
+; RELAXED64-NEXT: # fallthrough-return
%pair = cmpxchg ptr %p, i32 %exp, i32 %new acq_rel acquire
%old = extractvalue { i32, i1 } %pair, 0
ret i32 %old
@@ -230,16 +915,98 @@ define i32 @cmpxchg_i32_acq_rel_acquire(ptr %p, i32 %exp, i32 %new) {
; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
; CHECK-NEXT: return $pop0{{$}}
define i32 @cmpxchg_i32_seq_cst_acquire(ptr %p, i32 %exp, i32 %new) {
+; NO-ATOMICS-LABEL: cmpxchg_i32_seq_cst_acquire:
+; NO-ATOMICS: .functype cmpxchg_i32_seq_cst_acquire (i32, i32, i32) -> (i32)
+; NO-ATOMICS-NEXT: .local i32
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 2
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: local.tee 3
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.eq
+; NO-ATOMICS-NEXT: i32.select
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: cmpxchg_i32_seq_cst_acquire:
+; ATOMICS: .functype cmpxchg_i32_seq_cst_acquire (i32, i32, i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: local.get 2
+; ATOMICS-NEXT: i32.atomic.rmw.cmpxchg 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: cmpxchg_i32_seq_cst_acquire:
+; RELAXED32: .functype cmpxchg_i32_seq_cst_acquire (i32, i32, i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: local.get 2
+; RELAXED32-NEXT: i32.atomic.rmw.cmpxchg seqcst 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: cmpxchg_i32_seq_cst_acquire:
+; RELAXED64: .functype cmpxchg_i32_seq_cst_acquire (i64, i32, i32) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: local.get 2
+; RELAXED64-NEXT: i32.atomic.rmw.cmpxchg seqcst 0
+; RELAXED64-NEXT: # fallthrough-return
%pair = cmpxchg ptr %p, i32 %exp, i32 %new seq_cst acquire
%old = extractvalue { i32, i1 } %pair, 0
ret i32 %old
}
-; CHECK-LABEL: cmpxchg_i32_seq_cst_seq_cst:
-; CHECK-NEXT: .functype cmpxchg_i32_seq_cst_seq_cst (i32, i32, i32) -> (i32){{$}}
-; CHECK: i32.atomic.rmw.cmpxchg $push0=, 0($0), $1, $2{{$}}
-; CHECK-NEXT: return $pop0{{$}}
define i32 @cmpxchg_i32_seq_cst_seq_cst(ptr %p, i32 %exp, i32 %new) {
+; NO-ATOMICS-LABEL: cmpxchg_i32_seq_cst_seq_cst:
+; NO-ATOMICS: .functype cmpxchg_i32_seq_cst_seq_cst (i32, i32, i32) -> (i32)
+; NO-ATOMICS-NEXT: .local i32
+; NO-ATOMICS-NEXT: # %bb.0:
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: local.get 2
+; NO-ATOMICS-NEXT: local.get 0
+; NO-ATOMICS-NEXT: i32.load 0
+; NO-ATOMICS-NEXT: local.tee 3
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: local.get 1
+; NO-ATOMICS-NEXT: i32.eq
+; NO-ATOMICS-NEXT: i32.select
+; NO-ATOMICS-NEXT: i32.store 0
+; NO-ATOMICS-NEXT: local.get 3
+; NO-ATOMICS-NEXT: # fallthrough-return
+;
+; ATOMICS-LABEL: cmpxchg_i32_seq_cst_seq_cst:
+; ATOMICS: .functype cmpxchg_i32_seq_cst_seq_cst (i32, i32, i32) -> (i32)
+; ATOMICS-NEXT: # %bb.0:
+; ATOMICS-NEXT: local.get 0
+; ATOMICS-NEXT: local.get 1
+; ATOMICS-NEXT: local.get 2
+; ATOMICS-NEXT: i32.atomic.rmw.cmpxchg 0
+; ATOMICS-NEXT: # fallthrough-return
+;
+; RELAXED32-LABEL: cmpxchg_i32_seq_cst_seq_cst:
+; RELAXED32: .functype cmpxchg_i32_seq_cst_seq_cst (i32, i32, i32) -> (i32)
+; RELAXED32-NEXT: # %bb.0:
+; RELAXED32-NEXT: local.get 0
+; RELAXED32-NEXT: local.get 1
+; RELAXED32-NEXT: local.get 2
+; RELAXED32-NEXT: i32.atomic.rmw.cmpxchg seqcst 0
+; RELAXED32-NEXT: # fallthrough-return
+;
+; RELAXED64-LABEL: cmpxchg_i32_seq_cst_seq_cst:
+; RELAXED64: .functype cmpxchg_i32_seq_cst_seq_cst (i64, i32, i32) -> (i32)
+; RELAXED64-NEXT: # %bb.0:
+; RELAXED64-NEXT: local.get 0
+; RELAXED64-NEXT: local.get 1
+; RELAXED64-NEXT: local.get 2
+; RELAXED64-NEXT: i32.atomic.rmw.cmpxchg seqcst 0
+; RELAXED64-NEXT: # fallthrough-return
%pair = cmpxchg ptr %p, i32 %exp, i32 %new seq_cst seq_cst
%old = extractvalue { i32, i1 } %pair, 0
ret i32 %old
diff --git a/llvm/test/CodeGen/WebAssembly/fast-isel.ll b/llvm/test/CodeGen/WebAssembly/fast-isel.ll
index 8f1761b3eb667..a05fe4874fb2e 100644
--- a/llvm/test/CodeGen/WebAssembly/fast-isel.ll
+++ b/llvm/test/CodeGen/WebAssembly/fast-isel.ll
@@ -1,6 +1,6 @@
; RUN: llc < %s -asm-verbose=false \
-; RUN: -fast-isel -fast-isel-abort=1 -verify-machineinstrs \
-; RUN: -wasm-disable-explicit-locals -wasm-keep-registers \
+; RUN: -fast-isel -fast-isel-abort=3 -verify-machineinstrs \
+; RUN: -wasm-disable-explicit-locals -wasm-keep-registers -O0 \
; RUN: | FileCheck %s
target triple = "wasm32-unknown-unknown"
@@ -76,3 +76,18 @@ bb:
%tmp2 = load i64, ptr %tmp, align 8
ret i64 %tmp2
}
+
+; CHECK-LABEL: br:
+; CHECK: br_if
+; CHECK: br
+; CHECK: f32.const $push{{[0-9]+}}=, 0x1.4p1{{$}}
+define float @br(i32 %a) {
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %block1, label %block2
+block1:
+ br label %block3
+block2:
+ br label %block3
+block3:
+ ret float 2.5
+}
diff --git a/llvm/test/CodeGen/WebAssembly/offset-atomics.ll b/llvm/test/CodeGen/WebAssembly/offset-atomics.ll
index 466b82e982c30..c1475955f83da 100644
--- a/llvm/test/CodeGen/WebAssembly/offset-atomics.ll
+++ b/llvm/test/CodeGen/WebAssembly/offset-atomics.ll
@@ -41,6 +41,26 @@ define i32 @load_i32_with_folded_gep_offset(ptr %p) {
ret i32 %t
}
+; Same for nusw.
+
+; CHECK-LABEL: load_i32_with_folded_gep_offset_nusw:
+; CHECK: i32.atomic.load $push0=, 24($0){{$}}
+define i32 @load_i32_with_folded_gep_offset_nusw(ptr %p) {
+ %s = getelementptr nusw i32, ptr %p, i32 6
+ %t = load atomic i32, ptr %s seq_cst, align 4
+ ret i32 %t
+}
+
+; For nuw we don't need the offset to be positive.
+
+; CHECK-LABEL: load_i32_with_folded_gep_offset_nuw:
+; CHECK: i32.atomic.load $push0=, -24($0){{$}}
+define i32 @load_i32_with_folded_gep_offset_nuw(ptr %p) {
+ %s = getelementptr nuw i32, ptr %p, i32 -6
+ %t = load atomic i32, ptr %s seq_cst, align 4
+ ret i32 %t
+}
+
; We can't fold a negative offset though, even with an inbounds gep.
; CHECK-LABEL: load_i32_with_unfolded_gep_negative_offset:
@@ -99,8 +119,19 @@ define i32 @load_i32_from_global_address() {
ret i32 %t
}
+define i32 @load_i32_global_with_folded_gep_offset_nonconst_nuw(i32 %idx) {
+; CHECK-LABEL: load_i32_global_with_folded_gep_offset_nonconst_nuw:
+; CHECK: i32.const $push0=, 2
+; CHECK: i32.shl $push1=, $0, $pop0
+; CHECK: i32.atomic.load $push2=, gv($pop1)
+ %s = getelementptr nuw i32, ptr @gv, i32 %idx
+ %t = load atomic i32, ptr %s seq_cst, align 4
+ ret i32 %t
+}
+
;===----------------------------------------------------------------------------
; Atomic loads: 64-bit
+
;===----------------------------------------------------------------------------
; Basic load.
diff --git a/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll b/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll
index c2a43a839b1e5..b39ce48bf0980 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-bitmask.ll
@@ -196,3 +196,123 @@ define i32 @bitmask_v32i8(<32 x i8> %v) {
%bitmask = bitcast <32 x i1> %cmp to i32
ret i32 %bitmask
}
+
+define i32 @manual_bitmask_i8x16(<16 x i8> %v) {
+; CHECK-LABEL: manual_bitmask_i8x16:
+; CHECK: .functype manual_bitmask_i8x16 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.bitmask
+; CHECK-NEXT: # fallthrough-return
+ %1 = icmp slt <16 x i8> %v, zeroinitializer
+ %2 = bitcast <16 x i1> %1 to i16
+ %3 = zext i16 %2 to i32
+ ret i32 %3
+}
+
+define i32 @manual_bitmask_i16x8(<8 x i16> %v) {
+; CHECK-LABEL: manual_bitmask_i16x8:
+; CHECK: .functype manual_bitmask_i16x8 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i16x8.bitmask
+; CHECK-NEXT: # fallthrough-return
+ %1 = icmp slt <8 x i16> %v, zeroinitializer
+ %2 = bitcast <8 x i1> %1 to i8
+ %3 = zext i8 %2 to i32
+ ret i32 %3
+}
+
+define i32 @manual_bitmask_i32x4(<4 x i32> %v) {
+; CHECK-LABEL: manual_bitmask_i32x4:
+; CHECK: .functype manual_bitmask_i32x4 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i32x4.bitmask
+; CHECK-NEXT: # fallthrough-return
+ %1 = icmp slt <4 x i32> %v, zeroinitializer
+ %2 = bitcast <4 x i1> %1 to i4
+ %3 = zext i4 %2 to i32
+ ret i32 %3
+}
+
+define i32 @manual_bitmask_i64x2(<2 x i64> %v) {
+; CHECK-LABEL: manual_bitmask_i64x2:
+; CHECK: .functype manual_bitmask_i64x2 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i64x2.bitmask
+; CHECK-NEXT: # fallthrough-return
+ %1 = icmp slt <2 x i64> %v, zeroinitializer
+ %2 = bitcast <2 x i1> %1 to i2
+ %3 = zext i2 %2 to i32
+ ret i32 %3
+}
+
+define i32 @manual_bitmask_v8i8(<8 x i8> %v) {
+; CHECK-LABEL: manual_bitmask_v8i8:
+; CHECK: .functype manual_bitmask_v8i8 (v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: v128.const 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT: i8x16.lt_s
+; CHECK-NEXT: i16x8.extend_low_i8x16_s
+; CHECK-NEXT: i16x8.bitmask
+; CHECK-NEXT: # fallthrough-return
+ %1 = icmp slt <8 x i8> %v, zeroinitializer
+ %2 = bitcast <8 x i1> %1 to i8
+ %3 = zext i8 %2 to i32
+ ret i32 %3
+}
+
+define i32 @manual_bitmask_v32i8(<32 x i8> %v) {
+; CHECK-LABEL: manual_bitmask_v32i8:
+; CHECK: .functype manual_bitmask_v32i8 (v128, v128) -> (i32)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32.const 16
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.bitmask
+; CHECK-NEXT: i32.const 16
+; CHECK-NEXT: i32.add
+; CHECK-NEXT: i32.shl
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.bitmask
+; CHECK-NEXT: i32.add
+; CHECK-NEXT: # fallthrough-return
+ %1 = icmp slt <32 x i8> %v, zeroinitializer
+ %2 = bitcast <32 x i1> %1 to i32
+ ret i32 %2
+}
+
+define i64 @manual_bitmask_v64i8(<64 x i8> %v) {
+; CHECK-LABEL: manual_bitmask_v64i8:
+; CHECK: .functype manual_bitmask_v64i8 (v128, v128, v128, v128) -> (i64)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i64.const 16
+; CHECK-NEXT: i64.const 16
+; CHECK-NEXT: i64.const 16
+; CHECK-NEXT: local.get 0
+; CHECK-NEXT: i8x16.bitmask
+; CHECK-NEXT: i64.extend_i32_u
+; CHECK-NEXT: i64.const 16
+; CHECK-NEXT: i64.add
+; CHECK-NEXT: i64.shl
+; CHECK-NEXT: local.get 1
+; CHECK-NEXT: i8x16.bitmask
+; CHECK-NEXT: i64.extend_i32_u
+; CHECK-NEXT: i64.add
+; CHECK-NEXT: i64.shl
+; CHECK-NEXT: local.get 2
+; CHECK-NEXT: i8x16.bitmask
+; CHECK-NEXT: i64.extend_i32_u
+; CHECK-NEXT: i64.add
+; CHECK-NEXT: i64.shl
+; CHECK-NEXT: local.get 3
+; CHECK-NEXT: i8x16.bitmask
+; CHECK-NEXT: i64.extend_i32_u
+; CHECK-NEXT: i64.add
+; CHECK-NEXT: # fallthrough-return
+ %1 = icmp slt <64 x i8> %v, zeroinitializer
+ %2 = bitcast <64 x i1> %1 to i64
+ ret i64 %2
+}
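For reference, the manual_bitmask_* tests above all reduce to the same scalar computation (assuming the standard WebAssembly SIMD semantics of the bitmask instructions): bit i of the result is the sign bit of lane i, which is exactly what the icmp slt + bitcast <N x i1> + zext pattern expresses. A minimal C sketch of that model, with an illustrative input, looks like:

#include <stdint.h>
#include <stdio.h>

/* Bit i of the result is the sign bit of lane i -- the value the
   icmp slt / bitcast / zext pattern in the tests computes and what
   i8x16.bitmask is expected to return. */
static uint32_t bitmask_i8x16_ref(const int8_t v[16]) {
  uint32_t mask = 0;
  for (int i = 0; i < 16; ++i)
    if (v[i] < 0)          /* lane sign bit set */
      mask |= 1u << i;     /* lane i -> bit i */
  return mask;
}

int main(void) {
  int8_t v[16] = {-1, 2, -3, 4, -5, 6, -7, 8,
                  9, -10, 11, -12, 13, -14, 15, -16};
  printf("0x%04x\n", bitmask_i8x16_ref(v)); /* prints 0xaa55 */
  return 0;
}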
diff --git a/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll b/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll
index 94aa197bfd564..25dc1efedfb95 100644
--- a/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll
+++ b/llvm/test/CodeGen/WebAssembly/wide-simd-mul.ll
@@ -195,3 +195,213 @@ define <8 x i32> @zext_sext_mul_v8i16(<8 x i16> %a, <8 x i16> %b) {
%mul = mul <8 x i32> %wide.a, %wide.b
ret <8 x i32> %mul
}
+
+define <4 x i32> @sext_mul_v8i16_with_symmetric_constant_vector(<8 x i16> %v) {
+; CHECK-LABEL: sext_mul_v8i16_with_symmetric_constant_vector:
+; CHECK: .functype sext_mul_v8i16_with_symmetric_constant_vector (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.const $push0=, 4096, 1, 4096, 1, 4096, 1, 4096, 1
+; CHECK-NEXT: i32x4.dot_i16x8_s $push1=, $0, $pop0
+; CHECK-NEXT: return $pop1
+ %sext = sext <8 x i16> %v to <8 x i32>
+ %1 = mul nsw <8 x i32> %sext, <i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add <4 x i32> %2, %3
+ ret <4 x i32> %4
+}
+
+define <4 x i32> @sext_mul_v8i16_with_symmetric_constant_vector_comm(<8 x i16> %v) {
+; CHECK-LABEL: sext_mul_v8i16_with_symmetric_constant_vector_comm:
+; CHECK: .functype sext_mul_v8i16_with_symmetric_constant_vector_comm (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.const $push0=, 4096, 1, 4096, 1, 4096, 1, 4096, 1
+; CHECK-NEXT: i32x4.dot_i16x8_s $push1=, $0, $pop0
+; CHECK-NEXT: return $pop1
+ %sext = sext <8 x i16> %v to <8 x i32>
+ %1 = mul nsw <8 x i32> <i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1>, %sext
+ %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add <4 x i32> %3, %2
+ ret <4 x i32> %4
+}
+
+define <4 x i32> @sext_mul_v8i16_with_constant(<8 x i16> %v) {
+; CHECK-LABEL: sext_mul_v8i16_with_constant:
+; CHECK: .functype sext_mul_v8i16_with_constant (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.const $push0=, 4096, 1, 4096, 1, 4096, 1, 4096, 1
+; CHECK-NEXT: i32x4.dot_i16x8_s $push1=, $0, $pop0
+; CHECK-NEXT: return $pop1
+ %sext = sext <8 x i16> %v to <8 x i32>
+ %1 = mul nsw <8 x i32> %sext, <i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add <4 x i32> %2, %3
+ ret <4 x i32> %4
+}
+
+define <4 x i32> @sext_mul_v8i16_with_constant_comm(<8 x i16> %v) {
+; CHECK-LABEL: sext_mul_v8i16_with_constant_comm:
+; CHECK: .functype sext_mul_v8i16_with_constant_comm (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: v128.const $push0=, 4096, 1, 4096, 1, 4096, 1, 4096, 1
+; CHECK-NEXT: i32x4.dot_i16x8_s $push1=, $0, $pop0
+; CHECK-NEXT: return $pop1
+ %sext = sext <8 x i16> %v to <8 x i32>
+ %1 = mul nsw <8 x i32> <i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1, i32 4096, i32 1>, %sext
+ %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add <4 x i32> %3, %2
+ ret <4 x i32> %4
+}
+
+; Negative - unsupported type
+define <4 x i32> @sext_mul_v8i1_pow2(<8 x i1> %v) {
+; CHECK-LABEL: sext_mul_v8i1_pow2:
+; CHECK: .functype sext_mul_v8i1_pow2 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32x4.extend_low_i16x8_u $push3=, $0
+; CHECK-NEXT: i32.const $push1=, 31
+; CHECK-NEXT: i32x4.shl $push4=, $pop3, $pop1
+; CHECK-NEXT: i32.const $push14=, 31
+; CHECK-NEXT: i32x4.shr_s $push13=, $pop4, $pop14
+; CHECK-NEXT: local.tee $push12=, $1=, $pop13
+; CHECK-NEXT: i32x4.extend_high_i16x8_u $push0=, $0
+; CHECK-NEXT: i32.const $push11=, 31
+; CHECK-NEXT: i32x4.shl $push2=, $pop0, $pop11
+; CHECK-NEXT: i32.const $push10=, 31
+; CHECK-NEXT: i32x4.shr_s $push9=, $pop2, $pop10
+; CHECK-NEXT: local.tee $push8=, $0=, $pop9
+; CHECK-NEXT: i8x16.shuffle $push6=, $pop12, $pop8, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: i8x16.shuffle $push5=, $1, $0, 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: i32x4.add $push7=, $pop6, $pop5
+; CHECK-NEXT: return $pop7
+ %sext = sext <8 x i1> %v to <8 x i32>
+ %1 = mul nsw <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, %sext
+ %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add <4 x i32> %3, %2
+ ret <4 x i32> %4
+}
+
+; Negative - unsupported type
+define <4 x i32> @sext_mul_v8i8_pow2(<8 x i8> %v) {
+; CHECK-LABEL: sext_mul_v8i8_pow2:
+; CHECK: .functype sext_mul_v8i8_pow2 (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i16x8.extend_low_i8x16_s $push17=, $0
+; CHECK-NEXT: i32x4.extend_low_i16x8_s $push18=, $pop17
+; CHECK-NEXT: v128.const $push19=, 0, 1, 2, 4
+; CHECK-NEXT: i32x4.mul $push28=, $pop18, $pop19
+; CHECK-NEXT: local.tee $push27=, $1=, $pop28
+; CHECK-NEXT: i8x16.shuffle $push0=, $0, $0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT: i16x8.extend_low_i8x16_s $push1=, $pop0
+; CHECK-NEXT: i32x4.extend_low_i16x8_s $push26=, $pop1
+; CHECK-NEXT: local.tee $push25=, $0=, $pop26
+; CHECK-NEXT: i32x4.extract_lane $push5=, $pop25, 0
+; CHECK-NEXT: i32.const $push6=, 3
+; CHECK-NEXT: i32.shl $push7=, $pop5, $pop6
+; CHECK-NEXT: i32x4.splat $push8=, $pop7
+; CHECK-NEXT: i32x4.extract_lane $push2=, $0, 1
+; CHECK-NEXT: i32.const $push3=, 4
+; CHECK-NEXT: i32.shl $push4=, $pop2, $pop3
+; CHECK-NEXT: i32x4.replace_lane $push9=, $pop8, 1, $pop4
+; CHECK-NEXT: i32x4.extract_lane $push10=, $0, 2
+; CHECK-NEXT: i32.const $push11=, 5
+; CHECK-NEXT: i32.shl $push12=, $pop10, $pop11
+; CHECK-NEXT: i32x4.replace_lane $push13=, $pop9, 2, $pop12
+; CHECK-NEXT: i32x4.extract_lane $push14=, $0, 3
+; CHECK-NEXT: i32.const $push15=, 6
+; CHECK-NEXT: i32.shl $push16=, $pop14, $pop15
+; CHECK-NEXT: i32x4.replace_lane $push24=, $pop13, 3, $pop16
+; CHECK-NEXT: local.tee $push23=, $0=, $pop24
+; CHECK-NEXT: i8x16.shuffle $push21=, $pop27, $pop23, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: i8x16.shuffle $push20=, $1, $0, 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: i32x4.add $push22=, $pop21, $pop20
+; CHECK-NEXT: return $pop22
+ %sext = sext <8 x i8> %v to <8 x i32>
+ %1 = mul nsw <8 x i32> <i32 0, i32 1, i32 2, i32 4, i32 8, i32 16, i32 32, i32 64>, %sext
+ %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add <4 x i32> %3, %2
+ ret <4 x i32> %4
+}
+
+; Negative - shifts by 15 overflow
+define <4 x i32> @combine_with_shl_signed_non_overflow(<8 x i16> %v) {
+; CHECK-LABEL: combine_with_shl_signed_non_overflow:
+; CHECK: .functype combine_with_shl_signed_non_overflow (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32x4.extend_low_i16x8_s $push24=, $0
+; CHECK-NEXT: local.tee $push23=, $1=, $pop24
+; CHECK-NEXT: i32x4.extract_lane $push8=, $1, 1
+; CHECK-NEXT: i32.const $push1=, 15
+; CHECK-NEXT: i32.shl $push9=, $pop8, $pop1
+; CHECK-NEXT: i32x4.replace_lane $push10=, $pop23, 1, $pop9
+; CHECK-NEXT: i32x4.extract_lane $push6=, $1, 3
+; CHECK-NEXT: i32.const $push22=, 15
+; CHECK-NEXT: i32.shl $push7=, $pop6, $pop22
+; CHECK-NEXT: i32x4.replace_lane $push21=, $pop10, 3, $pop7
+; CHECK-NEXT: local.tee $push20=, $1=, $pop21
+; CHECK-NEXT: i32x4.extend_high_i16x8_s $push19=, $0
+; CHECK-NEXT: local.tee $push18=, $0=, $pop19
+; CHECK-NEXT: i32x4.extract_lane $push3=, $0, 1
+; CHECK-NEXT: i32.const $push17=, 15
+; CHECK-NEXT: i32.shl $push4=, $pop3, $pop17
+; CHECK-NEXT: i32x4.replace_lane $push5=, $pop18, 1, $pop4
+; CHECK-NEXT: i32x4.extract_lane $push0=, $0, 3
+; CHECK-NEXT: i32.const $push16=, 15
+; CHECK-NEXT: i32.shl $push2=, $pop0, $pop16
+; CHECK-NEXT: i32x4.replace_lane $push15=, $pop5, 3, $pop2
+; CHECK-NEXT: local.tee $push14=, $0=, $pop15
+; CHECK-NEXT: i8x16.shuffle $push12=, $pop20, $pop14, 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: i8x16.shuffle $push11=, $1, $0, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: i32x4.add $push13=, $pop12, $pop11
+; CHECK-NEXT: return $pop13
+ %sext = sext <8 x i16> %v to <8 x i32>
+ %1 = mul <8 x i32> %sext, <i32 1, i32 32768, i32 1, i32 32768, i32 1, i32 32768, i32 1, i32 32768>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add <4 x i32> %2, %3
+ ret <4 x i32> %4
+}
+
+; Negative - shifts by 16 overflow
+define <4 x i32> @combine_with_shl_unsigned_non_overflow(<8 x i16> %v) {
+; CHECK-LABEL: combine_with_shl_unsigned_non_overflow:
+; CHECK: .functype combine_with_shl_unsigned_non_overflow (v128) -> (v128)
+; CHECK-NEXT: # %bb.0:
+; CHECK-NEXT: i32x4.extend_low_i16x8_u $push24=, $0
+; CHECK-NEXT: local.tee $push23=, $1=, $pop24
+; CHECK-NEXT: i32x4.extract_lane $push8=, $1, 1
+; CHECK-NEXT: i32.const $push1=, 16
+; CHECK-NEXT: i32.shl $push9=, $pop8, $pop1
+; CHECK-NEXT: i32x4.replace_lane $push10=, $pop23, 1, $pop9
+; CHECK-NEXT: i32x4.extract_lane $push6=, $1, 3
+; CHECK-NEXT: i32.const $push22=, 16
+; CHECK-NEXT: i32.shl $push7=, $pop6, $pop22
+; CHECK-NEXT: i32x4.replace_lane $push21=, $pop10, 3, $pop7
+; CHECK-NEXT: local.tee $push20=, $1=, $pop21
+; CHECK-NEXT: i32x4.extend_high_i16x8_u $push19=, $0
+; CHECK-NEXT: local.tee $push18=, $0=, $pop19
+; CHECK-NEXT: i32x4.extract_lane $push3=, $0, 1
+; CHECK-NEXT: i32.const $push17=, 16
+; CHECK-NEXT: i32.shl $push4=, $pop3, $pop17
+; CHECK-NEXT: i32x4.replace_lane $push5=, $pop18, 1, $pop4
+; CHECK-NEXT: i32x4.extract_lane $push0=, $0, 3
+; CHECK-NEXT: i32.const $push16=, 16
+; CHECK-NEXT: i32.shl $push2=, $pop0, $pop16
+; CHECK-NEXT: i32x4.replace_lane $push15=, $pop5, 3, $pop2
+; CHECK-NEXT: local.tee $push14=, $0=, $pop15
+; CHECK-NEXT: i8x16.shuffle $push12=, $pop20, $pop14, 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27
+; CHECK-NEXT: i8x16.shuffle $push11=, $1, $0, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+; CHECK-NEXT: v128.or $push13=, $pop12, $pop11
+; CHECK-NEXT: return $pop13
+ %zext = zext <8 x i16> %v to <8 x i32>
+ %1 = mul <8 x i32> %zext, <i32 1, i32 65536, i32 1, i32 65536, i32 1, i32 65536, i32 1, i32 65536>
+ %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+ %3 = shufflevector <8 x i32> %1, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+ %4 = add <4 x i32> %2, %3
+ ret <4 x i32> %4
+}
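The positive sext_mul_* tests above fold to i32x4.dot_i16x8_s, whose lanes are pairwise sums of sign-extended 16-bit products; the sext/mul/even-odd-shuffle/add sequence in the IR computes the same value. A rough scalar model (assuming the standard i32x4.dot_i16x8_s semantics, with the constant vector taken from the tests):

#include <stdint.h>
#include <stdio.h>

/* Lane i of the result is a[2i]*b[2i] + a[2i+1]*b[2i+1], with the i16
   inputs sign-extended to i32 before the multiply. */
static void dot_i16x8_s_ref(const int16_t a[8], const int16_t b[8],
                            int32_t out[4]) {
  for (int i = 0; i < 4; ++i)
    out[i] = (int32_t)a[2 * i] * b[2 * i] +
             (int32_t)a[2 * i + 1] * b[2 * i + 1];
}

int main(void) {
  int16_t a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  int16_t b[8] = {4096, 1, 4096, 1, 4096, 1, 4096, 1};
  int32_t out[4];
  dot_i16x8_s_ref(a, b, out);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 4098 12292 20486 28680 */
  return 0;
}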
diff --git a/llvm/test/CodeGen/X86/GlobalISel/constant.ll b/llvm/test/CodeGen/X86/GlobalISel/constant.ll
index 0ef0270cf3dfc..ac80b546cb330 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/constant.ll
+++ b/llvm/test/CodeGen/X86/GlobalISel/constant.ll
@@ -42,6 +42,15 @@ define i64 @const_i64_u32() {
ret i64 1879048192
}
+;i64 value fits into u32 but not i32
+define i64 @const_i64_u32_max() {
+; CHECK-LABEL: const_i64_u32_max:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl $4294967295, %eax # imm = 0xFFFFFFFF
+; CHECK-NEXT: retq
+ ret i64 4294967295
+}
+
;i64 value fits into i32
define i64 @const_i64_i32() {
; CHECK-LABEL: const_i64_i32:
@@ -54,7 +63,7 @@ define i64 @const_i64_i32() {
define void @main(ptr %data) {
; CHECK-LABEL: main:
; CHECK: # %bb.0:
-; CHECK-NEXT: movq $0, %rax
+; CHECK-NEXT: movl $0, %eax
; CHECK-NEXT: movq %rax, (%rdi)
; CHECK-NEXT: retq
store ptr null, ptr %data, align 8
diff --git a/llvm/test/CodeGen/X86/GlobalISel/select-constant.mir b/llvm/test/CodeGen/X86/GlobalISel/select-constant.mir
index eef6ba61e5591..adb80c93952f5 100644
--- a/llvm/test/CodeGen/X86/GlobalISel/select-constant.mir
+++ b/llvm/test/CodeGen/X86/GlobalISel/select-constant.mir
@@ -26,6 +26,10 @@
ret i64 1879048192
}
+ define i64 @const_i64_u32_max() {
+ ret i64 4294967295
+ }
+
define i64 @const_i64_i32() {
ret i64 -1
}
@@ -143,6 +147,24 @@ body: |
$rax = COPY %0(s64)
RET 0, implicit $rax
+...
+---
+name: const_i64_u32_max
+legalized: true
+regBankSelected: true
+selected: false
+registers:
+ - { id: 0, class: gpr }
+body: |
+ bb.1 (%ir-block.0):
+ ; CHECK-LABEL: name: const_i64_u32_max
+ ; CHECK: [[MOV32ri64_:%[0-9]+]]:gr64 = MOV32ri64 4294967295
+ ; CHECK-NEXT: $rax = COPY [[MOV32ri64_]]
+ ; CHECK-NEXT: RET 0, implicit $rax
+ %0(s64) = G_CONSTANT i64 4294967295
+ $rax = COPY %0(s64)
+ RET 0, implicit $rax
+
...
---
name: const_i64_i32
@@ -178,8 +200,8 @@ body: |
; CHECK: liveins: $rdi
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rdi
- ; CHECK-NEXT: [[MOV64ri32_:%[0-9]+]]:gr64 = MOV64ri32 0
- ; CHECK-NEXT: MOV64mr [[COPY]], 1, $noreg, 0, $noreg, [[MOV64ri32_]] :: (store (p0) into %ir.data)
+ ; CHECK-NEXT: [[MOV32ri64_:%[0-9]+]]:gr64 = MOV32ri64 0
+ ; CHECK-NEXT: MOV64mr [[COPY]], 1, $noreg, 0, $noreg, [[MOV32ri64_]] :: (store (p0) into %ir.data)
; CHECK-NEXT: RET 0
%0(p0) = COPY $rdi
%1(p0) = G_CONSTANT i64 0
diff --git a/llvm/test/CodeGen/X86/apx/adc.ll b/llvm/test/CodeGen/X86/apx/adc.ll
index ec9800ddc69ae..5a38a3d413a0a 100644
--- a/llvm/test/CodeGen/X86/apx/adc.ll
+++ b/llvm/test/CodeGen/X86/apx/adc.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM
define i8 @adc8rr(i8 %a, i8 %b, i8 %x, i8 %y) nounwind {
; CHECK-LABEL: adc8rr:
@@ -54,11 +57,19 @@ define i64 @adc64rr(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
}
define i8 @adc8rm(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: adc8rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1]
-; CHECK-NEXT: adcb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x12,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: adc8rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movl %edi, %eax # encoding: [0x89,0xf8]
+; NDD-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1]
+; NDD-NEXT: {evex} adcb (%rsi), %al # encoding: [0x62,0xf4,0x7c,0x08,0x12,0x06]
+; NDD-NEXT: # kill: def $al killed $al killed $eax
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: adc8rm:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1]
+; MEM-NEXT: adcb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x12,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
%b = load i8, ptr %ptr
%s = add i8 %a, %b
%k = icmp ugt i8 %x, %y
@@ -68,11 +79,19 @@ define i8 @adc8rm(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind {
}
define i16 @adc16rm(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: adc16rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1]
-; CHECK-NEXT: adcw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x13,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: adc16rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movl %edi, %eax # encoding: [0x89,0xf8]
+; NDD-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1]
+; NDD-NEXT: {evex} adcw (%rsi), %ax # encoding: [0x62,0xf4,0x7d,0x08,0x13,0x06]
+; NDD-NEXT: # kill: def $ax killed $ax killed $eax
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: adc16rm:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1]
+; MEM-NEXT: adcw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x13,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
%b = load i16, ptr %ptr
%s = add i16 %a, %b
%k = icmp ugt i16 %x, %y
@@ -82,11 +101,18 @@ define i16 @adc16rm(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind {
}
define i32 @adc32rm(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind {
-; CHECK-LABEL: adc32rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1]
-; CHECK-NEXT: adcl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x13,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: adc32rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movl %edi, %eax # encoding: [0x89,0xf8]
+; NDD-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1]
+; NDD-NEXT: {evex} adcl (%rsi), %eax # encoding: [0x62,0xf4,0x7c,0x08,0x13,0x06]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: adc32rm:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1]
+; MEM-NEXT: adcl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x13,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
%b = load i32, ptr %ptr
%s = add i32 %a, %b
%k = icmp ugt i32 %x, %y
@@ -96,11 +122,18 @@ define i32 @adc32rm(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind {
}
define i64 @adc64rm(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: adc64rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1]
-; CHECK-NEXT: adcq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x13,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: adc64rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movq %rdi, %rax # encoding: [0x48,0x89,0xf8]
+; NDD-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1]
+; NDD-NEXT: {evex} adcq (%rsi), %rax # encoding: [0x62,0xf4,0xfc,0x08,0x13,0x06]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: adc64rm:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1]
+; MEM-NEXT: adcq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x13,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
%b = load i64, ptr %ptr
%s = add i64 %a, %b
%k = icmp ugt i64 %x, %y
@@ -204,11 +237,18 @@ define i64 @adc64ri(i64 %a, i64 %x, i64 %y) nounwind {
}
define i8 @adc8mr(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: adc8mr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1]
-; CHECK-NEXT: adcb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x12,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: adc8mr:
+; NDD: # %bb.0:
+; NDD-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; NDD-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1]
+; NDD-NEXT: adcb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x10,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: adc8mr:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1]
+; MEM-NEXT: adcb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x12,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
%b = load i8, ptr %ptr
%s = add i8 %b, %a
%k = icmp ugt i8 %x, %y
@@ -218,11 +258,18 @@ define i8 @adc8mr(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind {
}
define i16 @adc16mr(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: adc16mr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1]
-; CHECK-NEXT: adcw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x13,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: adc16mr:
+; NDD: # %bb.0:
+; NDD-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; NDD-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1]
+; NDD-NEXT: adcw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x11,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: adc16mr:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1]
+; MEM-NEXT: adcw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x13,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
%b = load i16, ptr %ptr
%s = add i16 %b, %a
%k = icmp ugt i16 %x, %y
@@ -232,11 +279,18 @@ define i16 @adc16mr(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind {
}
define i32 @adc32mr(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind {
-; CHECK-LABEL: adc32mr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1]
-; CHECK-NEXT: adcl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x13,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: adc32mr:
+; NDD: # %bb.0:
+; NDD-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NDD-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1]
+; NDD-NEXT: adcl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x11,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: adc32mr:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1]
+; MEM-NEXT: adcl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x13,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
%b = load i32, ptr %ptr
%s = add i32 %b, %a
%k = icmp ugt i32 %x, %y
@@ -246,11 +300,18 @@ define i32 @adc32mr(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind {
}
define i64 @adc64mr(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: adc64mr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1]
-; CHECK-NEXT: adcq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x13,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: adc64mr:
+; NDD: # %bb.0:
+; NDD-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; NDD-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1]
+; NDD-NEXT: adcq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x11,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: adc64mr:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1]
+; MEM-NEXT: adcq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x13,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
%b = load i64, ptr %ptr
%s = add i64 %b, %a
%k = icmp ugt i64 %x, %y
@@ -260,11 +321,18 @@ define i64 @adc64mr(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind {
}
define i16 @adc16mi8(ptr %ptr, i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: adc16mi8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2]
-; CHECK-NEXT: adcw $123, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0x17,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: adc16mi8:
+; NDD: # %bb.0:
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2]
+; NDD-NEXT: adcw $123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xd0,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: adc16mi8:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2]
+; MEM-NEXT: adcw $123, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0x17,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
%a = load i16, ptr %ptr
%s = add i16 %a, 123
%k = icmp ugt i16 %x, %y
@@ -274,11 +342,18 @@ define i16 @adc16mi8(ptr %ptr, i16 %x, i16 %y) nounwind {
}
define i32 @adc32mi8(ptr %ptr, i32 %x, i32 %y) nounwind {
-; CHECK-LABEL: adc32mi8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2]
-; CHECK-NEXT: adcl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x17,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: adc32mi8:
+; NDD: # %bb.0:
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2]
+; NDD-NEXT: adcl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xd0,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: adc32mi8:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2]
+; MEM-NEXT: adcl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x17,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
%a = load i32, ptr %ptr
%s = add i32 %a, 123
%k = icmp ugt i32 %x, %y
@@ -288,11 +363,18 @@ define i32 @adc32mi8(ptr %ptr, i32 %x, i32 %y) nounwind {
}
define i64 @adc64mi8(ptr %ptr, i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: adc64mi8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
-; CHECK-NEXT: adcq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x17,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: adc64mi8:
+; NDD: # %bb.0:
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; NDD-NEXT: adcq $123, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xd0,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: adc64mi8:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; MEM-NEXT: adcq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x17,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
%a = load i64, ptr %ptr
%s = add i64 %a, 123
%k = icmp ugt i64 %x, %y
@@ -302,11 +384,18 @@ define i64 @adc64mi8(ptr %ptr, i64 %x, i64 %y) nounwind {
}
define i8 @adc8mi(ptr %ptr, i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: adc8mi:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpb %sil, %dl # encoding: [0x40,0x38,0xf2]
-; CHECK-NEXT: adcb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x17,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: adc8mi:
+; NDD: # %bb.0:
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: cmpb %sil, %dl # encoding: [0x40,0x38,0xf2]
+; NDD-NEXT: adcb $123, %al # EVEX TO LEGACY Compression encoding: [0x14,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: adc8mi:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpb %sil, %dl # encoding: [0x40,0x38,0xf2]
+; MEM-NEXT: adcb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x17,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
%a = load i8, ptr %ptr
%s = add i8 %a, 123
%k = icmp ugt i8 %x, %y
@@ -316,12 +405,20 @@ define i8 @adc8mi(ptr %ptr, i8 %x, i8 %y) nounwind {
}
define i16 @adc16mi(ptr %ptr, i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: adc16mi:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2]
-; CHECK-NEXT: adcw $1234, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x17,0xd2,0x04]
-; CHECK-NEXT: # imm = 0x4D2
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: adc16mi:
+; NDD: # %bb.0:
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2]
+; NDD-NEXT: adcw $1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x15,0xd2,0x04]
+; NDD-NEXT: # imm = 0x4D2
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: adc16mi:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2]
+; MEM-NEXT: adcw $1234, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x17,0xd2,0x04]
+; MEM-NEXT: # imm = 0x4D2
+; MEM-NEXT: retq # encoding: [0xc3]
%a = load i16, ptr %ptr
%s = add i16 %a, 1234
%k = icmp ugt i16 %x, %y
@@ -331,12 +428,20 @@ define i16 @adc16mi(ptr %ptr, i16 %x, i16 %y) nounwind {
}
define i32 @adc32mi(ptr %ptr, i32 %x, i32 %y) nounwind {
-; CHECK-LABEL: adc32mi:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2]
-; CHECK-NEXT: adcl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x17,0x40,0xe2,0x01,0x00]
-; CHECK-NEXT: # imm = 0x1E240
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: adc32mi:
+; NDD: # %bb.0:
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2]
+; NDD-NEXT: adcl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x15,0x40,0xe2,0x01,0x00]
+; NDD-NEXT: # imm = 0x1E240
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: adc32mi:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2]
+; MEM-NEXT: adcl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x17,0x40,0xe2,0x01,0x00]
+; MEM-NEXT: # imm = 0x1E240
+; MEM-NEXT: retq # encoding: [0xc3]
%a = load i32, ptr %ptr
%s = add i32 %a, 123456
%k = icmp ugt i32 %x, %y
@@ -346,12 +451,20 @@ define i32 @adc32mi(ptr %ptr, i32 %x, i32 %y) nounwind {
}
define i64 @adc64mi(ptr %ptr, i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: adc64mi:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
-; CHECK-NEXT: adcq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x17,0x40,0xe2,0x01,0x00]
-; CHECK-NEXT: # imm = 0x1E240
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: adc64mi:
+; NDD: # %bb.0:
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; NDD-NEXT: adcq $123456, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x15,0x40,0xe2,0x01,0x00]
+; NDD-NEXT: # imm = 0x1E240
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: adc64mi:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; MEM-NEXT: adcq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x17,0x40,0xe2,0x01,0x00]
+; MEM-NEXT: # imm = 0x1E240
+; MEM-NEXT: retq # encoding: [0xc3]
%a = load i64, ptr %ptr
%s = add i64 %a, 123456
%k = icmp ugt i64 %x, %y
diff --git a/llvm/test/CodeGen/X86/apx/add.ll b/llvm/test/CodeGen/X86/apx/add.ll
index 4ab0edfba7ce8..ffb2535e9f17b 100644
--- a/llvm/test/CodeGen/X86/apx/add.ll
+++ b/llvm/test/CodeGen/X86/apx/add.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,IMM,IMMONLY
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM,MEMONLY
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,IMM,MEM,BOTH
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -x86-enable-apx-for-relocation=true -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
@@ -64,14 +67,27 @@ entry:
}
define i8 @add8rm(i8 noundef %a, ptr %ptr) {
-; CHECK-LABEL: add8rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x02,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: add8rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; NDD-NEXT: addb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x00,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: add8rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; IMMONLY-NEXT: addb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x00,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: add8rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x02,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: add8rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x02,0x3e]
+; NF-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; NF-NEXT: addb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x00,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%b = load i8, ptr %ptr
@@ -80,14 +96,27 @@ entry:
}
define i16 @add16rm(i16 noundef %a, ptr %ptr) {
-; CHECK-LABEL: add16rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x03,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: add16rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; NDD-NEXT: addw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: add16rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; IMMONLY-NEXT: addw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: add16rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x03,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: add16rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x03,0x3e]
+; NF-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; NF-NEXT: addw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%b = load i16, ptr %ptr
@@ -96,14 +125,27 @@ entry:
}
define i32 @add32rm(i32 noundef %a, ptr %ptr) {
-; CHECK-LABEL: add32rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x03,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: add32rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NDD-NEXT: addl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: add32rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; IMMONLY-NEXT: addl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: add32rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x03,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: add32rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x03,0x3e]
+; NF-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NF-NEXT: addl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%b = load i32, ptr %ptr
@@ -112,14 +154,27 @@ entry:
}
define i64 @add64rm(i64 noundef %a, ptr %ptr) {
-; CHECK-LABEL: add64rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x03,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: add64rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; NDD-NEXT: addq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: add64rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; IMMONLY-NEXT: addq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: add64rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x03,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: add64rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x03,0x3e]
+; NF-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; NF-NEXT: addq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%b = load i64, ptr %ptr
@@ -235,14 +290,27 @@ entry:
}
define i8 @add8mr(ptr %a, i8 noundef %b) {
-; CHECK-LABEL: add8mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addb (%rdi), %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x02,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: add8mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: addb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x00,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: add8mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; IMMONLY-NEXT: addb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x00,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: add8mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addb (%rdi), %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x02,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: add8mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addb (%rdi), %sil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x02,0x37]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT: addb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x00,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i8, ptr %a
@@ -251,14 +319,27 @@ entry:
}
define i16 @add16mr(ptr %a, i16 noundef %b) {
-; CHECK-LABEL: add16mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addw (%rdi), %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x03,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: add16mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: addw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: add16mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: addw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: add16mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addw (%rdi), %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x03,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: add16mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addw (%rdi), %si, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x03,0x37]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: addw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i16, ptr %a
@@ -267,14 +348,27 @@ entry:
}
define i32 @add32mr(ptr %a, i32 noundef %b) {
-; CHECK-LABEL: add32mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addl (%rdi), %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x03,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: add32mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: addl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: add32mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: addl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: add32mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addl (%rdi), %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x03,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: add32mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addl (%rdi), %esi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x03,0x37]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: addl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i32, ptr %a
@@ -283,14 +377,27 @@ entry:
}
define i64 @add64mr(ptr %a, i64 noundef %b) {
-; CHECK-LABEL: add64mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addq (%rdi), %rsi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x03,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: add64mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: addq %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: add64mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: addq %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: add64mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addq (%rdi), %rsi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x03,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: add64mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addq (%rdi), %rsi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x03,0x37]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: addq %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i64, ptr %a
@@ -299,14 +406,27 @@ entry:
}
define i16 @add16mi8(ptr %a) {
-; CHECK-LABEL: add16mi8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addw $123, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0x07,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: add16mi8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: addw $123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xc0,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: add16mi8:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: addw $123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xc0,0x7b]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: add16mi8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addw $123, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0x07,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: add16mi8:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addw $123, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x83,0x07,0x7b]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: addw $123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xc0,0x7b]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i16, ptr %a
@@ -315,14 +435,27 @@ entry:
}
define i32 @add32mi8(ptr %a) {
-; CHECK-LABEL: add32mi8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x07,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: add32mi8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: addl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: add32mi8:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: addl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x7b]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: add32mi8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x07,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: add32mi8:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addl $123, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x07,0x7b]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: addl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x7b]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i32, ptr %a
@@ -331,14 +464,33 @@ entry:
}
define i64 @add64mi8(ptr %a) {
-; CHECK-LABEL: add64mi8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x07,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: add64mi8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: addq $123, %rax # encoding: [0x48,0x83,0xc0,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: add64mi8:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: addq $123, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xc0,0x7b]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: add64mi8:
+; MEMONLY: # %bb.0: # %entry
+; MEMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; MEMONLY-NEXT: addq $123, %rax # encoding: [0x48,0x83,0xc0,0x7b]
+; MEMONLY-NEXT: retq # encoding: [0xc3]
+;
+; BOTH-LABEL: add64mi8:
+; BOTH: # %bb.0: # %entry
+; BOTH-NEXT: addq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x07,0x7b]
+; BOTH-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: add64mi8:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addq $123, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0x07,0x7b]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: addq $123, %rax # encoding: [0x48,0x83,0xc0,0x7b]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i64, ptr %a
@@ -347,14 +499,27 @@ entry:
}
define i8 @add8mi(ptr %a) {
-; CHECK-LABEL: add8mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x07,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: add8mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: addb $123, %al # EVEX TO LEGACY Compression encoding: [0x04,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: add8mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; IMMONLY-NEXT: addb $123, %al # EVEX TO LEGACY Compression encoding: [0x04,0x7b]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: add8mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x07,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: add8mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addb $123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x07,0x7b]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT: addb $123, %al # EVEX TO LEGACY Compression encoding: [0x04,0x7b]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i8, ptr %a
@@ -363,15 +528,30 @@ entry:
}
define i16 @add16mi(ptr %a) {
-; CHECK-LABEL: add16mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addw $1234, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x07,0xd2,0x04]
-; CHECK-NEXT: # imm = 0x4D2
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: add16mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: addw $1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x05,0xd2,0x04]
+; NDD-NEXT: # imm = 0x4D2
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: add16mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: addw $1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x05,0xd2,0x04]
+; IMMONLY-NEXT: # imm = 0x4D2
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: add16mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addw $1234, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x07,0xd2,0x04]
+; MEM-NEXT: # imm = 0x4D2
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: add16mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addw $1234, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x81,0x07,0xd2,0x04]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: addw $1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x05,0xd2,0x04]
; NF-NEXT: # imm = 0x4D2
; NF-NEXT: retq # encoding: [0xc3]
entry:
@@ -381,15 +561,30 @@ entry:
}
define i32 @add32mi(ptr %a) {
-; CHECK-LABEL: add32mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x07,0x40,0xe2,0x01,0x00]
-; CHECK-NEXT: # imm = 0x1E240
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: add32mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: addl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x05,0x40,0xe2,0x01,0x00]
+; NDD-NEXT: # imm = 0x1E240
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: add32mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: addl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x05,0x40,0xe2,0x01,0x00]
+; IMMONLY-NEXT: # imm = 0x1E240
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: add32mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x07,0x40,0xe2,0x01,0x00]
+; MEM-NEXT: # imm = 0x1E240
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: add32mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addl $123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x07,0x40,0xe2,0x01,0x00]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: addl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x05,0x40,0xe2,0x01,0x00]
; NF-NEXT: # imm = 0x1E240
; NF-NEXT: retq # encoding: [0xc3]
entry:
@@ -399,15 +594,30 @@ entry:
}
define i64 @add64mi(ptr %a) {
-; CHECK-LABEL: add64mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x07,0x40,0xe2,0x01,0x00]
-; CHECK-NEXT: # imm = 0x1E240
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: add64mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: addq $123456, %rax # encoding: [0x48,0x05,0x40,0xe2,0x01,0x00]
+; NDD-NEXT: # imm = 0x1E240
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: add64mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: addq $123456, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x05,0x40,0xe2,0x01,0x00]
+; IMMONLY-NEXT: # imm = 0x1E240
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: add64mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x07,0x40,0xe2,0x01,0x00]
+; MEM-NEXT: # imm = 0x1E240
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: add64mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addq $123456, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x07,0x40,0xe2,0x01,0x00]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: addq $123456, %rax # encoding: [0x48,0x05,0x40,0xe2,0x01,0x00]
; NF-NEXT: # imm = 0x1E240
; NF-NEXT: retq # encoding: [0xc3]
entry:
@@ -506,18 +716,39 @@ entry:
}
define i8 @addflag8rm(i8 noundef %a, ptr %b) {
-; CHECK-LABEL: addflag8rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x02,0x3e]
-; CHECK-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
-; CHECK-NEXT: movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
-; CHECK-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: addflag8rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; NDD-NEXT: addb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x00,0xf8]
+; NDD-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; NDD-NEXT: movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
+; NDD-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NDD-NEXT: # kill: def $al killed $al killed $eax
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: addflag8rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; IMMONLY-NEXT: addb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x00,0xf8]
+; IMMONLY-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; IMMONLY-NEXT: movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
+; IMMONLY-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; IMMONLY-NEXT: # kill: def $al killed $al killed $eax
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: addflag8rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x02,0x3e]
+; MEM-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; MEM-NEXT: movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
+; MEM-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; MEM-NEXT: # kill: def $al killed $al killed $eax
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: addflag8rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: addb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x02,0x3e]
+; NF-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; NF-NEXT: addb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x00,0xf8]
; NF-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
; NF-NEXT: movl $255, %ecx # encoding: [0xb9,0xff,0x00,0x00,0x00]
; NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
@@ -530,18 +761,39 @@ entry:
}
define i16 @addflag16rm(i16 noundef %a, ptr %b) {
-; CHECK-LABEL: addflag16rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x03,0x3e]
-; CHECK-NEXT: movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
-; CHECK-NEXT: # imm = 0xFFFF
-; CHECK-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
-; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: addflag16rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; NDD-NEXT: addw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xf8]
+; NDD-NEXT: movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
+; NDD-NEXT: # imm = 0xFFFF
+; NDD-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NDD-NEXT: # kill: def $ax killed $ax killed $eax
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: addflag16rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; IMMONLY-NEXT: addw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xf8]
+; IMMONLY-NEXT: movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
+; IMMONLY-NEXT: # imm = 0xFFFF
+; IMMONLY-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; IMMONLY-NEXT: # kill: def $ax killed $ax killed $eax
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: addflag16rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x03,0x3e]
+; MEM-NEXT: movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
+; MEM-NEXT: # imm = 0xFFFF
+; MEM-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; MEM-NEXT: # kill: def $ax killed $ax killed $eax
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: addflag16rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: addw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x03,0x3e]
+; NF-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; NF-NEXT: addw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xf8]
; NF-NEXT: movl $65535, %ecx # encoding: [0xb9,0xff,0xff,0x00,0x00]
; NF-NEXT: # imm = 0xFFFF
; NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
@@ -554,16 +806,33 @@ entry:
}
define i32 @addflag32rm(i32 noundef %a, ptr %b) {
-; CHECK-LABEL: addflag32rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x03,0x3e]
-; CHECK-NEXT: movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
-; CHECK-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: addflag32rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NDD-NEXT: addl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xf8]
+; NDD-NEXT: movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; NDD-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: addflag32rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; IMMONLY-NEXT: addl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xf8]
+; IMMONLY-NEXT: movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; IMMONLY-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: addflag32rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x03,0x3e]
+; MEM-NEXT: movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
+; MEM-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: addflag32rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: addl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x03,0x3e]
+; NF-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NF-NEXT: addl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xf8]
; NF-NEXT: movl $-1, %ecx # encoding: [0xb9,0xff,0xff,0xff,0xff]
; NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
; NF-NEXT: retq # encoding: [0xc3]
@@ -574,16 +843,33 @@ entry:
}
define i64 @addflag64rm(i64 noundef %a, ptr %b) {
-; CHECK-LABEL: addflag64rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x03,0x3e]
-; CHECK-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
-; CHECK-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: addflag64rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; NDD-NEXT: addq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xf8]
+; NDD-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; NDD-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: addflag64rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; IMMONLY-NEXT: addq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xf8]
+; IMMONLY-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; IMMONLY-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: addflag64rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x03,0x3e]
+; MEM-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; MEM-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: addflag64rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: addq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x03,0x3e]
+; NF-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; NF-NEXT: addq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xf8]
; NF-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
; NF-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
; NF-NEXT: retq # encoding: [0xc3]
@@ -636,18 +922,32 @@ entry:
}
define i64 @addflag64ri8(i64 noundef %a) {
-; CHECK-LABEL: addflag64ri8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xc7,0x7b]
-; CHECK-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
-; CHECK-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: addflag64ri8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: addq $123, %rdi # encoding: [0x48,0x83,0xc7,0x7b]
+; NDD-NEXT: movq $-1, %rax # encoding: [0x48,0xc7,0xc0,0xff,0xff,0xff,0xff]
+; NDD-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMM-LABEL: addflag64ri8:
+; IMM: # %bb.0: # %entry
+; IMM-NEXT: addq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xc7,0x7b]
+; IMM-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; IMM-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; IMM-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: addflag64ri8:
+; MEMONLY: # %bb.0: # %entry
+; MEMONLY-NEXT: addq $123, %rdi # encoding: [0x48,0x83,0xc7,0x7b]
+; MEMONLY-NEXT: movq $-1, %rax # encoding: [0x48,0xc7,0xc0,0xff,0xff,0xff,0xff]
+; MEMONLY-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
+; MEMONLY-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: addflag64ri8:
; NF: # %bb.0: # %entry
-; NF-NEXT: addq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xc7,0x7b]
-; NF-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
-; NF-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; NF-NEXT: addq $123, %rdi # encoding: [0x48,0x83,0xc7,0x7b]
+; NF-NEXT: movq $-1, %rax # encoding: [0x48,0xc7,0xc0,0xff,0xff,0xff,0xff]
+; NF-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 123)
@@ -724,20 +1024,36 @@ entry:
}
define i64 @addflag64ri(i64 noundef %a) {
-; CHECK-LABEL: addflag64ri:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
-; CHECK-NEXT: # imm = 0x1E240
-; CHECK-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
-; CHECK-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: addflag64ri:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: addq $123456, %rdi # encoding: [0x48,0x81,0xc7,0x40,0xe2,0x01,0x00]
+; NDD-NEXT: # imm = 0x1E240
+; NDD-NEXT: movq $-1, %rax # encoding: [0x48,0xc7,0xc0,0xff,0xff,0xff,0xff]
+; NDD-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMM-LABEL: addflag64ri:
+; IMM: # %bb.0: # %entry
+; IMM-NEXT: addq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
+; IMM-NEXT: # imm = 0x1E240
+; IMM-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
+; IMM-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; IMM-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: addflag64ri:
+; MEMONLY: # %bb.0: # %entry
+; MEMONLY-NEXT: addq $123456, %rdi # encoding: [0x48,0x81,0xc7,0x40,0xe2,0x01,0x00]
+; MEMONLY-NEXT: # imm = 0x1E240
+; MEMONLY-NEXT: movq $-1, %rax # encoding: [0x48,0xc7,0xc0,0xff,0xff,0xff,0xff]
+; MEMONLY-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
+; MEMONLY-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: addflag64ri:
; NF: # %bb.0: # %entry
-; NF-NEXT: addq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xc7,0x40,0xe2,0x01,0x00]
+; NF-NEXT: addq $123456, %rdi # encoding: [0x48,0x81,0xc7,0x40,0xe2,0x01,0x00]
; NF-NEXT: # imm = 0x1E240
-; NF-NEXT: movq $-1, %rcx # encoding: [0x48,0xc7,0xc1,0xff,0xff,0xff,0xff]
-; NF-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; NF-NEXT: movq $-1, %rax # encoding: [0x48,0xc7,0xc0,0xff,0xff,0xff,0xff]
+; NF-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%add = call i64 @llvm.uadd.sat.i64(i64 %a, i64 123456)
@@ -747,22 +1063,42 @@ entry:
@val = external dso_local global i16, align 4
define i1 @add64ri_reloc(i16 %k) {
-; CHECK-LABEL: add64ri_reloc:
-; CHECK: # %bb.0:
-; CHECK-NEXT: # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT: movswq %di, %rax # encoding: [0x48,0x0f,0xbf,0xc7]
-; CHECK-NEXT: addq %rax, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc0]
-; CHECK-NEXT: addq $val, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x05,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 2, value: val, kind: reloc_signed_4byte
-; CHECK-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: add64ri_reloc:
+; NDD: # %bb.0:
+; NDD-NEXT: # kill: def $edi killed $edi def $rdi
+; NDD-NEXT: movswq %di, %rax # encoding: [0x48,0x0f,0xbf,0xc7]
+; NDD-NEXT: addq %rax, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc0]
+; NDD-NEXT: addq $val, %rax # encoding: [0x48,0x05,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 2, value: val, kind: reloc_signed_4byte
+; NDD-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMM-LABEL: add64ri_reloc:
+; IMM: # %bb.0:
+; IMM-NEXT: # kill: def $edi killed $edi def $rdi
+; IMM-NEXT: movswq %di, %rax # encoding: [0x48,0x0f,0xbf,0xc7]
+; IMM-NEXT: addq %rax, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc0]
+; IMM-NEXT: addq $val, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x05,A,A,A,A]
+; IMM-NEXT: # fixup A - offset: 2, value: val, kind: reloc_signed_4byte
+; IMM-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; IMM-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: add64ri_reloc:
+; MEMONLY: # %bb.0:
+; MEMONLY-NEXT: # kill: def $edi killed $edi def $rdi
+; MEMONLY-NEXT: movswq %di, %rax # encoding: [0x48,0x0f,0xbf,0xc7]
+; MEMONLY-NEXT: addq %rax, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc0]
+; MEMONLY-NEXT: addq $val, %rax # encoding: [0x48,0x05,A,A,A,A]
+; MEMONLY-NEXT: # fixup A - offset: 2, value: val, kind: reloc_signed_4byte
+; MEMONLY-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
+; MEMONLY-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: add64ri_reloc:
; NF: # %bb.0:
; NF-NEXT: # kill: def $edi killed $edi def $rdi
; NF-NEXT: movswq %di, %rax # encoding: [0x48,0x0f,0xbf,0xc7]
; NF-NEXT: addq %rax, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc0]
-; NF-NEXT: addq $val, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x05,A,A,A,A]
+; NF-NEXT: addq $val, %rax # encoding: [0x48,0x05,A,A,A,A]
; NF-NEXT: # fixup A - offset: 2, value: val, kind: reloc_signed_4byte
; NF-NEXT: setne %al # encoding: [0x0f,0x95,0xc0]
; NF-NEXT: retq # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/apx/and.ll b/llvm/test/CodeGen/X86/apx/and.ll
index b442260f503b2..c4c9b632b8a80 100644
--- a/llvm/test/CodeGen/X86/apx/and.ll
+++ b/llvm/test/CodeGen/X86/apx/and.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,IMM,IMMONLY
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM,MEMONLY
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,IMM,MEM
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -x86-enable-apx-for-relocation=true -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
@@ -68,14 +71,27 @@ entry:
}
define i8 @and8rm(i8 noundef %a, ptr %b) {
-; CHECK-LABEL: and8rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x22,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: and8rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; NDD-NEXT: andb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x20,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: and8rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; IMMONLY-NEXT: andb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x20,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: and8rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: andb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x22,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: and8rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} andb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x22,0x3e]
+; NF-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; NF-NEXT: andb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x20,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t = load i8, ptr %b
@@ -84,14 +100,27 @@ entry:
}
define i16 @and16rm(i16 noundef %a, ptr %b) {
-; CHECK-LABEL: and16rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x23,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: and16rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; NDD-NEXT: andw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x21,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: and16rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; IMMONLY-NEXT: andw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x21,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: and16rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: andw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x23,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: and16rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} andw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x23,0x3e]
+; NF-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; NF-NEXT: andw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x21,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t = load i16, ptr %b
@@ -100,14 +129,27 @@ entry:
}
define i32 @and32rm(i32 noundef %a, ptr %b) {
-; CHECK-LABEL: and32rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x23,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: and32rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NDD-NEXT: andl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x21,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: and32rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; IMMONLY-NEXT: andl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x21,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: and32rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: andl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x23,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: and32rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} andl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x23,0x3e]
+; NF-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NF-NEXT: andl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x21,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t = load i32, ptr %b
@@ -116,14 +158,27 @@ entry:
}
define i64 @and64rm(i64 noundef %a, ptr %b) {
-; CHECK-LABEL: and64rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x23,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: and64rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; NDD-NEXT: andq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x21,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: and64rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; IMMONLY-NEXT: andq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x21,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: and64rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: andq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x23,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: and64rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} andq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x23,0x3e]
+; NF-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; NF-NEXT: andq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x21,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t = load i64, ptr %b
@@ -243,14 +298,27 @@ entry:
}
define i8 @and8mr(ptr %a, i8 noundef %b) {
-; CHECK-LABEL: and8mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andb (%rdi), %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x22,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: and8mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: andb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x20,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: and8mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; IMMONLY-NEXT: andb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x20,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: and8mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: andb (%rdi), %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x22,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: and8mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} andb (%rdi), %sil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x22,0x37]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT: andb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x20,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i8, ptr %a
@@ -259,14 +327,27 @@ entry:
}
define i16 @and16mr(ptr %a, i16 noundef %b) {
-; CHECK-LABEL: and16mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andw (%rdi), %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x23,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: and16mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: andw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x21,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: and16mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: andw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x21,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: and16mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: andw (%rdi), %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x23,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: and16mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} andw (%rdi), %si, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x23,0x37]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: andw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x21,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i16, ptr %a
@@ -275,14 +356,27 @@ entry:
}
define i32 @and32mr(ptr %a, i32 noundef %b) {
-; CHECK-LABEL: and32mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andl (%rdi), %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x23,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: and32mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: andl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x21,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: and32mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: andl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x21,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: and32mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: andl (%rdi), %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x23,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: and32mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} andl (%rdi), %esi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x23,0x37]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: andl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x21,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i32, ptr %a
@@ -291,14 +385,27 @@ entry:
}
define i64 @and64mr(ptr %a, i64 noundef %b) {
-; CHECK-LABEL: and64mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andq (%rdi), %rsi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x23,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: and64mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: andq %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x21,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: and64mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: andq %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x21,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: and64mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: andq (%rdi), %rsi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x23,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: and64mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} andq (%rdi), %rsi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x23,0x37]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: andq %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x21,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i64, ptr %a
@@ -307,14 +414,27 @@ entry:
}
define i16 @and16mi8(ptr %a) {
-; CHECK-LABEL: and16mi8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andw $123, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0x27,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: and16mi8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: andw $123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xe0,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: and16mi8:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: andw $123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xe0,0x7b]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: and16mi8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: andw $123, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0x27,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: and16mi8:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} andw $123, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x83,0x27,0x7b]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: andw $123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xe0,0x7b]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i16, ptr %a
@@ -323,14 +443,27 @@ entry:
}
define i32 @and32mi8(ptr %a) {
-; CHECK-LABEL: and32mi8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x27,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: and32mi8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: andl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe0,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: and32mi8:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: andl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe0,0x7b]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: and32mi8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: andl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x27,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: and32mi8:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} andl $123, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x27,0x7b]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: andl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe0,0x7b]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i32, ptr %a
@@ -357,14 +490,27 @@ entry:
}
define i8 @and8mi(ptr %a) {
-; CHECK-LABEL: and8mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x27,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: and8mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: andb $123, %al # EVEX TO LEGACY Compression encoding: [0x24,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: and8mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; IMMONLY-NEXT: andb $123, %al # EVEX TO LEGACY Compression encoding: [0x24,0x7b]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: and8mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: andb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x27,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: and8mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} andb $123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x27,0x7b]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT: andb $123, %al # EVEX TO LEGACY Compression encoding: [0x24,0x7b]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i8, ptr %a
@@ -373,15 +519,30 @@ entry:
}
define i16 @and16mi(ptr %a) {
-; CHECK-LABEL: and16mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andw $1234, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x27,0xd2,0x04]
-; CHECK-NEXT: # imm = 0x4D2
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: and16mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: andw $1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x25,0xd2,0x04]
+; NDD-NEXT: # imm = 0x4D2
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: and16mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: andw $1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x25,0xd2,0x04]
+; IMMONLY-NEXT: # imm = 0x4D2
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: and16mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: andw $1234, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x27,0xd2,0x04]
+; MEM-NEXT: # imm = 0x4D2
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: and16mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} andw $1234, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x81,0x27,0xd2,0x04]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: andw $1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x25,0xd2,0x04]
; NF-NEXT: # imm = 0x4D2
; NF-NEXT: retq # encoding: [0xc3]
entry:
@@ -391,15 +552,30 @@ entry:
}
define i32 @and32mi(ptr %a) {
-; CHECK-LABEL: and32mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x27,0x40,0xe2,0x01,0x00]
-; CHECK-NEXT: # imm = 0x1E240
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: and32mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: andl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x25,0x40,0xe2,0x01,0x00]
+; NDD-NEXT: # imm = 0x1E240
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: and32mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: andl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x25,0x40,0xe2,0x01,0x00]
+; IMMONLY-NEXT: # imm = 0x1E240
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: and32mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: andl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x27,0x40,0xe2,0x01,0x00]
+; MEM-NEXT: # imm = 0x1E240
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: and32mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} andl $123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x27,0x40,0xe2,0x01,0x00]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: andl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x25,0x40,0xe2,0x01,0x00]
; NF-NEXT: # imm = 0x1E240
; NF-NEXT: retq # encoding: [0xc3]
entry:
@@ -525,19 +701,40 @@ define i1 @andflag64rr(i64 %a, i64 %b) {
}
define i1 @andflag8rm(ptr %ptr, i8 %b) {
-; CHECK-LABEL: andflag8rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6]
-; CHECK-NEXT: andb (%rdi), %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x22,0x07]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: andflag8rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: notb %sil, %cl # encoding: [0x62,0xf4,0x74,0x18,0xf6,0xd6]
+; NDD-NEXT: andb %al, %cl # EVEX TO LEGACY Compression encoding: [0x20,0xc1]
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: andflag8rm:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; IMMONLY-NEXT: notb %sil, %cl # encoding: [0x62,0xf4,0x74,0x18,0xf6,0xd6]
+; IMMONLY-NEXT: andb %al, %cl # EVEX TO LEGACY Compression encoding: [0x20,0xc1]
+; IMMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMMONLY-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; IMMONLY-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: andflag8rm:
+; MEM: # %bb.0:
+; MEM-NEXT: notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6]
+; MEM-NEXT: andb (%rdi), %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x22,0x07]
+; MEM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEM-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; MEM-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: andflag8rm:
; NF: # %bb.0:
-; NF-NEXT: notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6]
-; NF-NEXT: andb (%rdi), %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x22,0x07]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT: notb %sil, %cl # encoding: [0x62,0xf4,0x74,0x18,0xf6,0xd6]
+; NF-NEXT: andb %al, %cl # EVEX TO LEGACY Compression encoding: [0x20,0xc1]
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
; NF-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
@@ -551,19 +748,40 @@ define i1 @andflag8rm(ptr %ptr, i8 %b) {
}
define i1 @andflag16rm(ptr %ptr, i16 %b) {
-; CHECK-LABEL: andflag16rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: notw %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xf7,0xd6]
-; CHECK-NEXT: andw (%rdi), %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x23,0x07]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: andflag16rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: notw %si, %cx # encoding: [0x62,0xf4,0x75,0x18,0xf7,0xd6]
+; NDD-NEXT: andw %ax, %cx # EVEX TO LEGACY Compression encoding: [0x66,0x21,0xc1]
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: andflag16rm:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: notw %si, %cx # encoding: [0x62,0xf4,0x75,0x18,0xf7,0xd6]
+; IMMONLY-NEXT: andw %ax, %cx # EVEX TO LEGACY Compression encoding: [0x66,0x21,0xc1]
+; IMMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMMONLY-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; IMMONLY-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: andflag16rm:
+; MEM: # %bb.0:
+; MEM-NEXT: notw %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xf7,0xd6]
+; MEM-NEXT: andw (%rdi), %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x23,0x07]
+; MEM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEM-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; MEM-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: andflag16rm:
; NF: # %bb.0:
-; NF-NEXT: notw %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xf7,0xd6]
-; NF-NEXT: andw (%rdi), %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x23,0x07]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: notw %si, %cx # encoding: [0x62,0xf4,0x75,0x18,0xf7,0xd6]
+; NF-NEXT: andw %ax, %cx # EVEX TO LEGACY Compression encoding: [0x66,0x21,0xc1]
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
; NF-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
@@ -577,17 +795,36 @@ define i1 @andflag16rm(ptr %ptr, i16 %b) {
}
define i1 @andflag32rm(ptr %ptr, i32 %b) {
-; CHECK-LABEL: andflag32rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andl (%rdi), %esi # EVEX TO LEGACY Compression encoding: [0x23,0x37]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: andflag32rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: andl %eax, %esi # EVEX TO LEGACY Compression encoding: [0x21,0xc6]
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: andflag32rm:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: andl %eax, %esi # EVEX TO LEGACY Compression encoding: [0x21,0xc6]
+; IMMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMMONLY-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A]
+; IMMONLY-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: andflag32rm:
+; MEM: # %bb.0:
+; MEM-NEXT: andl (%rdi), %esi # EVEX TO LEGACY Compression encoding: [0x23,0x37]
+; MEM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEM-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A]
+; MEM-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: andflag32rm:
; NF: # %bb.0:
-; NF-NEXT: andl (%rdi), %esi # EVEX TO LEGACY Compression encoding: [0x23,0x37]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: andl %eax, %esi # EVEX TO LEGACY Compression encoding: [0x21,0xc6]
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A]
; NF-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
@@ -600,17 +837,36 @@ define i1 @andflag32rm(ptr %ptr, i32 %b) {
}
define i1 @andflag64rm(ptr %ptr, i64 %b) {
-; CHECK-LABEL: andflag64rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andq (%rdi), %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x23,0x37]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: andflag64rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: andq %rax, %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x21,0xc6]
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: andflag64rm:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: andq %rax, %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x21,0xc6]
+; IMMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMMONLY-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A]
+; IMMONLY-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: andflag64rm:
+; MEM: # %bb.0:
+; MEM-NEXT: andq (%rdi), %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x23,0x37]
+; MEM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEM-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A]
+; MEM-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: andflag64rm:
; NF: # %bb.0:
-; NF-NEXT: andq (%rdi), %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x23,0x37]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: andq %rax, %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x21,0xc6]
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A]
; NF-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
@@ -695,18 +951,36 @@ define i1 @andflag32ri(i32 %a) {
}
define i1 @andflag64ri(i64 %a) {
-; CHECK-LABEL: andflag64ri:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xe7,0x40,0xe2,0x01,0x00]
-; CHECK-NEXT: # imm = 0x1E240
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: andflag64ri:
+; NDD: # %bb.0:
+; NDD-NEXT: andq $123456, %rdi # encoding: [0x48,0x81,0xe7,0x40,0xe2,0x01,0x00]
+; NDD-NEXT: # imm = 0x1E240
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMM-LABEL: andflag64ri:
+; IMM: # %bb.0:
+; IMM-NEXT: andq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xe7,0x40,0xe2,0x01,0x00]
+; IMM-NEXT: # imm = 0x1E240
+; IMM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMM-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; IMM-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; IMM-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: andflag64ri:
+; MEMONLY: # %bb.0:
+; MEMONLY-NEXT: andq $123456, %rdi # encoding: [0x48,0x81,0xe7,0x40,0xe2,0x01,0x00]
+; MEMONLY-NEXT: # imm = 0x1E240
+; MEMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEMONLY-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; MEMONLY-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; MEMONLY-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: andflag64ri:
; NF: # %bb.0:
-; NF-NEXT: andq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xe7,0x40,0xe2,0x01,0x00]
+; NF-NEXT: andq $123456, %rdi # encoding: [0x48,0x81,0xe7,0x40,0xe2,0x01,0x00]
; NF-NEXT: # imm = 0x1E240
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
@@ -764,17 +1038,33 @@ define i1 @andflag32ri8(i32 %a) {
}
define i1 @andflag64ri8(i64 %a) {
-; CHECK-LABEL: andflag64ri8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: andq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xe7,0x7b]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: andflag64ri8:
+; NDD: # %bb.0:
+; NDD-NEXT: andq $123, %rdi # encoding: [0x48,0x83,0xe7,0x7b]
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMM-LABEL: andflag64ri8:
+; IMM: # %bb.0:
+; IMM-NEXT: andq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xe7,0x7b]
+; IMM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMM-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; IMM-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; IMM-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: andflag64ri8:
+; MEMONLY: # %bb.0:
+; MEMONLY-NEXT: andq $123, %rdi # encoding: [0x48,0x83,0xe7,0x7b]
+; MEMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEMONLY-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; MEMONLY-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; MEMONLY-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: andflag64ri8:
; NF: # %bb.0:
-; NF-NEXT: andq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xe7,0x7b]
+; NF-NEXT: andq $123, %rdi # encoding: [0x48,0x83,0xe7,0x7b]
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
; NF-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
diff --git a/llvm/test/CodeGen/X86/apx/cmov.ll b/llvm/test/CodeGen/X86/apx/cmov.ll
index 7b846120d3f72..cbac65c275541 100644
--- a/llvm/test/CodeGen/X86/apx/cmov.ll
+++ b/llvm/test/CodeGen/X86/apx/cmov.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -x86-cmov-converter=false -show-mc-encoding -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM
define i8 @cmov8(i8 %a, i8 %b, i8 %x, ptr %y.ptr) {
; CHECK-LABEL: cmov8:
@@ -20,13 +23,22 @@ entry:
}
define i16 @cmov16(i16 %a, i16 %b, i16 %x, ptr %y.ptr) {
-; CHECK-LABEL: cmov16:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: cmpw %si, %di # encoding: [0x66,0x39,0xf7]
-; CHECK-NEXT: cmovbel %edx, %edi # EVEX TO LEGACY Compression encoding: [0x0f,0x46,0xfa]
-; CHECK-NEXT: cmovaw (%rcx), %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x47,0x11]
-; CHECK-NEXT: addw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xf8]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: cmov16:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: cmpw %si, %di # encoding: [0x66,0x39,0xf7]
+; NDD-NEXT: movzwl (%rcx), %eax # encoding: [0x0f,0xb7,0x01]
+; NDD-NEXT: cmovbel %edx, %edi # EVEX TO LEGACY Compression encoding: [0x0f,0x46,0xfa]
+; NDD-NEXT: cmovbew %dx, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x0f,0x46,0xc2]
+; NDD-NEXT: addw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: cmov16:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: cmpw %si, %di # encoding: [0x66,0x39,0xf7]
+; MEM-NEXT: cmovbel %edx, %edi # EVEX TO LEGACY Compression encoding: [0x0f,0x46,0xfa]
+; MEM-NEXT: cmovaw (%rcx), %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x47,0x11]
+; MEM-NEXT: addw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xf8]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%cond = icmp ugt i16 %a, %b
%y = load i16, ptr %y.ptr
@@ -37,13 +49,27 @@ entry:
}
define i32 @cmov32(i32 %a, i32 %b, i32 %x, ptr %y.ptr) {
-; CHECK-LABEL: cmov32:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7]
-; CHECK-NEXT: cmoval %edi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x47,0xd7]
-; CHECK-NEXT: cmoval (%rcx), %edx # EVEX TO LEGACY Compression encoding: [0x0f,0x47,0x11]
-; CHECK-NEXT: addl %edx, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xd0]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: cmov32:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7]
+; NDD-NEXT: movl (%rcx), %eax # encoding: [0x8b,0x01]
+; NDD-NEXT: cmovbel %edx, %edi # EVEX TO LEGACY Compression encoding: [0x0f,0x46,0xfa]
+; NDD-NEXT: cmovbel %edx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x46,0xc2]
+; NDD-NEXT: addl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: cmov32:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: cmpl %esi, %edi # encoding: [0x39,0xf7]
+; MEM-NEXT: movl %edx, %eax # encoding: [0x89,0xd0]
+; MEM-NEXT: jbe .LBB2_2 # encoding: [0x76,A]
+; MEM-NEXT: # fixup A - offset: 1, value: .LBB2_2, kind: FK_PCRel_1
+; MEM-NEXT: # %bb.1: # %entry
+; MEM-NEXT: movl (%rcx), %eax # encoding: [0x8b,0x01]
+; MEM-NEXT: movl %edi, %edx # encoding: [0x89,0xfa]
+; MEM-NEXT: .LBB2_2: # %entry
+; MEM-NEXT: addl %edx, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xd0]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%cond = icmp ugt i32 %a, %b
%y = load i32, ptr %y.ptr
@@ -54,13 +80,27 @@ entry:
}
define i64 @cmov64(i64 %a, i64 %b, i64 %x, ptr %y.ptr) {
-; CHECK-LABEL: cmov64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: cmpq %rsi, %rdi # encoding: [0x48,0x39,0xf7]
-; CHECK-NEXT: cmovaq %rdi, %rdx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x47,0xd7]
-; CHECK-NEXT: cmovaq (%rcx), %rdx # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x47,0x11]
-; CHECK-NEXT: addq %rdx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xd0]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: cmov64:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: cmpq %rsi, %rdi # encoding: [0x48,0x39,0xf7]
+; NDD-NEXT: movq (%rcx), %rax # encoding: [0x48,0x8b,0x01]
+; NDD-NEXT: cmovbeq %rdx, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x46,0xfa]
+; NDD-NEXT: cmovbeq %rdx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x46,0xc2]
+; NDD-NEXT: addq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: cmov64:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: cmpq %rsi, %rdi # encoding: [0x48,0x39,0xf7]
+; MEM-NEXT: movq %rdx, %rax # encoding: [0x48,0x89,0xd0]
+; MEM-NEXT: jbe .LBB3_2 # encoding: [0x76,A]
+; MEM-NEXT: # fixup A - offset: 1, value: .LBB3_2, kind: FK_PCRel_1
+; MEM-NEXT: # %bb.1: # %entry
+; MEM-NEXT: movq (%rcx), %rax # encoding: [0x48,0x8b,0x01]
+; MEM-NEXT: movq %rdi, %rdx # encoding: [0x48,0x89,0xfa]
+; MEM-NEXT: .LBB3_2: # %entry
+; MEM-NEXT: addq %rdx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xd0]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%cond = icmp ugt i64 %a, %b
%y = load i64, ptr %y.ptr
@@ -86,11 +126,26 @@ entry:
}
define i16 @cmov16rm_inv(i16 %a, i16 %x, ptr %y.ptr) {
-; CHECK-LABEL: cmov16rm_inv:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: testw %di, %di # encoding: [0x66,0x85,0xff]
-; CHECK-NEXT: cmovnsw (%rdx), %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x49,0x32]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: cmov16rm_inv:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdx), %eax # encoding: [0x0f,0xb7,0x02]
+; NDD-NEXT: testw %di, %di # encoding: [0x66,0x85,0xff]
+; NDD-NEXT: cmovsw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x0f,0x48,0xc6]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: cmov16rm_inv:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: testw %di, %di # encoding: [0x66,0x85,0xff]
+; MEM-NEXT: js .LBB5_1 # encoding: [0x78,A]
+; MEM-NEXT: # fixup A - offset: 1, value: .LBB5_1, kind: FK_PCRel_1
+; MEM-NEXT: # %bb.2: # %entry
+; MEM-NEXT: movzwl (%rdx), %eax # encoding: [0x0f,0xb7,0x02]
+; MEM-NEXT: # kill: def $ax killed $ax killed $eax
+; MEM-NEXT: retq # encoding: [0xc3]
+; MEM-NEXT: .LBB5_1:
+; MEM-NEXT: movl %esi, %eax # encoding: [0x89,0xf0]
+; MEM-NEXT: # kill: def $ax killed $ax killed $eax
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%y = load i16, ptr %y.ptr
%cmp = icmp slt i16 %a, 0
@@ -99,11 +154,23 @@ entry:
}
define i32 @cmov32rm_inv(i32 %a, i32 %x, ptr %y.ptr) {
-; CHECK-LABEL: cmov32rm_inv:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: testl %edi, %edi # encoding: [0x85,0xff]
-; CHECK-NEXT: cmovnsl (%rdx), %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x49,0x32]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: cmov32rm_inv:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdx), %eax # encoding: [0x8b,0x02]
+; NDD-NEXT: testl %edi, %edi # encoding: [0x85,0xff]
+; NDD-NEXT: cmovsl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x48,0xc6]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: cmov32rm_inv:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %eax # encoding: [0x89,0xf0]
+; MEM-NEXT: testl %edi, %edi # encoding: [0x85,0xff]
+; MEM-NEXT: js .LBB6_2 # encoding: [0x78,A]
+; MEM-NEXT: # fixup A - offset: 1, value: .LBB6_2, kind: FK_PCRel_1
+; MEM-NEXT: # %bb.1: # %entry
+; MEM-NEXT: movl (%rdx), %eax # encoding: [0x8b,0x02]
+; MEM-NEXT: .LBB6_2: # %entry
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%y = load i32, ptr %y.ptr
%cmp = icmp slt i32 %a, 0
@@ -112,11 +179,23 @@ entry:
}
define i64 @cmov64rm_inv(i64 %a, i64 %x, ptr %y.ptr) {
-; CHECK-LABEL: cmov64rm_inv:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
-; CHECK-NEXT: cmovnsq (%rdx), %rsi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x49,0x32]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: cmov64rm_inv:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdx), %rax # encoding: [0x48,0x8b,0x02]
+; NDD-NEXT: testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
+; NDD-NEXT: cmovsq %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x48,0xc6]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: cmov64rm_inv:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0]
+; MEM-NEXT: testq %rdi, %rdi # encoding: [0x48,0x85,0xff]
+; MEM-NEXT: js .LBB7_2 # encoding: [0x78,A]
+; MEM-NEXT: # fixup A - offset: 1, value: .LBB7_2, kind: FK_PCRel_1
+; MEM-NEXT: # %bb.1: # %entry
+; MEM-NEXT: movq (%rdx), %rax # encoding: [0x48,0x8b,0x02]
+; MEM-NEXT: .LBB7_2: # %entry
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%y = load i64, ptr %y.ptr
%cmp = icmp slt i64 %a, 0
diff --git a/llvm/test/CodeGen/X86/apx/dec.ll b/llvm/test/CodeGen/X86/apx/dec.ll
index a4b54d59b309d..09eccb13314bb 100644
--- a/llvm/test/CodeGen/X86/apx/dec.ll
+++ b/llvm/test/CodeGen/X86/apx/dec.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,IMM,IMMONLY
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,MEM,MEMONLY
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,IMM,MEM,BOTH
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs | FileCheck --check-prefix=NF %s
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -x86-enable-apx-for-relocation=true -verify-machineinstrs | FileCheck --check-prefix=NF %s
@@ -49,14 +52,24 @@ entry:
}
define i64 @dec64r(i64 noundef %a) {
-; CHECK-LABEL: dec64r:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: decq %rdi, %rax
-; CHECK-NEXT: retq
+; NDD-LABEL: dec64r:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: leaq -1(%rdi), %rax
+; NDD-NEXT: retq
+;
+; IMM-LABEL: dec64r:
+; IMM: # %bb.0: # %entry
+; IMM-NEXT: decq %rdi, %rax
+; IMM-NEXT: retq
+;
+; MEMONLY-LABEL: dec64r:
+; MEMONLY: # %bb.0: # %entry
+; MEMONLY-NEXT: leaq -1(%rdi), %rax
+; MEMONLY-NEXT: retq
;
; NF-LABEL: dec64r:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} decq %rdi, %rax
+; NF-NEXT: leaq -1(%rdi), %rax
; NF-NEXT: retq
entry:
%dec = sub i64 %a, 1
@@ -64,14 +77,27 @@ entry:
}
define i8 @dec8m(ptr %ptr) {
-; CHECK-LABEL: dec8m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: decb (%rdi), %al
-; CHECK-NEXT: retq
+; NDD-LABEL: dec8m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax
+; NDD-NEXT: decb %al
+; NDD-NEXT: retq
+;
+; IMMONLY-LABEL: dec8m:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rdi), %eax
+; IMMONLY-NEXT: decb %al
+; IMMONLY-NEXT: retq
+;
+; MEM-LABEL: dec8m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: decb (%rdi), %al
+; MEM-NEXT: retq
;
; NF-LABEL: dec8m:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} decb (%rdi), %al
+; NF-NEXT: movzbl (%rdi), %eax
+; NF-NEXT: decb %al
; NF-NEXT: retq
entry:
%a = load i8, ptr %ptr
@@ -80,14 +106,27 @@ entry:
}
define i16 @dec16m(ptr %ptr) {
-; CHECK-LABEL: dec16m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: decw (%rdi), %ax
-; CHECK-NEXT: retq
+; NDD-LABEL: dec16m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax
+; NDD-NEXT: decw %ax
+; NDD-NEXT: retq
+;
+; IMMONLY-LABEL: dec16m:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax
+; IMMONLY-NEXT: decw %ax
+; IMMONLY-NEXT: retq
+;
+; MEM-LABEL: dec16m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: decw (%rdi), %ax
+; MEM-NEXT: retq
;
; NF-LABEL: dec16m:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} decw (%rdi), %ax
+; NF-NEXT: movzwl (%rdi), %eax
+; NF-NEXT: decw %ax
; NF-NEXT: retq
entry:
%a = load i16, ptr %ptr
@@ -96,14 +135,27 @@ entry:
}
define i32 @dec32m(ptr %ptr) {
-; CHECK-LABEL: dec32m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: decl (%rdi), %eax
-; CHECK-NEXT: retq
+; NDD-LABEL: dec32m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax
+; NDD-NEXT: decl %eax
+; NDD-NEXT: retq
+;
+; IMMONLY-LABEL: dec32m:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax
+; IMMONLY-NEXT: decl %eax
+; IMMONLY-NEXT: retq
+;
+; MEM-LABEL: dec32m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: decl (%rdi), %eax
+; MEM-NEXT: retq
;
; NF-LABEL: dec32m:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} decl (%rdi), %eax
+; NF-NEXT: movl (%rdi), %eax
+; NF-NEXT: decl %eax
; NF-NEXT: retq
entry:
%a = load i32, ptr %ptr
@@ -112,14 +164,33 @@ entry:
}
define i64 @dec64m(ptr %ptr) {
-; CHECK-LABEL: dec64m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: decq (%rdi), %rax
-; CHECK-NEXT: retq
+; NDD-LABEL: dec64m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax
+; NDD-NEXT: decq %rax
+; NDD-NEXT: retq
+;
+; IMMONLY-LABEL: dec64m:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rdi), %rax
+; IMMONLY-NEXT: decq %rax
+; IMMONLY-NEXT: retq
+;
+; MEMONLY-LABEL: dec64m:
+; MEMONLY: # %bb.0: # %entry
+; MEMONLY-NEXT: movq (%rdi), %rax
+; MEMONLY-NEXT: decq %rax
+; MEMONLY-NEXT: retq
+;
+; BOTH-LABEL: dec64m:
+; BOTH: # %bb.0: # %entry
+; BOTH-NEXT: decq (%rdi), %rax
+; BOTH-NEXT: retq
;
; NF-LABEL: dec64m:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} decq (%rdi), %rax
+; NF-NEXT: movq (%rdi), %rax
+; NF-NEXT: decq %rax
; NF-NEXT: retq
entry:
%a = load i64, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/apx/imul.ll b/llvm/test/CodeGen/X86/apx/imul.ll
index 34cc9a90708bd..198f77408311e 100644
--- a/llvm/test/CodeGen/X86/apx/imul.ll
+++ b/llvm/test/CodeGen/X86/apx/imul.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,IMM
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,MEM
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,MEM
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs | FileCheck --check-prefix=NF %s
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -x86-enable-apx-for-relocation=true -verify-machineinstrs | FileCheck --check-prefix=NF %s
@@ -99,14 +102,27 @@ entry:
}
define i16 @mul16rm(i16 noundef %a, ptr %ptr) {
-; CHECK-LABEL: mul16rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: imulw (%rsi), %di, %ax
-; CHECK-NEXT: retq
+; NDD-LABEL: mul16rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rsi), %eax
+; NDD-NEXT: imulw %di, %ax
+; NDD-NEXT: retq
+;
+; IMM-LABEL: mul16rm:
+; IMM: # %bb.0: # %entry
+; IMM-NEXT: movzwl (%rsi), %eax
+; IMM-NEXT: imulw %di, %ax
+; IMM-NEXT: retq
+;
+; MEM-LABEL: mul16rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: imulw (%rsi), %di, %ax
+; MEM-NEXT: retq
;
; NF-LABEL: mul16rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} imulw (%rsi), %di, %ax
+; NF-NEXT: movzwl (%rsi), %eax
+; NF-NEXT: imulw %di, %ax
; NF-NEXT: retq
entry:
%b = load i16, ptr %ptr
@@ -115,14 +131,27 @@ entry:
}
define i32 @mul32rm(i32 noundef %a, ptr %ptr) {
-; CHECK-LABEL: mul32rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: imull (%rsi), %edi, %eax
-; CHECK-NEXT: retq
+; NDD-LABEL: mul32rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rsi), %eax
+; NDD-NEXT: imull %edi, %eax
+; NDD-NEXT: retq
+;
+; IMM-LABEL: mul32rm:
+; IMM: # %bb.0: # %entry
+; IMM-NEXT: movl (%rsi), %eax
+; IMM-NEXT: imull %edi, %eax
+; IMM-NEXT: retq
+;
+; MEM-LABEL: mul32rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: imull (%rsi), %edi, %eax
+; MEM-NEXT: retq
;
; NF-LABEL: mul32rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} imull (%rsi), %edi, %eax
+; NF-NEXT: movl (%rsi), %eax
+; NF-NEXT: imull %edi, %eax
; NF-NEXT: retq
entry:
%b = load i32, ptr %ptr
@@ -131,14 +160,27 @@ entry:
}
define i64 @mul64rm(i64 noundef %a, ptr %ptr) {
-; CHECK-LABEL: mul64rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: imulq (%rsi), %rdi, %rax
-; CHECK-NEXT: retq
+; NDD-LABEL: mul64rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rsi), %rax
+; NDD-NEXT: imulq %rdi, %rax
+; NDD-NEXT: retq
+;
+; IMM-LABEL: mul64rm:
+; IMM: # %bb.0: # %entry
+; IMM-NEXT: movq (%rsi), %rax
+; IMM-NEXT: imulq %rdi, %rax
+; IMM-NEXT: retq
+;
+; MEM-LABEL: mul64rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: imulq (%rsi), %rdi, %rax
+; MEM-NEXT: retq
;
; NF-LABEL: mul64rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} imulq (%rsi), %rdi, %rax
+; NF-NEXT: movq (%rsi), %rax
+; NF-NEXT: imulq %rdi, %rax
; NF-NEXT: retq
entry:
%b = load i64, ptr %ptr
@@ -147,14 +189,27 @@ entry:
}
define i16 @smul16rm(i16 noundef %a, ptr %ptr) {
-; CHECK-LABEL: smul16rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: imulw (%rsi), %di, %ax
-; CHECK-NEXT: retq
+; NDD-LABEL: smul16rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rsi), %eax
+; NDD-NEXT: imulw %di, %ax
+; NDD-NEXT: retq
+;
+; IMM-LABEL: smul16rm:
+; IMM: # %bb.0: # %entry
+; IMM-NEXT: movzwl (%rsi), %eax
+; IMM-NEXT: imulw %di, %ax
+; IMM-NEXT: retq
+;
+; MEM-LABEL: smul16rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: imulw (%rsi), %di, %ax
+; MEM-NEXT: retq
;
; NF-LABEL: smul16rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} imulw (%rsi), %di, %ax
+; NF-NEXT: movzwl (%rsi), %eax
+; NF-NEXT: imulw %di, %ax
; NF-NEXT: retq
entry:
%b = load i16, ptr %ptr
@@ -164,14 +219,27 @@ entry:
}
define i32 @smul32rm(i32 noundef %a, ptr %ptr) {
-; CHECK-LABEL: smul32rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: imull (%rsi), %edi, %eax
-; CHECK-NEXT: retq
+; NDD-LABEL: smul32rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rsi), %eax
+; NDD-NEXT: imull %edi, %eax
+; NDD-NEXT: retq
+;
+; IMM-LABEL: smul32rm:
+; IMM: # %bb.0: # %entry
+; IMM-NEXT: movl (%rsi), %eax
+; IMM-NEXT: imull %edi, %eax
+; IMM-NEXT: retq
+;
+; MEM-LABEL: smul32rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: imull (%rsi), %edi, %eax
+; MEM-NEXT: retq
;
; NF-LABEL: smul32rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} imull (%rsi), %edi, %eax
+; NF-NEXT: movl (%rsi), %eax
+; NF-NEXT: imull %edi, %eax
; NF-NEXT: retq
entry:
%b = load i32, ptr %ptr
@@ -181,14 +249,27 @@ entry:
}
define i64 @smul64rm(i64 noundef %a, ptr %ptr) {
-; CHECK-LABEL: smul64rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: imulq (%rsi), %rdi, %rax
-; CHECK-NEXT: retq
+; NDD-LABEL: smul64rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rsi), %rax
+; NDD-NEXT: imulq %rdi, %rax
+; NDD-NEXT: retq
+;
+; IMM-LABEL: smul64rm:
+; IMM: # %bb.0: # %entry
+; IMM-NEXT: movq (%rsi), %rax
+; IMM-NEXT: imulq %rdi, %rax
+; IMM-NEXT: retq
+;
+; MEM-LABEL: smul64rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: imulq (%rsi), %rdi, %rax
+; MEM-NEXT: retq
;
; NF-LABEL: smul64rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} imulq (%rsi), %rdi, %rax
+; NF-NEXT: movq (%rsi), %rax
+; NF-NEXT: imulq %rdi, %rax
; NF-NEXT: retq
entry:
%b = load i64, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/apx/inc.ll b/llvm/test/CodeGen/X86/apx/inc.ll
index 671edf83a2b96..aaff438e95ac2 100644
--- a/llvm/test/CodeGen/X86/apx/inc.ll
+++ b/llvm/test/CodeGen/X86/apx/inc.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,IMM,IMMONLY
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,MEM,MEMONLY
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,IMM,MEM,BOTH
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs | FileCheck --check-prefix=NF %s
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -x86-enable-apx-for-relocation=true -verify-machineinstrs | FileCheck --check-prefix=NF %s
@@ -49,14 +52,24 @@ entry:
}
define i64 @inc64r(i64 noundef %a) {
-; CHECK-LABEL: inc64r:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: incq %rdi, %rax
-; CHECK-NEXT: retq
+; NDD-LABEL: inc64r:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: leaq 1(%rdi), %rax
+; NDD-NEXT: retq
+;
+; IMM-LABEL: inc64r:
+; IMM: # %bb.0: # %entry
+; IMM-NEXT: incq %rdi, %rax
+; IMM-NEXT: retq
+;
+; MEMONLY-LABEL: inc64r:
+; MEMONLY: # %bb.0: # %entry
+; MEMONLY-NEXT: leaq 1(%rdi), %rax
+; MEMONLY-NEXT: retq
;
; NF-LABEL: inc64r:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} incq %rdi, %rax
+; NF-NEXT: leaq 1(%rdi), %rax
; NF-NEXT: retq
entry:
%inc = add i64 %a, 1
@@ -64,14 +77,27 @@ entry:
}
define i8 @inc8m(ptr %ptr) {
-; CHECK-LABEL: inc8m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: incb (%rdi), %al
-; CHECK-NEXT: retq
+; NDD-LABEL: inc8m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax
+; NDD-NEXT: incb %al
+; NDD-NEXT: retq
+;
+; IMMONLY-LABEL: inc8m:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rdi), %eax
+; IMMONLY-NEXT: incb %al
+; IMMONLY-NEXT: retq
+;
+; MEM-LABEL: inc8m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: incb (%rdi), %al
+; MEM-NEXT: retq
;
; NF-LABEL: inc8m:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} incb (%rdi), %al
+; NF-NEXT: movzbl (%rdi), %eax
+; NF-NEXT: incb %al
; NF-NEXT: retq
entry:
%a = load i8, ptr %ptr
@@ -80,14 +106,27 @@ entry:
}
define i16 @inc16m(ptr %ptr) {
-; CHECK-LABEL: inc16m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: incw (%rdi), %ax
-; CHECK-NEXT: retq
+; NDD-LABEL: inc16m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax
+; NDD-NEXT: incw %ax
+; NDD-NEXT: retq
+;
+; IMMONLY-LABEL: inc16m:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax
+; IMMONLY-NEXT: incw %ax
+; IMMONLY-NEXT: retq
+;
+; MEM-LABEL: inc16m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: incw (%rdi), %ax
+; MEM-NEXT: retq
;
; NF-LABEL: inc16m:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} incw (%rdi), %ax
+; NF-NEXT: movzwl (%rdi), %eax
+; NF-NEXT: incw %ax
; NF-NEXT: retq
entry:
%a = load i16, ptr %ptr
@@ -96,14 +135,27 @@ entry:
}
define i32 @inc32m(ptr %ptr) {
-; CHECK-LABEL: inc32m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: incl (%rdi), %eax
-; CHECK-NEXT: retq
+; NDD-LABEL: inc32m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax
+; NDD-NEXT: incl %eax
+; NDD-NEXT: retq
+;
+; IMMONLY-LABEL: inc32m:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax
+; IMMONLY-NEXT: incl %eax
+; IMMONLY-NEXT: retq
+;
+; MEM-LABEL: inc32m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: incl (%rdi), %eax
+; MEM-NEXT: retq
;
; NF-LABEL: inc32m:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} incl (%rdi), %eax
+; NF-NEXT: movl (%rdi), %eax
+; NF-NEXT: incl %eax
; NF-NEXT: retq
entry:
%a = load i32, ptr %ptr
@@ -112,14 +164,33 @@ entry:
}
define i64 @inc64m(ptr %ptr) {
-; CHECK-LABEL: inc64m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: incq (%rdi), %rax
-; CHECK-NEXT: retq
+; NDD-LABEL: inc64m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax
+; NDD-NEXT: incq %rax
+; NDD-NEXT: retq
+;
+; IMMONLY-LABEL: inc64m:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rdi), %rax
+; IMMONLY-NEXT: incq %rax
+; IMMONLY-NEXT: retq
+;
+; MEMONLY-LABEL: inc64m:
+; MEMONLY: # %bb.0: # %entry
+; MEMONLY-NEXT: movq (%rdi), %rax
+; MEMONLY-NEXT: incq %rax
+; MEMONLY-NEXT: retq
+;
+; BOTH-LABEL: inc64m:
+; BOTH: # %bb.0: # %entry
+; BOTH-NEXT: incq (%rdi), %rax
+; BOTH-NEXT: retq
;
; NF-LABEL: inc64m:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} incq (%rdi), %rax
+; NF-NEXT: movq (%rdi), %rax
+; NF-NEXT: incq %rax
; NF-NEXT: retq
entry:
%a = load i64, ptr %ptr
@@ -191,18 +262,32 @@ entry:
}
define i64 @uinc64r(i64 noundef %a) {
-; CHECK-LABEL: uinc64r:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: incq %rdi, %rax
-; CHECK-NEXT: movq $-1, %rcx
-; CHECK-NEXT: cmoveq %rcx, %rax
-; CHECK-NEXT: retq
+; NDD-LABEL: uinc64r:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: incq %rdi
+; NDD-NEXT: movq $-1, %rax
+; NDD-NEXT: cmovneq %rdi, %rax
+; NDD-NEXT: retq
+;
+; IMM-LABEL: uinc64r:
+; IMM: # %bb.0: # %entry
+; IMM-NEXT: incq %rdi, %rax
+; IMM-NEXT: movq $-1, %rcx
+; IMM-NEXT: cmoveq %rcx, %rax
+; IMM-NEXT: retq
+;
+; MEMONLY-LABEL: uinc64r:
+; MEMONLY: # %bb.0: # %entry
+; MEMONLY-NEXT: incq %rdi
+; MEMONLY-NEXT: movq $-1, %rax
+; MEMONLY-NEXT: cmovneq %rdi, %rax
+; MEMONLY-NEXT: retq
;
; NF-LABEL: uinc64r:
; NF: # %bb.0: # %entry
-; NF-NEXT: incq %rdi, %rax
-; NF-NEXT: movq $-1, %rcx
-; NF-NEXT: cmoveq %rcx, %rax
+; NF-NEXT: incq %rdi
+; NF-NEXT: movq $-1, %rax
+; NF-NEXT: cmovneq %rdi, %rax
; NF-NEXT: retq
entry:
%inc = call i64 @llvm.uadd.sat.i64(i64 %a, i64 1)
diff --git a/llvm/test/CodeGen/X86/apx/long-instruction-fixup-x32.ll b/llvm/test/CodeGen/X86/apx/long-instruction-fixup-x32.ll
index fb24704fe0491..b1596f95bc542 100644
--- a/llvm/test/CodeGen/X86/apx/long-instruction-fixup-x32.ll
+++ b/llvm/test/CodeGen/X86/apx/long-instruction-fixup-x32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 -mattr=+ndd -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux-gnux32 -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs | FileCheck %s
define i32 @add32mi_SIB_ADSIZE(ptr nocapture noundef readonly %a, i32 noundef %b) {
diff --git a/llvm/test/CodeGen/X86/apx/long-instruction-fixup.ll b/llvm/test/CodeGen/X86/apx/long-instruction-fixup.ll
index 30c485836797f..8d8b306f6893d 100644
--- a/llvm/test/CodeGen/X86/apx/long-instruction-fixup.ll
+++ b/llvm/test/CodeGen/X86/apx/long-instruction-fixup.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs | FileCheck %s
define i32 @add32mi_GS() {
; CHECK-LABEL: add32mi_GS:
diff --git a/llvm/test/CodeGen/X86/apx/memfold-nd2rmw.mir b/llvm/test/CodeGen/X86/apx/memfold-nd2rmw.mir
index 7b4945b3a243f..b35e205a59a73 100644
--- a/llvm/test/CodeGen/X86/apx/memfold-nd2rmw.mir
+++ b/llvm/test/CodeGen/X86/apx/memfold-nd2rmw.mir
@@ -1,4 +1,4 @@
-# RUN: llc %s -o - -start-before=greedy -stop-after=virtregrewriter -mtriple x86_64 -mattr=+ndd | FileCheck %s
+# RUN: llc %s -o - -start-before=greedy -stop-after=virtregrewriter -mtriple x86_64 -mattr=+ndd,+prefer-ndd-mem | FileCheck %s
#
# This test is for stack spill folding -- the ADD32ri_ND near the end of the MIR
# below should be morphed into an ADD32mi by the register allocator, making it
@@ -52,7 +52,7 @@
br i1 %arg2, label %bb8, label %bb7
}
- attributes #0 = { "target-features"="+ndd" }
+ attributes #0 = { "target-features"="+ndd,+prefer-ndd-mem" }
...
---
diff --git a/llvm/test/CodeGen/X86/apx/mul-i1024.ll b/llvm/test/CodeGen/X86/apx/mul-i1024.ll
index 0bb3b179cc305..0ca77c303de3d 100644
--- a/llvm/test/CodeGen/X86/apx/mul-i1024.ll
+++ b/llvm/test/CodeGen/X86/apx/mul-i1024.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+egpr | FileCheck %s --check-prefix=EGPR
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+egpr,+ndd | FileCheck %s --check-prefix=EGPR-NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+egpr,+ndd,+prefer-ndd-imm,+prefer-ndd-mem | FileCheck %s --check-prefix=EGPR-NDD
define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind {
; EGPR-LABEL: test_1024:
diff --git a/llvm/test/CodeGen/X86/apx/neg.ll b/llvm/test/CodeGen/X86/apx/neg.ll
index 8f8a391487b5a..aed8c73795ca5 100644
--- a/llvm/test/CodeGen/X86/apx/neg.ll
+++ b/llvm/test/CodeGen/X86/apx/neg.ll
@@ -1,7 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs | FileCheck --check-prefix=NF %s
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -x86-enable-apx-for-relocation=true -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,MEM
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,MEM
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+nf -verify-machineinstrs | FileCheck --check-prefix=NF %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+nf -x86-enable-apx-for-relocation=true -verify-machineinstrs | FileCheck --check-prefix=NF %s
define i8 @neg8r(i8 noundef %a) {
; CHECK-LABEL: neg8r:
@@ -64,14 +67,21 @@ entry:
}
define i8 @neg8m(ptr %ptr) {
-; CHECK-LABEL: neg8m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: negb (%rdi), %al
-; CHECK-NEXT: retq
+; NDD-LABEL: neg8m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax
+; NDD-NEXT: negb %al
+; NDD-NEXT: retq
+;
+; MEM-LABEL: neg8m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: negb (%rdi), %al
+; MEM-NEXT: retq
;
; NF-LABEL: neg8m:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} negb (%rdi), %al
+; NF-NEXT: movzbl (%rdi), %eax
+; NF-NEXT: negb %al
; NF-NEXT: retq
entry:
%a = load i8, ptr %ptr
@@ -80,14 +90,21 @@ entry:
}
define i16 @neg16m(ptr %ptr) {
-; CHECK-LABEL: neg16m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: negw (%rdi), %ax
-; CHECK-NEXT: retq
+; NDD-LABEL: neg16m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax
+; NDD-NEXT: negw %ax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: neg16m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: negw (%rdi), %ax
+; MEM-NEXT: retq
;
; NF-LABEL: neg16m:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} negw (%rdi), %ax
+; NF-NEXT: movzwl (%rdi), %eax
+; NF-NEXT: negw %ax
; NF-NEXT: retq
entry:
%a = load i16, ptr %ptr
@@ -96,14 +113,21 @@ entry:
}
define i32 @neg32m(ptr %ptr) {
-; CHECK-LABEL: neg32m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: negl (%rdi), %eax
-; CHECK-NEXT: retq
+; NDD-LABEL: neg32m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax
+; NDD-NEXT: negl %eax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: neg32m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: negl (%rdi), %eax
+; MEM-NEXT: retq
;
; NF-LABEL: neg32m:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} negl (%rdi), %eax
+; NF-NEXT: movl (%rdi), %eax
+; NF-NEXT: negl %eax
; NF-NEXT: retq
entry:
%a = load i32, ptr %ptr
@@ -112,14 +136,21 @@ entry:
}
define i64 @neg64m(ptr %ptr) {
-; CHECK-LABEL: neg64m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: negq (%rdi), %rax
-; CHECK-NEXT: retq
+; NDD-LABEL: neg64m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax
+; NDD-NEXT: negq %rax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: neg64m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: negq (%rdi), %rax
+; MEM-NEXT: retq
;
; NF-LABEL: neg64m:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} negq (%rdi), %rax
+; NF-NEXT: movq (%rdi), %rax
+; NF-NEXT: negq %rax
; NF-NEXT: retq
entry:
%a = load i64, ptr %ptr
@@ -192,14 +223,21 @@ entry:
}
define i8 @uneg8m(ptr %ptr) {
-; CHECK-LABEL: uneg8m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: negb (%rdi), %al
-; CHECK-NEXT: retq
+; NDD-LABEL: uneg8m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax
+; NDD-NEXT: negb %al
+; NDD-NEXT: retq
+;
+; MEM-LABEL: uneg8m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: negb (%rdi), %al
+; MEM-NEXT: retq
;
; NF-LABEL: uneg8m:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} negb (%rdi), %al
+; NF-NEXT: movzbl (%rdi), %eax
+; NF-NEXT: negb %al
; NF-NEXT: retq
entry:
%a = load i8, ptr %ptr
@@ -209,14 +247,21 @@ entry:
}
define i16 @uneg16m(ptr %ptr) {
-; CHECK-LABEL: uneg16m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: negw (%rdi), %ax
-; CHECK-NEXT: retq
+; NDD-LABEL: uneg16m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax
+; NDD-NEXT: negw %ax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: uneg16m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: negw (%rdi), %ax
+; MEM-NEXT: retq
;
; NF-LABEL: uneg16m:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} negw (%rdi), %ax
+; NF-NEXT: movzwl (%rdi), %eax
+; NF-NEXT: negw %ax
; NF-NEXT: retq
entry:
%a = load i16, ptr %ptr
@@ -226,14 +271,21 @@ entry:
}
define i32 @uneg32m(ptr %ptr) {
-; CHECK-LABEL: uneg32m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: negl (%rdi), %eax
-; CHECK-NEXT: retq
+; NDD-LABEL: uneg32m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax
+; NDD-NEXT: negl %eax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: uneg32m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: negl (%rdi), %eax
+; MEM-NEXT: retq
;
; NF-LABEL: uneg32m:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} negl (%rdi), %eax
+; NF-NEXT: movl (%rdi), %eax
+; NF-NEXT: negl %eax
; NF-NEXT: retq
entry:
%a = load i32, ptr %ptr
@@ -243,14 +295,21 @@ entry:
}
define i64 @uneg64m(ptr %ptr) {
-; CHECK-LABEL: uneg64m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: negq (%rdi), %rax
-; CHECK-NEXT: retq
+; NDD-LABEL: uneg64m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax
+; NDD-NEXT: negq %rax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: uneg64m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: negq (%rdi), %rax
+; MEM-NEXT: retq
;
; NF-LABEL: uneg64m:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} negq (%rdi), %rax
+; NF-NEXT: movq (%rdi), %rax
+; NF-NEXT: negq %rax
; NF-NEXT: retq
entry:
%a = load i64, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/apx/not.ll b/llvm/test/CodeGen/X86/apx/not.ll
index f13131ece5a56..fd5e3232b59c1 100644
--- a/llvm/test/CodeGen/X86/apx/not.ll
+++ b/llvm/test/CodeGen/X86/apx/not.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,MEM
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,MEM
define i8 @not8r(i8 noundef %a) {
; CHECK-LABEL: not8r:
@@ -42,10 +45,16 @@ entry:
}
define i8 @not8m(ptr %ptr) {
-; CHECK-LABEL: not8m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: notb (%rdi), %al
-; CHECK-NEXT: retq
+; NDD-LABEL: not8m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax
+; NDD-NEXT: notb %al
+; NDD-NEXT: retq
+;
+; MEM-LABEL: not8m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: notb (%rdi), %al
+; MEM-NEXT: retq
entry:
%a = load i8, ptr %ptr
%not = xor i8 %a, -1
@@ -53,10 +62,16 @@ entry:
}
define i16 @not16m(ptr %ptr) {
-; CHECK-LABEL: not16m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: notw (%rdi), %ax
-; CHECK-NEXT: retq
+; NDD-LABEL: not16m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax
+; NDD-NEXT: notw %ax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: not16m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: notw (%rdi), %ax
+; MEM-NEXT: retq
entry:
%a = load i16, ptr %ptr
%not = xor i16 %a, -1
@@ -64,10 +79,16 @@ entry:
}
define i32 @not32m(ptr %ptr) {
-; CHECK-LABEL: not32m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: notl (%rdi), %eax
-; CHECK-NEXT: retq
+; NDD-LABEL: not32m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax
+; NDD-NEXT: notl %eax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: not32m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: notl (%rdi), %eax
+; MEM-NEXT: retq
entry:
%a = load i32, ptr %ptr
%not = xor i32 %a, -1
@@ -75,10 +96,16 @@ entry:
}
define i64 @not64m(ptr %ptr) {
-; CHECK-LABEL: not64m:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: notq (%rdi), %rax
-; CHECK-NEXT: retq
+; NDD-LABEL: not64m:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax
+; NDD-NEXT: notq %rax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: not64m:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: notq (%rdi), %rax
+; MEM-NEXT: retq
entry:
%a = load i64, ptr %ptr
%not = xor i64 %a, -1
diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll
index 514a7d83b78b0..eae147d311f32 100644
--- a/llvm/test/CodeGen/X86/apx/or.ll
+++ b/llvm/test/CodeGen/X86/apx/or.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,IMM,IMMONLY
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM,MEMONLY
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,IMM,MEM
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -x86-enable-apx-for-relocation=true -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
@@ -68,14 +71,27 @@ entry:
}
define i8 @or8rm(i8 noundef %a, ptr %b) {
-; CHECK-LABEL: or8rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x0a,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: or8rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; NDD-NEXT: orb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x08,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: or8rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; IMMONLY-NEXT: orb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x08,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: or8rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: orb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x0a,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or8rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} orb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x0a,0x3e]
+; NF-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; NF-NEXT: orb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x08,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t = load i8, ptr %b
@@ -84,14 +100,27 @@ entry:
}
define i16 @or16rm(i16 noundef %a, ptr %b) {
-; CHECK-LABEL: or16rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x0b,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: or16rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; NDD-NEXT: orw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x09,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: or16rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; IMMONLY-NEXT: orw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x09,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: or16rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: orw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x0b,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or16rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} orw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x0b,0x3e]
+; NF-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; NF-NEXT: orw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x09,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t = load i16, ptr %b
@@ -100,14 +129,27 @@ entry:
}
define i32 @or32rm(i32 noundef %a, ptr %b) {
-; CHECK-LABEL: or32rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x0b,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: or32rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NDD-NEXT: orl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x09,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: or32rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; IMMONLY-NEXT: orl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x09,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: or32rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: orl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x0b,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or32rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} orl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x0b,0x3e]
+; NF-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NF-NEXT: orl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x09,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t = load i32, ptr %b
@@ -116,14 +158,27 @@ entry:
}
define i64 @or64rm(i64 noundef %a, ptr %b) {
-; CHECK-LABEL: or64rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x0b,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: or64rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; NDD-NEXT: orq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x09,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: or64rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; IMMONLY-NEXT: orq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x09,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: or64rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: orq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x0b,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or64rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} orq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x0b,0x3e]
+; NF-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; NF-NEXT: orq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x09,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t = load i64, ptr %b
@@ -243,14 +298,27 @@ entry:
}
define i8 @or8mr(ptr %a, i8 noundef %b) {
-; CHECK-LABEL: or8mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orb (%rdi), %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x0a,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: or8mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: orb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x08,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: or8mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; IMMONLY-NEXT: orb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x08,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: or8mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: orb (%rdi), %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x0a,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or8mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} orb (%rdi), %sil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x0a,0x37]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT: orb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x08,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i8, ptr %a
@@ -259,14 +327,27 @@ entry:
}
define i16 @or16mr(ptr %a, i16 noundef %b) {
-; CHECK-LABEL: or16mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orw (%rdi), %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x0b,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: or16mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: orw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x09,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: or16mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: orw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x09,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: or16mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: orw (%rdi), %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x0b,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or16mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} orw (%rdi), %si, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x0b,0x37]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: orw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x09,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i16, ptr %a
@@ -275,14 +356,27 @@ entry:
}
define i32 @or32mr(ptr %a, i32 noundef %b) {
-; CHECK-LABEL: or32mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orl (%rdi), %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x0b,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: or32mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: orl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x09,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: or32mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: orl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x09,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: or32mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: orl (%rdi), %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x0b,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or32mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} orl (%rdi), %esi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x0b,0x37]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: orl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x09,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i32, ptr %a
@@ -291,14 +385,27 @@ entry:
}
define i64 @or64mr(ptr %a, i64 noundef %b) {
-; CHECK-LABEL: or64mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orq (%rdi), %rsi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x0b,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: or64mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: orq %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x09,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: or64mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: orq %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x09,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: or64mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: orq (%rdi), %rsi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x0b,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or64mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} orq (%rdi), %rsi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x0b,0x37]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: orq %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x09,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i64, ptr %a
@@ -307,14 +414,27 @@ entry:
}
define i16 @or16mi8(ptr %a) {
-; CHECK-LABEL: or16mi8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orw $123, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0x0f,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: or16mi8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: orw $123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xc8,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: or16mi8:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: orw $123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xc8,0x7b]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: or16mi8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: orw $123, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0x0f,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or16mi8:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} orw $123, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x83,0x0f,0x7b]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: orw $123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xc8,0x7b]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i16, ptr %a
@@ -323,14 +443,27 @@ entry:
}
define i32 @or32mi8(ptr %a) {
-; CHECK-LABEL: or32mi8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x0f,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: or32mi8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: orl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc8,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: or32mi8:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: orl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc8,0x7b]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: or32mi8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: orl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x0f,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or32mi8:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} orl $123, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x0f,0x7b]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: orl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc8,0x7b]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i32, ptr %a
@@ -339,14 +472,27 @@ entry:
}
define i64 @or64mi8(ptr %a) {
-; CHECK-LABEL: or64mi8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x0f,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: or64mi8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: orq $123, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xc8,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: or64mi8:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: orq $123, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xc8,0x7b]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: or64mi8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: orq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x0f,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or64mi8:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} orq $123, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0x0f,0x7b]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: orq $123, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xc8,0x7b]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i64, ptr %a
@@ -355,14 +501,27 @@ entry:
}
define i8 @or8mi(ptr %a) {
-; CHECK-LABEL: or8mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x0f,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: or8mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: orb $123, %al # EVEX TO LEGACY Compression encoding: [0x0c,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: or8mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; IMMONLY-NEXT: orb $123, %al # EVEX TO LEGACY Compression encoding: [0x0c,0x7b]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: or8mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: orb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x0f,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or8mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} orb $123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x0f,0x7b]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT: orb $123, %al # EVEX TO LEGACY Compression encoding: [0x0c,0x7b]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i8, ptr %a
@@ -371,15 +530,30 @@ entry:
}
define i16 @or16mi(ptr %a) {
-; CHECK-LABEL: or16mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orw $1234, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x0f,0xd2,0x04]
-; CHECK-NEXT: # imm = 0x4D2
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: or16mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: orw $1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x0d,0xd2,0x04]
+; NDD-NEXT: # imm = 0x4D2
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: or16mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: orw $1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x0d,0xd2,0x04]
+; IMMONLY-NEXT: # imm = 0x4D2
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: or16mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: orw $1234, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x0f,0xd2,0x04]
+; MEM-NEXT: # imm = 0x4D2
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or16mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} orw $1234, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x81,0x0f,0xd2,0x04]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: orw $1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x0d,0xd2,0x04]
; NF-NEXT: # imm = 0x4D2
; NF-NEXT: retq # encoding: [0xc3]
entry:
@@ -389,15 +563,30 @@ entry:
}
define i32 @or32mi(ptr %a) {
-; CHECK-LABEL: or32mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x0f,0x40,0xe2,0x01,0x00]
-; CHECK-NEXT: # imm = 0x1E240
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: or32mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: orl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x0d,0x40,0xe2,0x01,0x00]
+; NDD-NEXT: # imm = 0x1E240
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: or32mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: orl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x0d,0x40,0xe2,0x01,0x00]
+; IMMONLY-NEXT: # imm = 0x1E240
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: or32mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: orl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x0f,0x40,0xe2,0x01,0x00]
+; MEM-NEXT: # imm = 0x1E240
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or32mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} orl $123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x0f,0x40,0xe2,0x01,0x00]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: orl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x0d,0x40,0xe2,0x01,0x00]
; NF-NEXT: # imm = 0x1E240
; NF-NEXT: retq # encoding: [0xc3]
entry:
@@ -407,15 +596,30 @@ entry:
}
define i64 @or64mi(ptr %a) {
-; CHECK-LABEL: or64mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: orq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x0f,0x40,0xe2,0x01,0x00]
-; CHECK-NEXT: # imm = 0x1E240
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: or64mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: orq $123456, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0d,0x40,0xe2,0x01,0x00]
+; NDD-NEXT: # imm = 0x1E240
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: or64mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: orq $123456, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0d,0x40,0xe2,0x01,0x00]
+; IMMONLY-NEXT: # imm = 0x1E240
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: or64mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: orq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x0f,0x40,0xe2,0x01,0x00]
+; MEM-NEXT: # imm = 0x1E240
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: or64mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} orq $123456, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x0f,0x40,0xe2,0x01,0x00]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: orq $123456, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0d,0x40,0xe2,0x01,0x00]
; NF-NEXT: # imm = 0x1E240
; NF-NEXT: retq # encoding: [0xc3]
entry:
@@ -521,19 +725,40 @@ define i1 @orflag64rr(i64 %a, i64 %b) {
}
define i1 @orflag8rm(ptr %ptr, i8 %b) {
-; CHECK-LABEL: orflag8rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6]
-; CHECK-NEXT: orb (%rdi), %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x0a,0x07]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: orflag8rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: notb %sil, %cl # encoding: [0x62,0xf4,0x74,0x18,0xf6,0xd6]
+; NDD-NEXT: orb %al, %cl # EVEX TO LEGACY Compression encoding: [0x08,0xc1]
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: orflag8rm:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; IMMONLY-NEXT: notb %sil, %cl # encoding: [0x62,0xf4,0x74,0x18,0xf6,0xd6]
+; IMMONLY-NEXT: orb %al, %cl # EVEX TO LEGACY Compression encoding: [0x08,0xc1]
+; IMMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMMONLY-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; IMMONLY-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: orflag8rm:
+; MEM: # %bb.0:
+; MEM-NEXT: notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6]
+; MEM-NEXT: orb (%rdi), %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x0a,0x07]
+; MEM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEM-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; MEM-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: orflag8rm:
; NF: # %bb.0:
-; NF-NEXT: notb %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xf6,0xd6]
-; NF-NEXT: orb (%rdi), %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x0a,0x07]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT: notb %sil, %cl # encoding: [0x62,0xf4,0x74,0x18,0xf6,0xd6]
+; NF-NEXT: orb %al, %cl # EVEX TO LEGACY Compression encoding: [0x08,0xc1]
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
; NF-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
@@ -547,19 +772,40 @@ define i1 @orflag8rm(ptr %ptr, i8 %b) {
}
define i1 @orflag16rm(ptr %ptr, i16 %b) {
-; CHECK-LABEL: orflag16rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: notw %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xf7,0xd6]
-; CHECK-NEXT: orw (%rdi), %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x0b,0x07]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: orflag16rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: notw %si, %cx # encoding: [0x62,0xf4,0x75,0x18,0xf7,0xd6]
+; NDD-NEXT: orw %ax, %cx # EVEX TO LEGACY Compression encoding: [0x66,0x09,0xc1]
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: orflag16rm:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: notw %si, %cx # encoding: [0x62,0xf4,0x75,0x18,0xf7,0xd6]
+; IMMONLY-NEXT: orw %ax, %cx # EVEX TO LEGACY Compression encoding: [0x66,0x09,0xc1]
+; IMMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMMONLY-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; IMMONLY-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: orflag16rm:
+; MEM: # %bb.0:
+; MEM-NEXT: notw %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xf7,0xd6]
+; MEM-NEXT: orw (%rdi), %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x0b,0x07]
+; MEM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEM-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; MEM-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: orflag16rm:
; NF: # %bb.0:
-; NF-NEXT: notw %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xf7,0xd6]
-; NF-NEXT: orw (%rdi), %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x0b,0x07]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: notw %si, %cx # encoding: [0x62,0xf4,0x75,0x18,0xf7,0xd6]
+; NF-NEXT: orw %ax, %cx # EVEX TO LEGACY Compression encoding: [0x66,0x09,0xc1]
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
; NF-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
@@ -573,17 +819,36 @@ define i1 @orflag16rm(ptr %ptr, i16 %b) {
}
define i1 @orflag32rm(ptr %ptr, i32 %b) {
-; CHECK-LABEL: orflag32rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: orl (%rdi), %esi # EVEX TO LEGACY Compression encoding: [0x0b,0x37]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: orflag32rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: orl %eax, %esi # EVEX TO LEGACY Compression encoding: [0x09,0xc6]
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: orflag32rm:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: orl %eax, %esi # EVEX TO LEGACY Compression encoding: [0x09,0xc6]
+; IMMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMMONLY-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A]
+; IMMONLY-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: orflag32rm:
+; MEM: # %bb.0:
+; MEM-NEXT: orl (%rdi), %esi # EVEX TO LEGACY Compression encoding: [0x0b,0x37]
+; MEM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEM-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A]
+; MEM-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: orflag32rm:
; NF: # %bb.0:
-; NF-NEXT: orl (%rdi), %esi # EVEX TO LEGACY Compression encoding: [0x0b,0x37]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: orl %eax, %esi # EVEX TO LEGACY Compression encoding: [0x09,0xc6]
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A]
; NF-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
@@ -596,17 +861,36 @@ define i1 @orflag32rm(ptr %ptr, i32 %b) {
}
define i1 @orflag64rm(ptr %ptr, i64 %b) {
-; CHECK-LABEL: orflag64rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: orq (%rdi), %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x0b,0x37]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: orflag64rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: orq %rax, %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x09,0xc6]
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: orflag64rm:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: orq %rax, %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x09,0xc6]
+; IMMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMMONLY-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A]
+; IMMONLY-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: orflag64rm:
+; MEM: # %bb.0:
+; MEM-NEXT: orq (%rdi), %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x0b,0x37]
+; MEM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEM-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A]
+; MEM-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: orflag64rm:
; NF: # %bb.0:
-; NF-NEXT: orq (%rdi), %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x0b,0x37]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: orq %rax, %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x09,0xc6]
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A]
; NF-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
@@ -691,18 +975,36 @@ define i1 @orflag32ri(i32 %a) {
}
define i1 @orflag64ri(i64 %a) {
-; CHECK-LABEL: orflag64ri:
-; CHECK: # %bb.0:
-; CHECK-NEXT: orq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xcf,0x40,0xe2,0x01,0x00]
-; CHECK-NEXT: # imm = 0x1E240
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: orflag64ri:
+; NDD: # %bb.0:
+; NDD-NEXT: orq $123456, %rdi # encoding: [0x48,0x81,0xcf,0x40,0xe2,0x01,0x00]
+; NDD-NEXT: # imm = 0x1E240
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMM-LABEL: orflag64ri:
+; IMM: # %bb.0:
+; IMM-NEXT: orq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xcf,0x40,0xe2,0x01,0x00]
+; IMM-NEXT: # imm = 0x1E240
+; IMM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMM-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; IMM-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; IMM-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: orflag64ri:
+; MEMONLY: # %bb.0:
+; MEMONLY-NEXT: orq $123456, %rdi # encoding: [0x48,0x81,0xcf,0x40,0xe2,0x01,0x00]
+; MEMONLY-NEXT: # imm = 0x1E240
+; MEMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEMONLY-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; MEMONLY-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; MEMONLY-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: orflag64ri:
; NF: # %bb.0:
-; NF-NEXT: orq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xcf,0x40,0xe2,0x01,0x00]
+; NF-NEXT: orq $123456, %rdi # encoding: [0x48,0x81,0xcf,0x40,0xe2,0x01,0x00]
; NF-NEXT: # imm = 0x1E240
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
@@ -760,17 +1062,33 @@ define i1 @orflag32ri8(i32 %a) {
}
define i1 @orflag64ri8(i64 %a) {
-; CHECK-LABEL: orflag64ri8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: orq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xcf,0x7b]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: orflag64ri8:
+; NDD: # %bb.0:
+; NDD-NEXT: orq $123, %rdi # encoding: [0x48,0x83,0xcf,0x7b]
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMM-LABEL: orflag64ri8:
+; IMM: # %bb.0:
+; IMM-NEXT: orq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xcf,0x7b]
+; IMM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMM-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; IMM-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; IMM-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: orflag64ri8:
+; MEMONLY: # %bb.0:
+; MEMONLY-NEXT: orq $123, %rdi # encoding: [0x48,0x83,0xcf,0x7b]
+; MEMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEMONLY-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; MEMONLY-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; MEMONLY-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: orflag64ri8:
; NF: # %bb.0:
-; NF-NEXT: orq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xcf,0x7b]
+; NF-NEXT: orq $123, %rdi # encoding: [0x48,0x83,0xcf,0x7b]
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
; NF-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
diff --git a/llvm/test/CodeGen/X86/apx/rol.ll b/llvm/test/CodeGen/X86/apx/rol.ll
index 58ddc55ab6aba..098ea96c4bae6 100644
--- a/llvm/test/CodeGen/X86/apx/rol.ll
+++ b/llvm/test/CodeGen/X86/apx/rol.ll
@@ -1,11 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM
define i8 @rol8m1(ptr %ptr) {
-; CHECK-LABEL: rol8m1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: rolb (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0x07]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: rol8m1:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: rolb %al # EVEX TO LEGACY Compression encoding: [0xd0,0xc0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: rol8m1:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: rolb (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0x07]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
%0 = shl i8 %a, 1
@@ -15,20 +24,32 @@ entry:
}
define i8 @rol8m1_intrinsic(ptr %ptr) {
-; CHECK-LABEL: rol8m1_intrinsic:
-; CHECK: # %bb.0:
-; CHECK-NEXT: rolb (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0x07]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: rol8m1_intrinsic:
+; NDD: # %bb.0:
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: rolb %al # EVEX TO LEGACY Compression encoding: [0xd0,0xc0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: rol8m1_intrinsic:
+; MEM: # %bb.0:
+; MEM-NEXT: rolb (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0x07]
+; MEM-NEXT: retq # encoding: [0xc3]
%a = load i8, ptr %ptr
%f = call i8 @llvm.fshr.i8(i8 %a, i8 %a, i8 7)
ret i8 %f
}
define i16 @rol16m1(ptr %ptr) {
-; CHECK-LABEL: rol16m1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: rolw (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd1,0x07]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: rol16m1:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: rolw %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd1,0xc0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: rol16m1:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: rolw (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd1,0x07]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
%0 = shl i16 %a, 1
@@ -38,20 +59,32 @@ entry:
}
define i16 @rol16m1_intrinsic(ptr %ptr) {
-; CHECK-LABEL: rol16m1_intrinsic:
-; CHECK: # %bb.0:
-; CHECK-NEXT: rolw (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd1,0x07]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: rol16m1_intrinsic:
+; NDD: # %bb.0:
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: rolw %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd1,0xc0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: rol16m1_intrinsic:
+; MEM: # %bb.0:
+; MEM-NEXT: rolw (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd1,0x07]
+; MEM-NEXT: retq # encoding: [0xc3]
%a = load i16, ptr %ptr
%f = call i16 @llvm.fshr.i16(i16 %a, i16 %a, i16 15)
ret i16 %f
}
define i32 @rol32m1(ptr %ptr) {
-; CHECK-LABEL: rol32m1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: roll (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0x07]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: rol32m1:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: roll %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xc0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: rol32m1:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: roll (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0x07]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
%0 = shl i32 %a, 1
@@ -61,20 +94,32 @@ entry:
}
define i32 @rol32m1_intrinsic(ptr %ptr) {
-; CHECK-LABEL: rol32m1_intrinsic:
-; CHECK: # %bb.0:
-; CHECK-NEXT: roll (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0x07]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: rol32m1_intrinsic:
+; NDD: # %bb.0:
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: roll %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xc0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: rol32m1_intrinsic:
+; MEM: # %bb.0:
+; MEM-NEXT: roll (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0x07]
+; MEM-NEXT: retq # encoding: [0xc3]
%a = load i32, ptr %ptr
%f = call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 31)
ret i32 %f
}
define i64 @rol64m1(ptr %ptr) {
-; CHECK-LABEL: rol64m1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: rolq (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0x07]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: rol64m1:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: rolq %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd1,0xc0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: rol64m1:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: rolq (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0x07]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
%0 = shl i64 %a, 1
@@ -84,22 +129,36 @@ entry:
}
define i64 @rol64m1_intrinsic(ptr %ptr) {
-; CHECK-LABEL: rol64m1_intrinsic:
-; CHECK: # %bb.0:
-; CHECK-NEXT: rolq (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0x07]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: rol64m1_intrinsic:
+; NDD: # %bb.0:
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: rolq %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd1,0xc0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: rol64m1_intrinsic:
+; MEM: # %bb.0:
+; MEM-NEXT: rolq (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0x07]
+; MEM-NEXT: retq # encoding: [0xc3]
%a = load i64, ptr %ptr
%f = call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 63)
ret i64 %f
}
define i8 @rol8mcl(ptr %ptr, i8 %cl) {
-; CHECK-LABEL: rol8mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: rolb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x07]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: rol8mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: rolb %cl, %al # EVEX TO LEGACY Compression encoding: [0xd2,0xc0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: rol8mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: rolb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x07]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
%0 = shl i8 %a, %cl
@@ -110,12 +169,20 @@ entry:
}
define i16 @rol16mcl(ptr %ptr, i16 %cl) {
-; CHECK-LABEL: rol16mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: rolw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x07]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: rol16mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: rolw %cl, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd3,0xc0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: rol16mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: rolw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x07]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
%0 = shl i16 %a, %cl
@@ -126,12 +193,20 @@ entry:
}
define i32 @rol32mcl(ptr %ptr, i32 %cl) {
-; CHECK-LABEL: rol32mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: roll %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x07]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: rol32mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: roll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xc0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: rol32mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: roll %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x07]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
%0 = shl i32 %a, %cl
@@ -142,12 +217,20 @@ entry:
}
define i64 @rol64mcl(ptr %ptr, i64 %cl) {
-; CHECK-LABEL: rol64mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT: rolq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x07]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: rol64mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $rcx
+; NDD-NEXT: rolq %cl, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd3,0xc0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: rol64mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $rcx
+; MEM-NEXT: rolq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x07]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
%0 = shl i64 %a, %cl
@@ -158,10 +241,16 @@ entry:
}
define i8 @rol8mi(ptr %ptr) {
-; CHECK-LABEL: rol8mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: rolb $3, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x07,0x03]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: rol8mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: rolb $3, %al # EVEX TO LEGACY Compression encoding: [0xc0,0xc0,0x03]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: rol8mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: rolb $3, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x07,0x03]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
%0 = shl i8 %a, 3
@@ -171,10 +260,16 @@ entry:
}
define i16 @rol16mi(ptr %ptr) {
-; CHECK-LABEL: rol16mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: rolw $3, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xc1,0x07,0x03]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: rol16mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: rolw $3, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xc1,0xc0,0x03]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: rol16mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: rolw $3, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xc1,0x07,0x03]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
%0 = shl i16 %a, 3
@@ -184,10 +279,16 @@ entry:
}
define i32 @rol32mi(ptr %ptr) {
-; CHECK-LABEL: rol32mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: roll $3, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x07,0x03]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: rol32mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: roll $3, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xc0,0x03]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: rol32mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: roll $3, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x07,0x03]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
%0 = shl i32 %a, 3
@@ -197,10 +298,16 @@ entry:
}
define i64 @rol64mi(ptr %ptr) {
-; CHECK-LABEL: rol64mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: rolq $3, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x07,0x03]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: rol64mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: rolq $3, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xc0,0x03]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: rol64mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: rolq $3, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x07,0x03]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
%0 = shl i64 %a, 3
diff --git a/llvm/test/CodeGen/X86/apx/ror.ll b/llvm/test/CodeGen/X86/apx/ror.ll
index e2b65e776ed57..4cb7421c218fb 100644
--- a/llvm/test/CodeGen/X86/apx/ror.ll
+++ b/llvm/test/CodeGen/X86/apx/ror.ll
@@ -1,11 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM
define i8 @ror8m1(ptr %ptr) {
-; CHECK-LABEL: ror8m1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: rorb (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0x0f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: ror8m1:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: rorb %al # EVEX TO LEGACY Compression encoding: [0xd0,0xc8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: ror8m1:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: rorb (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0x0f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
%0 = lshr i8 %a, 1
@@ -15,10 +24,16 @@ entry:
}
define i16 @ror16m1(ptr %ptr) {
-; CHECK-LABEL: ror16m1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: rorw (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd1,0x0f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: ror16m1:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: rorw %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd1,0xc8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: ror16m1:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: rorw (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd1,0x0f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
%0 = lshr i16 %a, 1
@@ -28,10 +43,16 @@ entry:
}
define i32 @ror32m1(ptr %ptr) {
-; CHECK-LABEL: ror32m1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: rorl (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0x0f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: ror32m1:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: rorl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xc8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: ror32m1:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: rorl (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0x0f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
%0 = lshr i32 %a, 1
@@ -41,10 +62,16 @@ entry:
}
define i64 @ror64m1(ptr %ptr) {
-; CHECK-LABEL: ror64m1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: rorq (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0x0f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: ror64m1:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: rorq %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd1,0xc8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: ror64m1:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: rorq (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0x0f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
%0 = lshr i64 %a, 1
@@ -54,12 +81,20 @@ entry:
}
define i8 @ror8mcl(ptr %ptr, i8 %cl) {
-; CHECK-LABEL: ror8mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: rorb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x0f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: ror8mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: rorb %cl, %al # EVEX TO LEGACY Compression encoding: [0xd2,0xc8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: ror8mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: rorb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x0f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
%0 = lshr i8 %a, %cl
@@ -70,12 +105,20 @@ entry:
}
define i8 @ror8mcl_mask(ptr %ptr, i8 %cl) {
-; CHECK-LABEL: ror8mcl_mask:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: rorb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x0f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: ror8mcl_mask:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: rorb %cl, %al # EVEX TO LEGACY Compression encoding: [0xd2,0xc8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: ror8mcl_mask:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: rorb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x0f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%shamt = and i8 %cl, 31
%a = load i8, ptr %ptr
@@ -87,12 +130,20 @@ entry:
}
define i16 @ror16mcl(ptr %ptr, i16 %cl) {
-; CHECK-LABEL: ror16mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: rorw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x0f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: ror16mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: rorw %cl, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd3,0xc8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: ror16mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: rorw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x0f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
%0 = lshr i16 %a, %cl
@@ -103,12 +154,20 @@ entry:
}
define i16 @ror16mcl_mask(ptr %ptr, i16 %cl) {
-; CHECK-LABEL: ror16mcl_mask:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: rorw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x0f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: ror16mcl_mask:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: rorw %cl, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd3,0xc8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: ror16mcl_mask:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: rorw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x0f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%shamt = and i16 %cl, 31
%a = load i16, ptr %ptr
@@ -120,12 +179,20 @@ entry:
}
define i32 @ror32mcl(ptr %ptr, i32 %cl) {
-; CHECK-LABEL: ror32mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: rorl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x0f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: ror32mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: rorl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xc8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: ror32mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: rorl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x0f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
%0 = lshr i32 %a, %cl
@@ -136,12 +203,20 @@ entry:
}
define i32 @ror32mcl_mask(ptr %ptr, i32 %cl) {
-; CHECK-LABEL: ror32mcl_mask:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: rorl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x0f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: ror32mcl_mask:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: rorl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xc8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: ror32mcl_mask:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: rorl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x0f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%shamt = and i32 %cl, 31
%a = load i32, ptr %ptr
@@ -153,12 +228,20 @@ entry:
}
define i64 @ror64mcl(ptr %ptr, i64 %cl) {
-; CHECK-LABEL: ror64mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT: rorq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x0f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: ror64mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $rcx
+; NDD-NEXT: rorq %cl, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd3,0xc8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: ror64mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $rcx
+; MEM-NEXT: rorq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x0f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
%0 = lshr i64 %a, %cl
@@ -169,12 +252,20 @@ entry:
}
define i64 @ror64mcl_mask(ptr %ptr, i64 %cl) {
-; CHECK-LABEL: ror64mcl_mask:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT: rorq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x0f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: ror64mcl_mask:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $rcx
+; NDD-NEXT: rorq %cl, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd3,0xc8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: ror64mcl_mask:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $rcx
+; MEM-NEXT: rorq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x0f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%shamt = and i64 %cl, 63
%a = load i64, ptr %ptr
@@ -186,10 +277,16 @@ entry:
}
define i8 @ror8mi(ptr %ptr) {
-; CHECK-LABEL: ror8mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: rolb $5, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x07,0x05]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: ror8mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: rolb $5, %al # EVEX TO LEGACY Compression encoding: [0xc0,0xc0,0x05]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: ror8mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: rolb $5, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x07,0x05]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
%0 = lshr i8 %a, 3
@@ -199,10 +296,16 @@ entry:
}
define i16 @ror16mi(ptr %ptr) {
-; CHECK-LABEL: ror16mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: rolw $13, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xc1,0x07,0x0d]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: ror16mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: rolw $13, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xc1,0xc0,0x0d]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: ror16mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: rolw $13, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xc1,0x07,0x0d]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
%0 = lshr i16 %a, 3
@@ -212,10 +315,16 @@ entry:
}
define i32 @ror32mi(ptr %ptr) {
-; CHECK-LABEL: ror32mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: roll $29, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x07,0x1d]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: ror32mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: roll $29, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xc0,0x1d]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: ror32mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: roll $29, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x07,0x1d]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
%0 = lshr i32 %a, 3
@@ -225,10 +334,16 @@ entry:
}
define i64 @ror64mi(ptr %ptr) {
-; CHECK-LABEL: ror64mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: rolq $61, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x07,0x3d]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: ror64mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: rolq $61, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xc0,0x3d]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: ror64mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: rolq $61, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x07,0x3d]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
%0 = lshr i64 %a, 3
diff --git a/llvm/test/CodeGen/X86/apx/sar.ll b/llvm/test/CodeGen/X86/apx/sar.ll
index 25c067b46b9ce..91790e2f8f31e 100644
--- a/llvm/test/CodeGen/X86/apx/sar.ll
+++ b/llvm/test/CodeGen/X86/apx/sar.ll
@@ -1,11 +1,20 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM
define i8 @sar8m1(ptr %ptr) {
-; CHECK-LABEL: sar8m1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: sarb (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0x3f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sar8m1:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: sarb %al # EVEX TO LEGACY Compression encoding: [0xd0,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sar8m1:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: sarb (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0x3f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
%sar = ashr i8 %a, 1
@@ -13,10 +22,16 @@ entry:
}
define i16 @sar16m1(ptr %ptr) {
-; CHECK-LABEL: sar16m1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: sarw (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd1,0x3f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sar16m1:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: sarw %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd1,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sar16m1:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: sarw (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd1,0x3f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
%sar = ashr i16 %a, 1
@@ -24,10 +39,16 @@ entry:
}
define i32 @sar32m1(ptr %ptr) {
-; CHECK-LABEL: sar32m1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: sarl (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0x3f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sar32m1:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: sarl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sar32m1:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: sarl (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0x3f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
%sar = ashr i32 %a, 1
@@ -35,10 +56,16 @@ entry:
}
define i64 @sar64m1(ptr %ptr) {
-; CHECK-LABEL: sar64m1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: sarq (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0x3f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sar64m1:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: sarq %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd1,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sar64m1:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: sarq (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0x3f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
%sar = ashr i64 %a, 1
@@ -46,12 +73,20 @@ entry:
}
define i8 @sar8mcl(ptr %ptr, i8 %cl) {
-; CHECK-LABEL: sar8mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: sarb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x3f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sar8mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: sarb %cl, %al # EVEX TO LEGACY Compression encoding: [0xd2,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sar8mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: sarb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x3f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
%sar = ashr i8 %a, %cl
@@ -59,12 +94,20 @@ entry:
}
define i8 @sar8mcl_mask(ptr %ptr, i8 %cl) {
-; CHECK-LABEL: sar8mcl_mask:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: sarb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x3f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sar8mcl_mask:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: sarb %cl, %al # EVEX TO LEGACY Compression encoding: [0xd2,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sar8mcl_mask:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: sarb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x3f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
%shamt = and i8 %cl, 31
@@ -73,12 +116,20 @@ entry:
}
define i16 @sar16mcl(ptr %ptr, i16 %cl) {
-; CHECK-LABEL: sar16mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: sarw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x3f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sar16mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: sarw %cl, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd3,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sar16mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: sarw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x3f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
%sar = ashr i16 %a, %cl
@@ -86,12 +137,20 @@ entry:
}
define i16 @sar16mcl_mask(ptr %ptr, i16 %cl) {
-; CHECK-LABEL: sar16mcl_mask:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: sarw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x3f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sar16mcl_mask:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: sarw %cl, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd3,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sar16mcl_mask:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: sarw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x3f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
%shamt = and i16 %cl, 31
@@ -100,12 +159,20 @@ entry:
}
define i32 @sar32mcl(ptr %ptr, i32 %cl) {
-; CHECK-LABEL: sar32mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: sarl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x3f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sar32mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: sarl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sar32mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: sarl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x3f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
%sar = ashr i32 %a, %cl
@@ -113,12 +180,20 @@ entry:
}
define i32 @sar32mcl_mask(ptr %ptr, i32 %cl) {
-; CHECK-LABEL: sar32mcl_mask:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: sarl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x3f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sar32mcl_mask:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: sarl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sar32mcl_mask:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: sarl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x3f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
%shamt = and i32 %cl, 31
@@ -127,12 +202,20 @@ entry:
}
define i64 @sar64mcl(ptr %ptr, i64 %cl) {
-; CHECK-LABEL: sar64mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT: sarq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x3f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sar64mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $rcx
+; NDD-NEXT: sarq %cl, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd3,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sar64mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $rcx
+; MEM-NEXT: sarq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x3f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
%sar = ashr i64 %a, %cl
@@ -140,12 +223,20 @@ entry:
}
define i64 @sar64mcl_mask(ptr %ptr, i64 %cl) {
-; CHECK-LABEL: sar64mcl_mask:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT: sarq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x3f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sar64mcl_mask:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $rcx
+; NDD-NEXT: sarq %cl, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd3,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sar64mcl_mask:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $rcx
+; MEM-NEXT: sarq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x3f]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
%shamt = and i64 %cl, 63
@@ -154,10 +245,16 @@ entry:
}
define i8 @sar8mi(ptr %ptr) {
-; CHECK-LABEL: sar8mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: sarb $4, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x3f,0x04]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sar8mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: sarb $4, %al # EVEX TO LEGACY Compression encoding: [0xc0,0xf8,0x04]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sar8mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: sarb $4, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x3f,0x04]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
%sar = ashr i8 %a, 4
@@ -165,10 +262,16 @@ entry:
}
define i16 @sar16mi(ptr %ptr) {
-; CHECK-LABEL: sar16mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: sarw $4, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xc1,0x3f,0x04]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sar16mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: sarw $4, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xc1,0xf8,0x04]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sar16mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: sarw $4, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xc1,0x3f,0x04]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
%sar = ashr i16 %a, 4
@@ -176,10 +279,16 @@ entry:
}
define i32 @sar32mi(ptr %ptr) {
-; CHECK-LABEL: sar32mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: sarl $4, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x3f,0x04]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sar32mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: sarl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xf8,0x04]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sar32mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: sarl $4, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x3f,0x04]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
%sar = ashr i32 %a, 4
@@ -187,10 +296,16 @@ entry:
}
define i64 @sar64mi(ptr %ptr) {
-; CHECK-LABEL: sar64mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: sarq $4, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x3f,0x04]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sar64mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: sarq $4, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xf8,0x04]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sar64mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: sarq $4, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x3f,0x04]
+; MEM-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
%sar = ashr i64 %a, 4
diff --git a/llvm/test/CodeGen/X86/apx/sbb.ll b/llvm/test/CodeGen/X86/apx/sbb.ll
index a67419bbd5db7..94a144b125f6e 100644
--- a/llvm/test/CodeGen/X86/apx/sbb.ll
+++ b/llvm/test/CodeGen/X86/apx/sbb.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,IMM,IMMONLY
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM,MEMONLY
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,IMM,MEM,BOTH
define i8 @sbb8rr(i8 %a, i8 %b, i8 %x, i8 %y) nounwind {
; CHECK-LABEL: sbb8rr:
@@ -54,11 +57,27 @@ define i64 @sbb64rr(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
}
define i8 @sbb8rm(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: sbb8rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1]
-; CHECK-NEXT: sbbb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x1a,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb8rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movl %edi, %eax # encoding: [0x89,0xf8]
+; NDD-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1]
+; NDD-NEXT: {evex} sbbb (%rsi), %al # encoding: [0x62,0xf4,0x7c,0x08,0x1a,0x06]
+; NDD-NEXT: # kill: def $al killed $al killed $eax
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sbb8rm:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movl %edi, %eax # encoding: [0x89,0xf8]
+; IMMONLY-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1]
+; IMMONLY-NEXT: {evex} sbbb (%rsi), %al # encoding: [0x62,0xf4,0x7c,0x08,0x1a,0x06]
+; IMMONLY-NEXT: # kill: def $al killed $al killed $eax
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sbb8rm:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1]
+; MEM-NEXT: sbbb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x1a,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
%b = load i8, ptr %ptr
%s = sub i8 %a, %b
%k = icmp ugt i8 %x, %y
@@ -68,11 +87,27 @@ define i8 @sbb8rm(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind {
}
define i16 @sbb16rm(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: sbb16rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1]
-; CHECK-NEXT: sbbw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x1b,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb16rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movl %edi, %eax # encoding: [0x89,0xf8]
+; NDD-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1]
+; NDD-NEXT: {evex} sbbw (%rsi), %ax # encoding: [0x62,0xf4,0x7d,0x08,0x1b,0x06]
+; NDD-NEXT: # kill: def $ax killed $ax killed $eax
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sbb16rm:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movl %edi, %eax # encoding: [0x89,0xf8]
+; IMMONLY-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1]
+; IMMONLY-NEXT: {evex} sbbw (%rsi), %ax # encoding: [0x62,0xf4,0x7d,0x08,0x1b,0x06]
+; IMMONLY-NEXT: # kill: def $ax killed $ax killed $eax
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sbb16rm:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1]
+; MEM-NEXT: sbbw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x1b,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
%b = load i16, ptr %ptr
%s = sub i16 %a, %b
%k = icmp ugt i16 %x, %y
@@ -82,11 +117,25 @@ define i16 @sbb16rm(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind {
}
define i32 @sbb32rm(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind {
-; CHECK-LABEL: sbb32rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1]
-; CHECK-NEXT: sbbl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x1b,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb32rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movl %edi, %eax # encoding: [0x89,0xf8]
+; NDD-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1]
+; NDD-NEXT: {evex} sbbl (%rsi), %eax # encoding: [0x62,0xf4,0x7c,0x08,0x1b,0x06]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sbb32rm:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movl %edi, %eax # encoding: [0x89,0xf8]
+; IMMONLY-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1]
+; IMMONLY-NEXT: {evex} sbbl (%rsi), %eax # encoding: [0x62,0xf4,0x7c,0x08,0x1b,0x06]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sbb32rm:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1]
+; MEM-NEXT: sbbl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x1b,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
%b = load i32, ptr %ptr
%s = sub i32 %a, %b
%k = icmp ugt i32 %x, %y
@@ -96,11 +145,25 @@ define i32 @sbb32rm(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind {
}
define i64 @sbb64rm(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: sbb64rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1]
-; CHECK-NEXT: sbbq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x1b,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb64rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movq %rdi, %rax # encoding: [0x48,0x89,0xf8]
+; NDD-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1]
+; NDD-NEXT: {evex} sbbq (%rsi), %rax # encoding: [0x62,0xf4,0xfc,0x08,0x1b,0x06]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sbb64rm:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movq %rdi, %rax # encoding: [0x48,0x89,0xf8]
+; IMMONLY-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1]
+; IMMONLY-NEXT: {evex} sbbq (%rsi), %rax # encoding: [0x62,0xf4,0xfc,0x08,0x1b,0x06]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sbb64rm:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1]
+; MEM-NEXT: sbbq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x1b,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
%b = load i64, ptr %ptr
%s = sub i64 %a, %b
%k = icmp ugt i64 %x, %y
@@ -138,12 +201,26 @@ define i32 @sbb32ri8(i32 %a, i32 %x, i32 %y) nounwind {
}
define i64 @sbb64ri8(i64 %a, i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: sbb64ri8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
-; CHECK-NEXT: sbbq $0, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xdf,0x00]
-; CHECK-NEXT: addq $-123, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xc0,0x85]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb64ri8:
+; NDD: # %bb.0:
+; NDD-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; NDD-NEXT: sbbq $0, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xdf,0x00]
+; NDD-NEXT: addq $-123, %rax # encoding: [0x48,0x83,0xc0,0x85]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMM-LABEL: sbb64ri8:
+; IMM: # %bb.0:
+; IMM-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; IMM-NEXT: sbbq $0, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xdf,0x00]
+; IMM-NEXT: addq $-123, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xc0,0x85]
+; IMM-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: sbb64ri8:
+; MEMONLY: # %bb.0:
+; MEMONLY-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; MEMONLY-NEXT: sbbq $0, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xdf,0x00]
+; MEMONLY-NEXT: addq $-123, %rax # encoding: [0x48,0x83,0xc0,0x85]
+; MEMONLY-NEXT: retq # encoding: [0xc3]
%s = sub i64 %a, 123
%k = icmp ugt i64 %x, %y
%z = zext i1 %k to i64
@@ -196,13 +273,29 @@ define i32 @sbb32ri(i32 %a, i32 %x, i32 %y) nounwind {
}
define i64 @sbb64ri(i64 %a, i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: sbb64ri:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
-; CHECK-NEXT: sbbq $0, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xdf,0x00]
-; CHECK-NEXT: addq $-123456, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x05,0xc0,0x1d,0xfe,0xff]
-; CHECK-NEXT: # imm = 0xFFFE1DC0
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb64ri:
+; NDD: # %bb.0:
+; NDD-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; NDD-NEXT: sbbq $0, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xdf,0x00]
+; NDD-NEXT: addq $-123456, %rax # encoding: [0x48,0x05,0xc0,0x1d,0xfe,0xff]
+; NDD-NEXT: # imm = 0xFFFE1DC0
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMM-LABEL: sbb64ri:
+; IMM: # %bb.0:
+; IMM-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; IMM-NEXT: sbbq $0, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xdf,0x00]
+; IMM-NEXT: addq $-123456, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x05,0xc0,0x1d,0xfe,0xff]
+; IMM-NEXT: # imm = 0xFFFE1DC0
+; IMM-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: sbb64ri:
+; MEMONLY: # %bb.0:
+; MEMONLY-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; MEMONLY-NEXT: sbbq $0, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xdf,0x00]
+; MEMONLY-NEXT: addq $-123456, %rax # encoding: [0x48,0x05,0xc0,0x1d,0xfe,0xff]
+; MEMONLY-NEXT: # imm = 0xFFFE1DC0
+; MEMONLY-NEXT: retq # encoding: [0xc3]
%s = sub i64 %a, 123456
%k = icmp ugt i64 %x, %y
%z = zext i1 %k to i64
@@ -211,11 +304,25 @@ define i64 @sbb64ri(i64 %a, i64 %x, i64 %y) nounwind {
}
define i8 @sbb8mr(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: sbb8mr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1]
-; CHECK-NEXT: sbbb %dil, (%rsi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x18,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb8mr:
+; NDD: # %bb.0:
+; NDD-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; NDD-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1]
+; NDD-NEXT: sbbb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x18,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sbb8mr:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; IMMONLY-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1]
+; IMMONLY-NEXT: sbbb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x18,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sbb8mr:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1]
+; MEM-NEXT: sbbb %dil, (%rsi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x18,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
%b = load i8, ptr %ptr
%s = sub i8 %b, %a
%k = icmp ugt i8 %x, %y
@@ -225,11 +332,25 @@ define i8 @sbb8mr(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind {
}
define i16 @sbb16mr(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: sbb16mr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1]
-; CHECK-NEXT: sbbw %di, (%rsi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x19,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb16mr:
+; NDD: # %bb.0:
+; NDD-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; NDD-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1]
+; NDD-NEXT: sbbw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x19,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sbb16mr:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; IMMONLY-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1]
+; IMMONLY-NEXT: sbbw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x19,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sbb16mr:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1]
+; MEM-NEXT: sbbw %di, (%rsi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x19,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
%b = load i16, ptr %ptr
%s = sub i16 %b, %a
%k = icmp ugt i16 %x, %y
@@ -239,11 +360,25 @@ define i16 @sbb16mr(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind {
}
define i32 @sbb32mr(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind {
-; CHECK-LABEL: sbb32mr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1]
-; CHECK-NEXT: sbbl %edi, (%rsi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x19,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb32mr:
+; NDD: # %bb.0:
+; NDD-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NDD-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1]
+; NDD-NEXT: sbbl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x19,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sbb32mr:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; IMMONLY-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1]
+; IMMONLY-NEXT: sbbl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x19,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sbb32mr:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1]
+; MEM-NEXT: sbbl %edi, (%rsi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x19,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
%b = load i32, ptr %ptr
%s = sub i32 %b, %a
%k = icmp ugt i32 %x, %y
@@ -253,11 +388,25 @@ define i32 @sbb32mr(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind {
}
define i64 @sbb64mr(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: sbb64mr:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1]
-; CHECK-NEXT: sbbq %rdi, (%rsi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x19,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb64mr:
+; NDD: # %bb.0:
+; NDD-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; NDD-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1]
+; NDD-NEXT: sbbq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x19,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sbb64mr:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; IMMONLY-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1]
+; IMMONLY-NEXT: sbbq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x19,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sbb64mr:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1]
+; MEM-NEXT: sbbq %rdi, (%rsi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x19,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
%b = load i64, ptr %ptr
%s = sub i64 %b, %a
%k = icmp ugt i64 %x, %y
@@ -267,12 +416,28 @@ define i64 @sbb64mr(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind {
}
define i16 @sbb16mi8(ptr %ptr, i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: sbb16mi8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2]
-; CHECK-NEXT: sbbw $0, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0x1f,0x00]
-; CHECK-NEXT: addw $-123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xc0,0x85]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb16mi8:
+; NDD: # %bb.0:
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2]
+; NDD-NEXT: sbbw $0, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xd8,0x00]
+; NDD-NEXT: addw $-123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xc0,0x85]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sbb16mi8:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2]
+; IMMONLY-NEXT: sbbw $0, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xd8,0x00]
+; IMMONLY-NEXT: addw $-123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xc0,0x85]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sbb16mi8:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2]
+; MEM-NEXT: sbbw $0, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0x1f,0x00]
+; MEM-NEXT: addw $-123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xc0,0x85]
+; MEM-NEXT: retq # encoding: [0xc3]
%a = load i16, ptr %ptr
%s = sub i16 %a, 123
%k = icmp ugt i16 %x, %y
@@ -282,12 +447,28 @@ define i16 @sbb16mi8(ptr %ptr, i16 %x, i16 %y) nounwind {
}
define i32 @sbb32mi8(ptr %ptr, i32 %x, i32 %y) nounwind {
-; CHECK-LABEL: sbb32mi8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2]
-; CHECK-NEXT: sbbl $0, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x1f,0x00]
-; CHECK-NEXT: addl $-123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x85]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb32mi8:
+; NDD: # %bb.0:
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2]
+; NDD-NEXT: sbbl $0, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xd8,0x00]
+; NDD-NEXT: addl $-123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x85]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sbb32mi8:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2]
+; IMMONLY-NEXT: sbbl $0, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xd8,0x00]
+; IMMONLY-NEXT: addl $-123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x85]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sbb32mi8:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2]
+; MEM-NEXT: sbbl $0, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x1f,0x00]
+; MEM-NEXT: addl $-123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x85]
+; MEM-NEXT: retq # encoding: [0xc3]
%a = load i32, ptr %ptr
%s = sub i32 %a, 123
%k = icmp ugt i32 %x, %y
@@ -297,12 +478,35 @@ define i32 @sbb32mi8(ptr %ptr, i32 %x, i32 %y) nounwind {
}
define i64 @sbb64mi8(ptr %ptr, i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: sbb64mi8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
-; CHECK-NEXT: sbbq $0, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x1f,0x00]
-; CHECK-NEXT: addq $-123, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xc0,0x85]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb64mi8:
+; NDD: # %bb.0:
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; NDD-NEXT: sbbq $0, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xd8,0x00]
+; NDD-NEXT: addq $-123, %rax # encoding: [0x48,0x83,0xc0,0x85]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sbb64mi8:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; IMMONLY-NEXT: sbbq $0, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xd8,0x00]
+; IMMONLY-NEXT: addq $-123, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xc0,0x85]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: sbb64mi8:
+; MEMONLY: # %bb.0:
+; MEMONLY-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; MEMONLY-NEXT: sbbq $0, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x1f,0x00]
+; MEMONLY-NEXT: addq $-123, %rax # encoding: [0x48,0x83,0xc0,0x85]
+; MEMONLY-NEXT: retq # encoding: [0xc3]
+;
+; BOTH-LABEL: sbb64mi8:
+; BOTH: # %bb.0:
+; BOTH-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; BOTH-NEXT: sbbq $0, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x1f,0x00]
+; BOTH-NEXT: addq $-123, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xc0,0x85]
+; BOTH-NEXT: retq # encoding: [0xc3]
%a = load i64, ptr %ptr
%s = sub i64 %a, 123
%k = icmp ugt i64 %x, %y
@@ -312,12 +516,28 @@ define i64 @sbb64mi8(ptr %ptr, i64 %x, i64 %y) nounwind {
}
define i8 @sbb8mi(ptr %ptr, i8 %x, i8 %y) nounwind {
-; CHECK-LABEL: sbb8mi:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpb %sil, %dl # encoding: [0x40,0x38,0xf2]
-; CHECK-NEXT: sbbb $0, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x1f,0x00]
-; CHECK-NEXT: addb $-123, %al # EVEX TO LEGACY Compression encoding: [0x04,0x85]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb8mi:
+; NDD: # %bb.0:
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: cmpb %sil, %dl # encoding: [0x40,0x38,0xf2]
+; NDD-NEXT: sbbb $0, %al # EVEX TO LEGACY Compression encoding: [0x1c,0x00]
+; NDD-NEXT: addb $-123, %al # EVEX TO LEGACY Compression encoding: [0x04,0x85]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sbb8mi:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; IMMONLY-NEXT: cmpb %sil, %dl # encoding: [0x40,0x38,0xf2]
+; IMMONLY-NEXT: sbbb $0, %al # EVEX TO LEGACY Compression encoding: [0x1c,0x00]
+; IMMONLY-NEXT: addb $-123, %al # EVEX TO LEGACY Compression encoding: [0x04,0x85]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sbb8mi:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpb %sil, %dl # encoding: [0x40,0x38,0xf2]
+; MEM-NEXT: sbbb $0, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x1f,0x00]
+; MEM-NEXT: addb $-123, %al # EVEX TO LEGACY Compression encoding: [0x04,0x85]
+; MEM-NEXT: retq # encoding: [0xc3]
%a = load i8, ptr %ptr
%s = sub i8 %a, 123
%k = icmp ugt i8 %x, %y
@@ -327,13 +547,31 @@ define i8 @sbb8mi(ptr %ptr, i8 %x, i8 %y) nounwind {
}
define i16 @sbb16mi(ptr %ptr, i16 %x, i16 %y) nounwind {
-; CHECK-LABEL: sbb16mi:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2]
-; CHECK-NEXT: sbbw $0, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0x1f,0x00]
-; CHECK-NEXT: addw $-1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x05,0x2e,0xfb]
-; CHECK-NEXT: # imm = 0xFB2E
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb16mi:
+; NDD: # %bb.0:
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2]
+; NDD-NEXT: sbbw $0, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xd8,0x00]
+; NDD-NEXT: addw $-1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x05,0x2e,0xfb]
+; NDD-NEXT: # imm = 0xFB2E
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sbb16mi:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2]
+; IMMONLY-NEXT: sbbw $0, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xd8,0x00]
+; IMMONLY-NEXT: addw $-1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x05,0x2e,0xfb]
+; IMMONLY-NEXT: # imm = 0xFB2E
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sbb16mi:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2]
+; MEM-NEXT: sbbw $0, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0x1f,0x00]
+; MEM-NEXT: addw $-1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x05,0x2e,0xfb]
+; MEM-NEXT: # imm = 0xFB2E
+; MEM-NEXT: retq # encoding: [0xc3]
%a = load i16, ptr %ptr
%s = sub i16 %a, 1234
%k = icmp ugt i16 %x, %y
@@ -343,13 +581,31 @@ define i16 @sbb16mi(ptr %ptr, i16 %x, i16 %y) nounwind {
}
define i32 @sbb32mi(ptr %ptr, i32 %x, i32 %y) nounwind {
-; CHECK-LABEL: sbb32mi:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2]
-; CHECK-NEXT: sbbl $0, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x1f,0x00]
-; CHECK-NEXT: addl $-123456, %eax # EVEX TO LEGACY Compression encoding: [0x05,0xc0,0x1d,0xfe,0xff]
-; CHECK-NEXT: # imm = 0xFFFE1DC0
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb32mi:
+; NDD: # %bb.0:
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2]
+; NDD-NEXT: sbbl $0, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xd8,0x00]
+; NDD-NEXT: addl $-123456, %eax # EVEX TO LEGACY Compression encoding: [0x05,0xc0,0x1d,0xfe,0xff]
+; NDD-NEXT: # imm = 0xFFFE1DC0
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sbb32mi:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2]
+; IMMONLY-NEXT: sbbl $0, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xd8,0x00]
+; IMMONLY-NEXT: addl $-123456, %eax # EVEX TO LEGACY Compression encoding: [0x05,0xc0,0x1d,0xfe,0xff]
+; IMMONLY-NEXT: # imm = 0xFFFE1DC0
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sbb32mi:
+; MEM: # %bb.0:
+; MEM-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2]
+; MEM-NEXT: sbbl $0, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x1f,0x00]
+; MEM-NEXT: addl $-123456, %eax # EVEX TO LEGACY Compression encoding: [0x05,0xc0,0x1d,0xfe,0xff]
+; MEM-NEXT: # imm = 0xFFFE1DC0
+; MEM-NEXT: retq # encoding: [0xc3]
%a = load i32, ptr %ptr
%s = sub i32 %a, 123456
%k = icmp ugt i32 %x, %y
@@ -359,13 +615,39 @@ define i32 @sbb32mi(ptr %ptr, i32 %x, i32 %y) nounwind {
}
define i64 @sbb64mi(ptr %ptr, i64 %x, i64 %y) nounwind {
-; CHECK-LABEL: sbb64mi:
-; CHECK: # %bb.0:
-; CHECK-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
-; CHECK-NEXT: sbbq $0, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x1f,0x00]
-; CHECK-NEXT: addq $-123456, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x05,0xc0,0x1d,0xfe,0xff]
-; CHECK-NEXT: # imm = 0xFFFE1DC0
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sbb64mi:
+; NDD: # %bb.0:
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; NDD-NEXT: sbbq $0, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xd8,0x00]
+; NDD-NEXT: addq $-123456, %rax # encoding: [0x48,0x05,0xc0,0x1d,0xfe,0xff]
+; NDD-NEXT: # imm = 0xFFFE1DC0
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sbb64mi:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; IMMONLY-NEXT: sbbq $0, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xd8,0x00]
+; IMMONLY-NEXT: addq $-123456, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x05,0xc0,0x1d,0xfe,0xff]
+; IMMONLY-NEXT: # imm = 0xFFFE1DC0
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: sbb64mi:
+; MEMONLY: # %bb.0:
+; MEMONLY-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; MEMONLY-NEXT: sbbq $0, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x1f,0x00]
+; MEMONLY-NEXT: addq $-123456, %rax # encoding: [0x48,0x05,0xc0,0x1d,0xfe,0xff]
+; MEMONLY-NEXT: # imm = 0xFFFE1DC0
+; MEMONLY-NEXT: retq # encoding: [0xc3]
+;
+; BOTH-LABEL: sbb64mi:
+; BOTH: # %bb.0:
+; BOTH-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2]
+; BOTH-NEXT: sbbq $0, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x1f,0x00]
+; BOTH-NEXT: addq $-123456, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x05,0xc0,0x1d,0xfe,0xff]
+; BOTH-NEXT: # imm = 0xFFFE1DC0
+; BOTH-NEXT: retq # encoding: [0xc3]
%a = load i64, ptr %ptr
%s = sub i64 %a, 123456
%k = icmp ugt i64 %x, %y
diff --git a/llvm/test/CodeGen/X86/apx/shl.ll b/llvm/test/CodeGen/X86/apx/shl.ll
index 9c6229a483c73..8d4ff1fae4643 100644
--- a/llvm/test/CodeGen/X86/apx/shl.ll
+++ b/llvm/test/CodeGen/X86/apx/shl.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -x86-enable-apx-for-relocation=true -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
@@ -136,18 +139,27 @@ entry:
}
define i8 @shl8mcl(ptr %ptr, i8 %cl) {
-; CHECK-LABEL: shl8mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shlb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x27]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shl8mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: shlb %cl, %al # EVEX TO LEGACY Compression encoding: [0xd2,0xe0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shl8mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: shlb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x27]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shl8mcl:
; NF: # %bb.0: # %entry
; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
; NF-NEXT: # kill: def $cl killed $cl killed $ecx
-; NF-NEXT: {nf} shlb %cl, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0x27]
+; NF-NEXT: shlb %cl, %al # EVEX TO LEGACY Compression encoding: [0xd2,0xe0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
@@ -156,18 +168,27 @@ entry:
}
define i8 @shl8mcl_mask(ptr %ptr, i8 %cl) {
-; CHECK-LABEL: shl8mcl_mask:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shlb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x27]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shl8mcl_mask:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: shlb %cl, %al # EVEX TO LEGACY Compression encoding: [0xd2,0xe0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shl8mcl_mask:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: shlb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x27]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shl8mcl_mask:
; NF: # %bb.0: # %entry
; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
; NF-NEXT: # kill: def $cl killed $cl killed $ecx
-; NF-NEXT: {nf} shlb %cl, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0x27]
+; NF-NEXT: shlb %cl, %al # EVEX TO LEGACY Compression encoding: [0xd2,0xe0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
@@ -177,18 +198,27 @@ entry:
}
define i16 @shl16mcl(ptr %ptr, i16 %cl) {
-; CHECK-LABEL: shl16mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shlw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x27]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shl16mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: shlw %cl, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd3,0xe0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shl16mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: shlw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x27]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shl16mcl:
; NF: # %bb.0: # %entry
; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
; NF-NEXT: # kill: def $cl killed $cl killed $ecx
-; NF-NEXT: {nf} shlw %cl, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0xd3,0x27]
+; NF-NEXT: shlw %cl, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd3,0xe0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
@@ -197,18 +227,27 @@ entry:
}
define i16 @shl16mcl_mask(ptr %ptr, i16 %cl) {
-; CHECK-LABEL: shl16mcl_mask:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shlw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x27]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shl16mcl_mask:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: shlw %cl, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd3,0xe0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shl16mcl_mask:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: shlw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x27]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shl16mcl_mask:
; NF: # %bb.0: # %entry
; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
; NF-NEXT: # kill: def $cl killed $cl killed $ecx
-; NF-NEXT: {nf} shlw %cl, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0xd3,0x27]
+; NF-NEXT: shlw %cl, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd3,0xe0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
@@ -218,18 +257,27 @@ entry:
}
define i32 @shl32mcl(ptr %ptr, i32 %cl) {
-; CHECK-LABEL: shl32mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shll %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x27]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shl32mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shl32mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: shll %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x27]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shl32mcl:
; NF: # %bb.0: # %entry
; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
; NF-NEXT: # kill: def $cl killed $cl killed $ecx
-; NF-NEXT: {nf} shll %cl, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0x27]
+; NF-NEXT: shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
@@ -238,18 +286,27 @@ entry:
}
define i32 @shl32mcl_mask(ptr %ptr, i32 %cl) {
-; CHECK-LABEL: shl32mcl_mask:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shll %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x27]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shl32mcl_mask:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shl32mcl_mask:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: shll %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x27]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shl32mcl_mask:
; NF: # %bb.0: # %entry
; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
; NF-NEXT: # kill: def $cl killed $cl killed $ecx
-; NF-NEXT: {nf} shll %cl, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0x27]
+; NF-NEXT: shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
@@ -259,18 +316,27 @@ entry:
}
define i64 @shl64mcl(ptr %ptr, i64 %cl) {
-; CHECK-LABEL: shl64mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT: shlq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x27]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shl64mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $rcx
+; NDD-NEXT: shlq %cl, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd3,0xe0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shl64mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $rcx
+; MEM-NEXT: shlq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x27]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shl64mcl:
; NF: # %bb.0: # %entry
; NF-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
; NF-NEXT: # kill: def $cl killed $cl killed $rcx
-; NF-NEXT: {nf} shlq %cl, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0x27]
+; NF-NEXT: shlq %cl, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd3,0xe0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
@@ -279,18 +345,27 @@ entry:
}
define i64 @shl64mcl_mask(ptr %ptr, i64 %cl) {
-; CHECK-LABEL: shl64mcl_mask:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT: shlq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x27]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shl64mcl_mask:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $rcx
+; NDD-NEXT: shlq %cl, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd3,0xe0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shl64mcl_mask:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $rcx
+; MEM-NEXT: shlq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x27]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shl64mcl_mask:
; NF: # %bb.0: # %entry
; NF-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
; NF-NEXT: # kill: def $cl killed $cl killed $rcx
-; NF-NEXT: {nf} shlq %cl, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0x27]
+; NF-NEXT: shlq %cl, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd3,0xe0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
@@ -300,14 +375,21 @@ entry:
}
define i8 @shl8mi(ptr %ptr) {
-; CHECK-LABEL: shl8mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shlb $4, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x27,0x04]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shl8mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: shlb $4, %al # EVEX TO LEGACY Compression encoding: [0xc0,0xe0,0x04]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shl8mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shlb $4, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x27,0x04]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shl8mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} shlb $4, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc0,0x27,0x04]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT: shlb $4, %al # EVEX TO LEGACY Compression encoding: [0xc0,0xe0,0x04]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
@@ -316,14 +398,21 @@ entry:
}
define i16 @shl16mi(ptr %ptr) {
-; CHECK-LABEL: shl16mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shlw $4, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xc1,0x27,0x04]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shl16mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: shlw $4, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xc1,0xe0,0x04]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shl16mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shlw $4, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xc1,0x27,0x04]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shl16mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} shlw $4, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0xc1,0x27,0x04]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: shlw $4, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xc1,0xe0,0x04]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
@@ -332,14 +421,21 @@ entry:
}
define i32 @shl32mi(ptr %ptr) {
-; CHECK-LABEL: shl32mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shll $4, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x27,0x04]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shl32mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: shll $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe0,0x04]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shl32mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shll $4, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x27,0x04]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shl32mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} shll $4, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0x27,0x04]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: shll $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe0,0x04]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
@@ -348,14 +444,21 @@ entry:
}
define i64 @shl64mi(ptr %ptr) {
-; CHECK-LABEL: shl64mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shlq $4, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x27,0x04]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shl64mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: shlq $4, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xe0,0x04]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shl64mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shlq $4, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x27,0x04]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shl64mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} shlq $4, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xc1,0x27,0x04]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: shlq $4, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xe0,0x04]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/apx/shld.ll b/llvm/test/CodeGen/X86/apx/shld.ll
index 5b99719ba537c..de91ac0acca38 100644
--- a/llvm/test/CodeGen/X86/apx/shld.ll
+++ b/llvm/test/CodeGen/X86/apx/shld.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,MEM
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,MEM
define i16 @shld16rrcl(i16 noundef %a, i16 noundef %b, i8 %cl) {
; CHECK-LABEL: shld16rrcl:
@@ -111,11 +114,18 @@ entry:
}
define i16 @shld16mrcl(ptr %ptr, i16 noundef %b, i8 %cl) {
-; CHECK-LABEL: shld16mrcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: andb $15, %dl, %cl
-; CHECK-NEXT: shldw %cl, %si, (%rdi), %ax
-; CHECK-NEXT: retq
+; NDD-LABEL: shld16mrcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax
+; NDD-NEXT: andb $15, %dl, %cl
+; NDD-NEXT: shldw %cl, %si, %ax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: shld16mrcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: andb $15, %dl, %cl
+; MEM-NEXT: shldw %cl, %si, (%rdi), %ax
+; MEM-NEXT: retq
entry:
%a = load i16, ptr %ptr
%clin = sext i8 %cl to i16
@@ -124,12 +134,20 @@ entry:
}
define i32 @shld32mrcl(ptr %ptr, i32 noundef %b, i8 %cl) {
-; CHECK-LABEL: shld32mrcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shldl %cl, %esi, (%rdi), %eax
-; CHECK-NEXT: retq
+; NDD-LABEL: shld32mrcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %edx, %ecx
+; NDD-NEXT: movl (%rdi), %eax
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: shldl %cl, %esi, %eax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: shld32mrcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %edx, %ecx
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: shldl %cl, %esi, (%rdi), %eax
+; MEM-NEXT: retq
entry:
%a = load i32, ptr %ptr
%clin = sext i8 %cl to i32
@@ -138,12 +156,20 @@ entry:
}
define i64 @shld64mrcl(ptr %ptr, i64 noundef %b, i8 %cl) {
-; CHECK-LABEL: shld64mrcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %edx, %ecx
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shldq %cl, %rsi, (%rdi), %rax
-; CHECK-NEXT: retq
+; NDD-LABEL: shld64mrcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %edx, %ecx
+; NDD-NEXT: movq (%rdi), %rax
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: shldq %cl, %rsi, %rax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: shld64mrcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %edx, %ecx
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: shldq %cl, %rsi, (%rdi), %rax
+; MEM-NEXT: retq
entry:
%a = load i64, ptr %ptr
%clin = sext i8 %cl to i64
@@ -152,10 +178,16 @@ entry:
}
define i16 @shld16mri8(ptr %ptr, i16 noundef %b) {
-; CHECK-LABEL: shld16mri8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shldw $12, %si, (%rdi), %ax
-; CHECK-NEXT: retq
+; NDD-LABEL: shld16mri8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax
+; NDD-NEXT: shldw $12, %si, %ax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: shld16mri8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shldw $12, %si, (%rdi), %ax
+; MEM-NEXT: retq
entry:
%a = load i16, ptr %ptr
%shld = call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 12)
@@ -163,10 +195,16 @@ entry:
}
define i32 @shld32mri8(ptr %ptr, i32 noundef %b) {
-; CHECK-LABEL: shld32mri8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shldl $12, %esi, (%rdi), %eax
-; CHECK-NEXT: retq
+; NDD-LABEL: shld32mri8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax
+; NDD-NEXT: shldl $12, %esi, %eax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: shld32mri8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shldl $12, %esi, (%rdi), %eax
+; MEM-NEXT: retq
entry:
%a = load i32, ptr %ptr
%shld = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 12)
@@ -174,10 +212,16 @@ entry:
}
define i64 @shld64mri8(ptr %ptr, i64 noundef %b) {
-; CHECK-LABEL: shld64mri8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shldq $12, %rsi, (%rdi), %rax
-; CHECK-NEXT: retq
+; NDD-LABEL: shld64mri8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax
+; NDD-NEXT: shldq $12, %rsi, %rax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: shld64mri8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shldq $12, %rsi, (%rdi), %rax
+; MEM-NEXT: retq
entry:
%a = load i64, ptr %ptr
%shld = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 12)
diff --git a/llvm/test/CodeGen/X86/apx/shr.ll b/llvm/test/CodeGen/X86/apx/shr.ll
index b82000bd950dc..d0146b0641883 100644
--- a/llvm/test/CodeGen/X86/apx/shr.ll
+++ b/llvm/test/CodeGen/X86/apx/shr.ll
@@ -1,17 +1,27 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -x86-enable-apx-for-relocation=true -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
define i8 @shr8m1(ptr %ptr) {
-; CHECK-LABEL: shr8m1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shrb (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0x2f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shr8m1:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: shrb %al # EVEX TO LEGACY Compression encoding: [0xd0,0xe8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shr8m1:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shrb (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0x2f]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shr8m1:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} shrb (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd0,0x2f]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT: shrb %al # EVEX TO LEGACY Compression encoding: [0xd0,0xe8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
@@ -20,14 +30,21 @@ entry:
}
define i16 @shr16m1(ptr %ptr) {
-; CHECK-LABEL: shr16m1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shrw (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd1,0x2f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shr16m1:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: shrw %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd1,0xe8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shr16m1:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shrw (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd1,0x2f]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shr16m1:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} shrw (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0xd1,0x2f]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: shrw %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd1,0xe8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
@@ -36,14 +53,21 @@ entry:
}
define i32 @shr32m1(ptr %ptr) {
-; CHECK-LABEL: shr32m1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shrl (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0x2f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shr32m1:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shr32m1:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shrl (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0x2f]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shr32m1:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} shrl (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd1,0x2f]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
@@ -52,14 +76,21 @@ entry:
}
define i64 @shr64m1(ptr %ptr) {
-; CHECK-LABEL: shr64m1:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shrq (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0x2f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shr64m1:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: shrq %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd1,0xe8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shr64m1:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shrq (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0x2f]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shr64m1:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} shrq (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd1,0x2f]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: shrq %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd1,0xe8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
@@ -68,18 +99,27 @@ entry:
}
define i8 @shr8mcl(ptr %ptr, i8 %cl) {
-; CHECK-LABEL: shr8mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shrb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x2f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shr8mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: shrb %cl, %al # EVEX TO LEGACY Compression encoding: [0xd2,0xe8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shr8mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: shrb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x2f]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shr8mcl:
; NF: # %bb.0: # %entry
; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
; NF-NEXT: # kill: def $cl killed $cl killed $ecx
-; NF-NEXT: {nf} shrb %cl, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0x2f]
+; NF-NEXT: shrb %cl, %al # EVEX TO LEGACY Compression encoding: [0xd2,0xe8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
@@ -88,18 +128,27 @@ entry:
}
define i8 @shr8mcl_mask(ptr %ptr, i8 %cl) {
-; CHECK-LABEL: shr8mcl_mask:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shrb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x2f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shr8mcl_mask:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: shrb %cl, %al # EVEX TO LEGACY Compression encoding: [0xd2,0xe8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shr8mcl_mask:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: shrb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x2f]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shr8mcl_mask:
; NF: # %bb.0: # %entry
; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
; NF-NEXT: # kill: def $cl killed $cl killed $ecx
-; NF-NEXT: {nf} shrb %cl, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd2,0x2f]
+; NF-NEXT: shrb %cl, %al # EVEX TO LEGACY Compression encoding: [0xd2,0xe8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
@@ -109,18 +158,27 @@ entry:
}
define i16 @shr16mcl(ptr %ptr, i16 %cl) {
-; CHECK-LABEL: shr16mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shrw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x2f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shr16mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: shrw %cl, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd3,0xe8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shr16mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: shrw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x2f]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shr16mcl:
; NF: # %bb.0: # %entry
; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
; NF-NEXT: # kill: def $cl killed $cl killed $ecx
-; NF-NEXT: {nf} shrw %cl, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0xd3,0x2f]
+; NF-NEXT: shrw %cl, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd3,0xe8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
@@ -129,18 +187,27 @@ entry:
}
define i16 @shr16mcl_mask(ptr %ptr, i16 %cl) {
-; CHECK-LABEL: shr16mcl_mask:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shrw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x2f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shr16mcl_mask:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: shrw %cl, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd3,0xe8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shr16mcl_mask:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: shrw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x2f]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shr16mcl_mask:
; NF: # %bb.0: # %entry
; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
; NF-NEXT: # kill: def $cl killed $cl killed $ecx
-; NF-NEXT: {nf} shrw %cl, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0xd3,0x2f]
+; NF-NEXT: shrw %cl, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xd3,0xe8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
@@ -150,18 +217,27 @@ entry:
}
define i32 @shr32mcl(ptr %ptr, i32 %cl) {
-; CHECK-LABEL: shr32mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shrl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x2f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shr32mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shr32mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: shrl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x2f]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shr32mcl:
; NF: # %bb.0: # %entry
; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
; NF-NEXT: # kill: def $cl killed $cl killed $ecx
-; NF-NEXT: {nf} shrl %cl, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0x2f]
+; NF-NEXT: shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
@@ -170,18 +246,27 @@ entry:
}
define i32 @shr32mcl_mask(ptr %ptr, i32 %cl) {
-; CHECK-LABEL: shr32mcl_mask:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT: shrl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x2f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shr32mcl_mask:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $ecx
+; NDD-NEXT: shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shr32mcl_mask:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $ecx
+; MEM-NEXT: shrl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x2f]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shr32mcl_mask:
; NF: # %bb.0: # %entry
; NF-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
; NF-NEXT: # kill: def $cl killed $cl killed $ecx
-; NF-NEXT: {nf} shrl %cl, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xd3,0x2f]
+; NF-NEXT: shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
@@ -191,18 +276,27 @@ entry:
}
define i64 @shr64mcl(ptr %ptr, i64 %cl) {
-; CHECK-LABEL: shr64mcl:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT: shrq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x2f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shr64mcl:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $rcx
+; NDD-NEXT: shrq %cl, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd3,0xe8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shr64mcl:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $rcx
+; MEM-NEXT: shrq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x2f]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shr64mcl:
; NF: # %bb.0: # %entry
; NF-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
; NF-NEXT: # kill: def $cl killed $cl killed $rcx
-; NF-NEXT: {nf} shrq %cl, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0x2f]
+; NF-NEXT: shrq %cl, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd3,0xe8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
@@ -211,18 +305,27 @@ entry:
}
define i64 @shr64mcl_mask(ptr %ptr, i64 %cl) {
-; CHECK-LABEL: shr64mcl_mask:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
-; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx
-; CHECK-NEXT: shrq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x2f]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shr64mcl_mask:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: # kill: def $cl killed $cl killed $rcx
+; NDD-NEXT: shrq %cl, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd3,0xe8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shr64mcl_mask:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; MEM-NEXT: # kill: def $cl killed $cl killed $rcx
+; MEM-NEXT: shrq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x2f]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shr64mcl_mask:
; NF: # %bb.0: # %entry
; NF-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
; NF-NEXT: # kill: def $cl killed $cl killed $rcx
-; NF-NEXT: {nf} shrq %cl, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xd3,0x2f]
+; NF-NEXT: shrq %cl, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xd3,0xe8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
@@ -232,14 +335,21 @@ entry:
}
define i8 @shr8mi(ptr %ptr) {
-; CHECK-LABEL: shr8mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shrb $4, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x2f,0x04]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shr8mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: shrb $4, %al # EVEX TO LEGACY Compression encoding: [0xc0,0xe8,0x04]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shr8mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shrb $4, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x2f,0x04]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shr8mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} shrb $4, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc0,0x2f,0x04]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT: shrb $4, %al # EVEX TO LEGACY Compression encoding: [0xc0,0xe8,0x04]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i8, ptr %ptr
@@ -248,14 +358,21 @@ entry:
}
define i16 @shr16mi(ptr %ptr) {
-; CHECK-LABEL: shr16mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shrw $4, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xc1,0x2f,0x04]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shr16mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: shrw $4, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xc1,0xe8,0x04]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shr16mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shrw $4, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xc1,0x2f,0x04]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shr16mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} shrw $4, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0xc1,0x2f,0x04]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: shrw $4, %ax # EVEX TO LEGACY Compression encoding: [0x66,0xc1,0xe8,0x04]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i16, ptr %ptr
@@ -264,14 +381,21 @@ entry:
}
define i32 @shr32mi(ptr %ptr) {
-; CHECK-LABEL: shr32mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shrl $4, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x2f,0x04]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shr32mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shr32mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shrl $4, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x2f,0x04]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shr32mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} shrl $4, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0xc1,0x2f,0x04]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i32, ptr %ptr
@@ -280,14 +404,21 @@ entry:
}
define i64 @shr64mi(ptr %ptr) {
-; CHECK-LABEL: shr64mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shrq $4, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x2f,0x04]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: shr64mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: shrq $4, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xe8,0x04]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: shr64mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shrq $4, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x2f,0x04]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: shr64mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} shrq $4, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0xc1,0x2f,0x04]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: shrq $4, %rax # EVEX TO LEGACY Compression encoding: [0x48,0xc1,0xe8,0x04]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%a = load i64, ptr %ptr
diff --git a/llvm/test/CodeGen/X86/apx/shrd.ll b/llvm/test/CodeGen/X86/apx/shrd.ll
index 3eaa06b123bd0..56def4116c47b 100644
--- a/llvm/test/CodeGen/X86/apx/shrd.ll
+++ b/llvm/test/CodeGen/X86/apx/shrd.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,MEM
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,MEM
define i16 @shrd16rrcl(i16 noundef %a, i16 noundef %b, i8 %cl) {
; CHECK-LABEL: shrd16rrcl:
@@ -155,10 +158,16 @@ entry:
}
define i16 @shrd16mri8(ptr %ptr, i16 noundef %b) {
-; CHECK-LABEL: shrd16mri8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shldw $4, %si, (%rdi), %ax
-; CHECK-NEXT: retq
+; NDD-LABEL: shrd16mri8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax
+; NDD-NEXT: shldw $4, %si, %ax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: shrd16mri8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shldw $4, %si, (%rdi), %ax
+; MEM-NEXT: retq
entry:
%a = load i16, ptr %ptr
%shrd = call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 12)
@@ -166,10 +175,16 @@ entry:
}
define i32 @shrd32mri8(ptr %ptr, i32 noundef %b) {
-; CHECK-LABEL: shrd32mri8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shldl $20, %esi, (%rdi), %eax
-; CHECK-NEXT: retq
+; NDD-LABEL: shrd32mri8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax
+; NDD-NEXT: shldl $20, %esi, %eax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: shrd32mri8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shldl $20, %esi, (%rdi), %eax
+; MEM-NEXT: retq
entry:
%a = load i32, ptr %ptr
%shrd = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 12)
@@ -177,10 +192,16 @@ entry:
}
define i64 @shrd64mri8(ptr %ptr, i64 noundef %b) {
-; CHECK-LABEL: shrd64mri8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: shldq $52, %rsi, (%rdi), %rax
-; CHECK-NEXT: retq
+; NDD-LABEL: shrd64mri8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax
+; NDD-NEXT: shldq $52, %rsi, %rax
+; NDD-NEXT: retq
+;
+; MEM-LABEL: shrd64mri8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: shldq $52, %rsi, (%rdi), %rax
+; MEM-NEXT: retq
entry:
%a = load i64, ptr %ptr
%shrd = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 12)
diff --git a/llvm/test/CodeGen/X86/apx/sub.ll b/llvm/test/CodeGen/X86/apx/sub.ll
index 75ee8cf31dee5..fa9b913132415 100644
--- a/llvm/test/CodeGen/X86/apx/sub.ll
+++ b/llvm/test/CodeGen/X86/apx/sub.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,IMM,IMMONLY
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM,MEMONLY
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,IMM,MEM,BOTH
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -x86-enable-apx-for-relocation=true -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
@@ -64,14 +67,27 @@ entry:
}
define i8 @sub8rm(i8 noundef %a, ptr %ptr) {
-; CHECK-LABEL: sub8rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x2a,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub8rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; NDD-NEXT: subb %al, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xc7]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sub8rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; IMMONLY-NEXT: subb %al, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xc7]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sub8rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: subb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x2a,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub8rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} subb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x2a,0x3e]
+; NF-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; NF-NEXT: {nf} subb %al, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x28,0xc7]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%b = load i8, ptr %ptr
@@ -80,14 +96,27 @@ entry:
}
define i16 @sub16rm(i16 noundef %a, ptr %ptr) {
-; CHECK-LABEL: sub16rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x2b,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub16rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; NDD-NEXT: subw %ax, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xc7]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sub16rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; IMMONLY-NEXT: subw %ax, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xc7]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sub16rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: subw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x2b,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub16rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} subw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x2b,0x3e]
+; NF-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; NF-NEXT: {nf} subw %ax, %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x29,0xc7]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%b = load i16, ptr %ptr
@@ -96,14 +125,27 @@ entry:
}
define i32 @sub32rm(i32 noundef %a, ptr %ptr) {
-; CHECK-LABEL: sub32rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x2b,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub32rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NDD-NEXT: subl %eax, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xc7]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sub32rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; IMMONLY-NEXT: subl %eax, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xc7]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sub32rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: subl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x2b,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub32rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} subl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x2b,0x3e]
+; NF-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NF-NEXT: {nf} subl %eax, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x29,0xc7]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%b = load i32, ptr %ptr
@@ -112,14 +154,27 @@ entry:
}
define i64 @sub64rm(i64 noundef %a, ptr %ptr) {
-; CHECK-LABEL: sub64rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x2b,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub64rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; NDD-NEXT: subq %rax, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xc7]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sub64rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; IMMONLY-NEXT: subq %rax, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xc7]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sub64rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: subq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x2b,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub64rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} subq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x2b,0x3e]
+; NF-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; NF-NEXT: {nf} subq %rax, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x29,0xc7]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%b = load i64, ptr %ptr
@@ -158,14 +213,24 @@ entry:
}
define i64 @sub64ri8(i64 noundef %a) {
-; CHECK-LABEL: sub64ri8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subq $-128, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xef,0x80]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub64ri8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: leaq 128(%rdi), %rax # encoding: [0x48,0x8d,0x87,0x80,0x00,0x00,0x00]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMM-LABEL: sub64ri8:
+; IMM: # %bb.0: # %entry
+; IMM-NEXT: subq $-128, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0xef,0x80]
+; IMM-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: sub64ri8:
+; MEMONLY: # %bb.0: # %entry
+; MEMONLY-NEXT: leaq 128(%rdi), %rax # encoding: [0x48,0x8d,0x87,0x80,0x00,0x00,0x00]
+; MEMONLY-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub64ri8:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} subq $-128, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0xef,0x80]
+; NF-NEXT: leaq 128(%rdi), %rax # encoding: [0x48,0x8d,0x87,0x80,0x00,0x00,0x00]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%sub = sub i64 %a, -128
@@ -220,16 +285,31 @@ entry:
}
define i64 @sub64ri(i64 noundef %a) {
-; CHECK-LABEL: sub64ri:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subq $-2147483648, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xef,0x00,0x00,0x00,0x80]
-; CHECK-NEXT: # imm = 0x80000000
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub64ri:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl $2147483648, %eax # encoding: [0xb8,0x00,0x00,0x00,0x80]
+; NDD-NEXT: # imm = 0x80000000
+; NDD-NEXT: addq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMM-LABEL: sub64ri:
+; IMM: # %bb.0: # %entry
+; IMM-NEXT: subq $-2147483648, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xef,0x00,0x00,0x00,0x80]
+; IMM-NEXT: # imm = 0x80000000
+; IMM-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: sub64ri:
+; MEMONLY: # %bb.0: # %entry
+; MEMONLY-NEXT: movl $2147483648, %eax # encoding: [0xb8,0x00,0x00,0x00,0x80]
+; MEMONLY-NEXT: # imm = 0x80000000
+; MEMONLY-NEXT: addq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xf8]
+; MEMONLY-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub64ri:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} subq $-2147483648, %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0xef,0x00,0x00,0x00,0x80]
+; NF-NEXT: movl $2147483648, %eax # encoding: [0xb8,0x00,0x00,0x00,0x80]
; NF-NEXT: # imm = 0x80000000
+; NF-NEXT: addq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%sub = sub i64 %a, -2147483648
@@ -237,14 +317,27 @@ entry:
}
define i8 @sub8mr(ptr %a, i8 noundef %b) {
-; CHECK-LABEL: sub8mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub8mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: subb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x28,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sub8mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; IMMONLY-NEXT: subb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x28,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sub8mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: subb %sil, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub8mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} subb %sil, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x28,0x37]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT: subb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x28,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i8, ptr %a
@@ -253,14 +346,27 @@ entry:
}
define i16 @sub16mr(ptr %a, i16 noundef %b) {
-; CHECK-LABEL: sub16mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subw %si, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub16mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: subw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x29,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sub16mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: subw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x29,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sub16mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: subw %si, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub16mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} subw %si, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x29,0x37]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: subw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x29,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i16, ptr %a
@@ -269,14 +375,27 @@ entry:
}
define i32 @sub32mr(ptr %a, i32 noundef %b) {
-; CHECK-LABEL: sub32mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub32mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: subl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x29,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sub32mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: subl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x29,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sub32mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: subl %esi, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub32mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} subl %esi, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x29,0x37]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: subl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x29,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i32, ptr %a
@@ -285,14 +404,27 @@ entry:
}
define i64 @sub64mr(ptr %a, i64 noundef %b) {
-; CHECK-LABEL: sub64mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub64mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: subq %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x29,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sub64mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: subq %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x29,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sub64mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: subq %rsi, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub64mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} subq %rsi, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x29,0x37]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: subq %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x29,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i64, ptr %a
@@ -301,14 +433,27 @@ entry:
}
define i16 @sub16mi8(ptr %a) {
-; CHECK-LABEL: sub16mi8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subw $-128, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0x2f,0x80]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub16mi8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: subw $-128, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xe8,0x80]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sub16mi8:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: subw $-128, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xe8,0x80]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sub16mi8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: subw $-128, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0x2f,0x80]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub16mi8:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} subw $-128, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x83,0x2f,0x80]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: subw $-128, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xe8,0x80]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i16, ptr %a
@@ -317,14 +462,27 @@ entry:
}
define i32 @sub32mi8(ptr %a) {
-; CHECK-LABEL: sub32mi8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subl $-128, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x2f,0x80]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub32mi8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: subl $-128, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe8,0x80]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sub32mi8:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: subl $-128, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe8,0x80]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sub32mi8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: subl $-128, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x2f,0x80]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub32mi8:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} subl $-128, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x2f,0x80]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: subl $-128, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xe8,0x80]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i32, ptr %a
@@ -333,14 +491,33 @@ entry:
}
define i64 @sub64mi8(ptr %a) {
-; CHECK-LABEL: sub64mi8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subq $-128, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x2f,0x80]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub64mi8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: subq $-128, %rax # encoding: [0x48,0x83,0xe8,0x80]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sub64mi8:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: subq $-128, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xe8,0x80]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: sub64mi8:
+; MEMONLY: # %bb.0: # %entry
+; MEMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; MEMONLY-NEXT: subq $-128, %rax # encoding: [0x48,0x83,0xe8,0x80]
+; MEMONLY-NEXT: retq # encoding: [0xc3]
+;
+; BOTH-LABEL: sub64mi8:
+; BOTH: # %bb.0: # %entry
+; BOTH-NEXT: subq $-128, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x2f,0x80]
+; BOTH-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub64mi8:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} subq $-128, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0x2f,0x80]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: subq $-128, %rax # encoding: [0x48,0x83,0xe8,0x80]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i64, ptr %a
@@ -349,14 +526,27 @@ entry:
}
define i8 @sub8mi(ptr %a) {
-; CHECK-LABEL: sub8mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addb $-123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x07,0x85]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub8mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: addb $-123, %al # EVEX TO LEGACY Compression encoding: [0x04,0x85]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sub8mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; IMMONLY-NEXT: addb $-123, %al # EVEX TO LEGACY Compression encoding: [0x04,0x85]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sub8mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addb $-123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x07,0x85]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub8mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addb $-123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x07,0x85]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT: addb $-123, %al # EVEX TO LEGACY Compression encoding: [0x04,0x85]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i8, ptr %a
@@ -365,15 +555,30 @@ entry:
}
define i16 @sub16mi(ptr %a) {
-; CHECK-LABEL: sub16mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addw $-1234, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x07,0x2e,0xfb]
-; CHECK-NEXT: # imm = 0xFB2E
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub16mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: addw $-1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x05,0x2e,0xfb]
+; NDD-NEXT: # imm = 0xFB2E
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sub16mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: addw $-1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x05,0x2e,0xfb]
+; IMMONLY-NEXT: # imm = 0xFB2E
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sub16mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addw $-1234, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x07,0x2e,0xfb]
+; MEM-NEXT: # imm = 0xFB2E
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub16mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addw $-1234, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x81,0x07,0x2e,0xfb]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: addw $-1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x05,0x2e,0xfb]
; NF-NEXT: # imm = 0xFB2E
; NF-NEXT: retq # encoding: [0xc3]
entry:
@@ -383,15 +588,30 @@ entry:
}
define i32 @sub32mi(ptr %a) {
-; CHECK-LABEL: sub32mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: addl $-123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x07,0xc0,0x1d,0xfe,0xff]
-; CHECK-NEXT: # imm = 0xFFFE1DC0
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub32mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: addl $-123456, %eax # EVEX TO LEGACY Compression encoding: [0x05,0xc0,0x1d,0xfe,0xff]
+; NDD-NEXT: # imm = 0xFFFE1DC0
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sub32mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: addl $-123456, %eax # EVEX TO LEGACY Compression encoding: [0x05,0xc0,0x1d,0xfe,0xff]
+; IMMONLY-NEXT: # imm = 0xFFFE1DC0
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sub32mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: addl $-123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x07,0xc0,0x1d,0xfe,0xff]
+; MEM-NEXT: # imm = 0xFFFE1DC0
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub32mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} addl $-123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x07,0xc0,0x1d,0xfe,0xff]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: addl $-123456, %eax # EVEX TO LEGACY Compression encoding: [0x05,0xc0,0x1d,0xfe,0xff]
; NF-NEXT: # imm = 0xFFFE1DC0
; NF-NEXT: retq # encoding: [0xc3]
entry:
@@ -401,16 +621,33 @@ entry:
}
define i64 @sub64mi(ptr %a) {
-; CHECK-LABEL: sub64mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: subq $-2147483648, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x2f,0x00,0x00,0x00,0x80]
-; CHECK-NEXT: # imm = 0x80000000
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: sub64mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: movl $2147483648, %ecx # encoding: [0xb9,0x00,0x00,0x00,0x80]
+; NDD-NEXT: # imm = 0x80000000
+; NDD-NEXT: addq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: sub64mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: subq $-2147483648, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x2d,0x00,0x00,0x00,0x80]
+; IMMONLY-NEXT: # imm = 0x80000000
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: sub64mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: subq $-2147483648, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x2f,0x00,0x00,0x00,0x80]
+; MEM-NEXT: # imm = 0x80000000
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: sub64mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} subq $-2147483648, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x2f,0x00,0x00,0x00,0x80]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: movl $2147483648, %ecx # encoding: [0xb9,0x00,0x00,0x00,0x80]
; NF-NEXT: # imm = 0x80000000
+; NF-NEXT: addq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i64, ptr %a
@@ -506,21 +743,42 @@ entry:
}
define i8 @subflag8rm(i8 noundef %a, ptr %b) {
-; CHECK-LABEL: subflag8rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: subb (%rsi), %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x2a,0x3e]
-; CHECK-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; CHECK-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
-; CHECK-NEXT: # kill: def $al killed $al killed $eax
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: subflag8rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; NDD-NEXT: xorl %ecx, %ecx # encoding: [0x31,0xc9]
+; NDD-NEXT: subb %al, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xc7]
+; NDD-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; NDD-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NDD-NEXT: # kill: def $al killed $al killed $eax
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: subflag8rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; IMMONLY-NEXT: xorl %ecx, %ecx # encoding: [0x31,0xc9]
+; IMMONLY-NEXT: subb %al, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xc7]
+; IMMONLY-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; IMMONLY-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; IMMONLY-NEXT: # kill: def $al killed $al killed $eax
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: subflag8rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; MEM-NEXT: subb (%rsi), %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x2a,0x3e]
+; MEM-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
+; MEM-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; MEM-NEXT: # kill: def $al killed $al killed $eax
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: subflag8rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NF-NEXT: subb (%rsi), %dil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x2a,0x3e]
-; NF-NEXT: movzbl %cl, %ecx # encoding: [0x0f,0xb6,0xc9]
-; NF-NEXT: cmovael %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc1]
+; NF-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; NF-NEXT: xorl %ecx, %ecx # encoding: [0x31,0xc9]
+; NF-NEXT: subb %al, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xc7]
+; NF-NEXT: movzbl %al, %eax # encoding: [0x0f,0xb6,0xc0]
+; NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
; NF-NEXT: # kill: def $al killed $al killed $eax
; NF-NEXT: retq # encoding: [0xc3]
entry:
@@ -530,19 +788,38 @@ entry:
}
define i16 @subflag16rm(i16 noundef %a, ptr %b) {
-; CHECK-LABEL: subflag16rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: subw (%rsi), %di # EVEX TO LEGACY Compression encoding: [0x66,0x2b,0x3e]
-; CHECK-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7]
-; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: subflag16rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; NDD-NEXT: xorl %ecx, %ecx # encoding: [0x31,0xc9]
+; NDD-NEXT: subw %ax, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xc7]
+; NDD-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NDD-NEXT: # kill: def $ax killed $ax killed $eax
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: subflag16rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; IMMONLY-NEXT: xorl %ecx, %ecx # encoding: [0x31,0xc9]
+; IMMONLY-NEXT: subw %ax, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xc7]
+; IMMONLY-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; IMMONLY-NEXT: # kill: def $ax killed $ax killed $eax
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: subflag16rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; MEM-NEXT: subw (%rsi), %di # EVEX TO LEGACY Compression encoding: [0x66,0x2b,0x3e]
+; MEM-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7]
+; MEM-NEXT: # kill: def $ax killed $ax killed $eax
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: subflag16rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NF-NEXT: subw (%rsi), %di # EVEX TO LEGACY Compression encoding: [0x66,0x2b,0x3e]
-; NF-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7]
+; NF-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; NF-NEXT: xorl %ecx, %ecx # encoding: [0x31,0xc9]
+; NF-NEXT: subw %ax, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xc7]
+; NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
; NF-NEXT: # kill: def $ax killed $ax killed $eax
; NF-NEXT: retq # encoding: [0xc3]
entry:
@@ -552,18 +829,35 @@ entry:
}
define i32 @subflag32rm(i32 noundef %a, ptr %b) {
-; CHECK-LABEL: subflag32rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: subl (%rsi), %edi # EVEX TO LEGACY Compression encoding: [0x2b,0x3e]
-; CHECK-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: subflag32rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NDD-NEXT: xorl %ecx, %ecx # encoding: [0x31,0xc9]
+; NDD-NEXT: subl %eax, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xc7]
+; NDD-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: subflag32rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; IMMONLY-NEXT: xorl %ecx, %ecx # encoding: [0x31,0xc9]
+; IMMONLY-NEXT: subl %eax, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xc7]
+; IMMONLY-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: subflag32rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; MEM-NEXT: subl (%rsi), %edi # EVEX TO LEGACY Compression encoding: [0x2b,0x3e]
+; MEM-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: subflag32rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NF-NEXT: subl (%rsi), %edi # EVEX TO LEGACY Compression encoding: [0x2b,0x3e]
-; NF-NEXT: cmovael %edi, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x43,0xc7]
+; NF-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NF-NEXT: xorl %ecx, %ecx # encoding: [0x31,0xc9]
+; NF-NEXT: subl %eax, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xc7]
+; NF-NEXT: cmovbl %ecx, %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x42,0xc1]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t = load i32, ptr %b
@@ -572,18 +866,35 @@ entry:
}
define i64 @subflag64rm(i64 noundef %a, ptr %b) {
-; CHECK-LABEL: subflag64rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: subq (%rsi), %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x2b,0x3e]
-; CHECK-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: subflag64rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; NDD-NEXT: xorl %ecx, %ecx # encoding: [0x31,0xc9]
+; NDD-NEXT: subq %rax, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xc7]
+; NDD-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: subflag64rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; IMMONLY-NEXT: xorl %ecx, %ecx # encoding: [0x31,0xc9]
+; IMMONLY-NEXT: subq %rax, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xc7]
+; IMMONLY-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: subflag64rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; MEM-NEXT: subq (%rsi), %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x2b,0x3e]
+; MEM-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: subflag64rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NF-NEXT: subq (%rsi), %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x2b,0x3e]
-; NF-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
+; NF-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; NF-NEXT: xorl %ecx, %ecx # encoding: [0x31,0xc9]
+; NF-NEXT: subq %rax, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xc7]
+; NF-NEXT: cmovbq %rcx, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x42,0xc1]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t = load i64, ptr %b
@@ -632,17 +943,31 @@ entry:
}
define i64 @subflag64ri8(i64 noundef %a) {
-; CHECK-LABEL: subflag64ri8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: subq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xef,0x7b]
-; CHECK-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: subflag64ri8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NDD-NEXT: subq $123, %rdi # encoding: [0x48,0x83,0xef,0x7b]
+; NDD-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMM-LABEL: subflag64ri8:
+; IMM: # %bb.0: # %entry
+; IMM-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; IMM-NEXT: subq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xef,0x7b]
+; IMM-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
+; IMM-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: subflag64ri8:
+; MEMONLY: # %bb.0: # %entry
+; MEMONLY-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; MEMONLY-NEXT: subq $123, %rdi # encoding: [0x48,0x83,0xef,0x7b]
+; MEMONLY-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
+; MEMONLY-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: subflag64ri8:
; NF: # %bb.0: # %entry
; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NF-NEXT: subq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xef,0x7b]
+; NF-NEXT: subq $123, %rdi # encoding: [0x48,0x83,0xef,0x7b]
; NF-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
; NF-NEXT: retq # encoding: [0xc3]
entry:
@@ -718,18 +1043,34 @@ entry:
}
define i64 @subflag64ri(i64 noundef %a) {
-; CHECK-LABEL: subflag64ri:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; CHECK-NEXT: subq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xef,0x40,0xe2,0x01,0x00]
-; CHECK-NEXT: # imm = 0x1E240
-; CHECK-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: subflag64ri:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; NDD-NEXT: subq $123456, %rdi # encoding: [0x48,0x81,0xef,0x40,0xe2,0x01,0x00]
+; NDD-NEXT: # imm = 0x1E240
+; NDD-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMM-LABEL: subflag64ri:
+; IMM: # %bb.0: # %entry
+; IMM-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; IMM-NEXT: subq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xef,0x40,0xe2,0x01,0x00]
+; IMM-NEXT: # imm = 0x1E240
+; IMM-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
+; IMM-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: subflag64ri:
+; MEMONLY: # %bb.0: # %entry
+; MEMONLY-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
+; MEMONLY-NEXT: subq $123456, %rdi # encoding: [0x48,0x81,0xef,0x40,0xe2,0x01,0x00]
+; MEMONLY-NEXT: # imm = 0x1E240
+; MEMONLY-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
+; MEMONLY-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: subflag64ri:
; NF: # %bb.0: # %entry
; NF-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0]
-; NF-NEXT: subq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xef,0x40,0xe2,0x01,0x00]
+; NF-NEXT: subq $123456, %rdi # encoding: [0x48,0x81,0xef,0x40,0xe2,0x01,0x00]
; NF-NEXT: # imm = 0x1E240
; NF-NEXT: cmovaeq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x43,0xc7]
; NF-NEXT: retq # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/apx/xor.ll b/llvm/test/CodeGen/X86/apx/xor.ll
index e953902aff7ea..18e47149e9aba 100644
--- a/llvm/test/CodeGen/X86/apx/xor.ll
+++ b/llvm/test/CodeGen/X86/apx/xor.ll
@@ -1,5 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NDD
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,IMM,IMMONLY
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,MEM,MEMONLY
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,+prefer-ndd-imm,+prefer-ndd-mem -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,IMM,MEM
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd,nf -x86-enable-apx-for-relocation=true -verify-machineinstrs --show-mc-encoding | FileCheck --check-prefix=NF %s
@@ -68,14 +71,27 @@ entry:
}
define i8 @xor8rm(i8 noundef %a, ptr %b) {
-; CHECK-LABEL: xor8rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x32,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xor8rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; NDD-NEXT: xorb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x30,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xor8rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; IMMONLY-NEXT: xorb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x30,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xor8rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x32,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xor8rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} xorb (%rsi), %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x32,0x3e]
+; NF-NEXT: movzbl (%rsi), %eax # encoding: [0x0f,0xb6,0x06]
+; NF-NEXT: xorb %dil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x30,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t = load i8, ptr %b
@@ -84,14 +100,27 @@ entry:
}
define i16 @xor16rm(i16 noundef %a, ptr %b) {
-; CHECK-LABEL: xor16rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x33,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xor16rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; NDD-NEXT: xorw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x31,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xor16rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; IMMONLY-NEXT: xorw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x31,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xor16rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x33,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xor16rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} xorw (%rsi), %di, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x33,0x3e]
+; NF-NEXT: movzwl (%rsi), %eax # encoding: [0x0f,0xb7,0x06]
+; NF-NEXT: xorw %di, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x31,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t = load i16, ptr %b
@@ -100,14 +129,27 @@ entry:
}
define i32 @xor32rm(i32 noundef %a, ptr %b) {
-; CHECK-LABEL: xor32rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x33,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xor32rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NDD-NEXT: xorl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x31,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xor32rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; IMMONLY-NEXT: xorl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x31,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xor32rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x33,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xor32rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} xorl (%rsi), %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x33,0x3e]
+; NF-NEXT: movl (%rsi), %eax # encoding: [0x8b,0x06]
+; NF-NEXT: xorl %edi, %eax # EVEX TO LEGACY Compression encoding: [0x31,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t = load i32, ptr %b
@@ -116,14 +158,27 @@ entry:
}
define i64 @xor64rm(i64 noundef %a, ptr %b) {
-; CHECK-LABEL: xor64rm:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x33,0x3e]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xor64rm:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; NDD-NEXT: xorq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x31,0xf8]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xor64rm:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; IMMONLY-NEXT: xorq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x31,0xf8]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xor64rm:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x33,0x3e]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xor64rm:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} xorq (%rsi), %rdi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x33,0x3e]
+; NF-NEXT: movq (%rsi), %rax # encoding: [0x48,0x8b,0x06]
+; NF-NEXT: xorq %rdi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x31,0xf8]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t = load i64, ptr %b
@@ -243,14 +298,27 @@ entry:
}
define i8 @xor8mr(ptr %a, i8 noundef %b) {
-; CHECK-LABEL: xor8mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorb (%rdi), %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x32,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xor8mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: xorb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x30,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xor8mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; IMMONLY-NEXT: xorb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x30,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xor8mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorb (%rdi), %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x32,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xor8mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} xorb (%rdi), %sil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x32,0x37]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT: xorb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x30,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i8, ptr %a
@@ -259,14 +327,27 @@ entry:
}
define i16 @xor16mr(ptr %a, i16 noundef %b) {
-; CHECK-LABEL: xor16mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorw (%rdi), %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x33,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xor16mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: xorw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x31,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xor16mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: xorw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x31,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xor16mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorw (%rdi), %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x33,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xor16mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} xorw (%rdi), %si, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x33,0x37]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: xorw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x31,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i16, ptr %a
@@ -275,14 +356,27 @@ entry:
}
define i32 @xor32mr(ptr %a, i32 noundef %b) {
-; CHECK-LABEL: xor32mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorl (%rdi), %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x33,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xor32mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: xorl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x31,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xor32mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: xorl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x31,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xor32mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorl (%rdi), %esi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x33,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xor32mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} xorl (%rdi), %esi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x33,0x37]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: xorl %esi, %eax # EVEX TO LEGACY Compression encoding: [0x31,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i32, ptr %a
@@ -291,14 +385,27 @@ entry:
}
define i64 @xor64mr(ptr %a, i64 noundef %b) {
-; CHECK-LABEL: xor64mr:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorq (%rdi), %rsi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x33,0x37]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xor64mr:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: xorq %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x31,0xf0]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xor64mr:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: xorq %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x31,0xf0]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xor64mr:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorq (%rdi), %rsi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x33,0x37]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xor64mr:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} xorq (%rdi), %rsi, %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x33,0x37]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: xorq %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x31,0xf0]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i64, ptr %a
@@ -307,14 +414,27 @@ entry:
}
define i16 @xor16mi8(ptr %a) {
-; CHECK-LABEL: xor16mi8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorw $123, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0x37,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xor16mi8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: xorw $123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xf0,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xor16mi8:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: xorw $123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xf0,0x7b]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xor16mi8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorw $123, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x83,0x37,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xor16mi8:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} xorw $123, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x83,0x37,0x7b]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: xorw $123, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x83,0xf0,0x7b]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i16, ptr %a
@@ -323,14 +443,27 @@ entry:
}
define i32 @xor32mi8(ptr %a) {
-; CHECK-LABEL: xor32mi8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x37,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xor32mi8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: xorl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xf0,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xor32mi8:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: xorl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xf0,0x7b]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xor32mi8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x83,0x37,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xor32mi8:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} xorl $123, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x83,0x37,0x7b]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: xorl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xf0,0x7b]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i32, ptr %a
@@ -339,14 +472,27 @@ entry:
}
define i64 @xor64mi8(ptr %a) {
-; CHECK-LABEL: xor64mi8:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x37,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xor64mi8:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: xorq $123, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xf0,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xor64mi8:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: xorq $123, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xf0,0x7b]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xor64mi8:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x83,0x37,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xor64mi8:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} xorq $123, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x83,0x37,0x7b]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: xorq $123, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xf0,0x7b]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i64, ptr %a
@@ -355,14 +501,27 @@ entry:
}
define i8 @xor8mi(ptr %a) {
-; CHECK-LABEL: xor8mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x37,0x7b]
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xor8mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: xorb $123, %al # EVEX TO LEGACY Compression encoding: [0x34,0x7b]
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xor8mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; IMMONLY-NEXT: xorb $123, %al # EVEX TO LEGACY Compression encoding: [0x34,0x7b]
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xor8mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x37,0x7b]
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xor8mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} xorb $123, (%rdi), %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x80,0x37,0x7b]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT: xorb $123, %al # EVEX TO LEGACY Compression encoding: [0x34,0x7b]
; NF-NEXT: retq # encoding: [0xc3]
entry:
%t= load i8, ptr %a
@@ -371,15 +530,30 @@ entry:
}
define i16 @xor16mi(ptr %a) {
-; CHECK-LABEL: xor16mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorw $1234, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x37,0xd2,0x04]
-; CHECK-NEXT: # imm = 0x4D2
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xor16mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: xorw $1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x35,0xd2,0x04]
+; NDD-NEXT: # imm = 0x4D2
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xor16mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: xorw $1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x35,0xd2,0x04]
+; IMMONLY-NEXT: # imm = 0x4D2
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xor16mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorw $1234, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x37,0xd2,0x04]
+; MEM-NEXT: # imm = 0x4D2
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xor16mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} xorw $1234, (%rdi), %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x81,0x37,0xd2,0x04]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: xorw $1234, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x35,0xd2,0x04]
; NF-NEXT: # imm = 0x4D2
; NF-NEXT: retq # encoding: [0xc3]
entry:
@@ -389,15 +563,30 @@ entry:
}
define i32 @xor32mi(ptr %a) {
-; CHECK-LABEL: xor32mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x37,0x40,0xe2,0x01,0x00]
-; CHECK-NEXT: # imm = 0x1E240
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xor32mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: xorl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x35,0x40,0xe2,0x01,0x00]
+; NDD-NEXT: # imm = 0x1E240
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xor32mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: xorl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x35,0x40,0xe2,0x01,0x00]
+; IMMONLY-NEXT: # imm = 0x1E240
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xor32mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x37,0x40,0xe2,0x01,0x00]
+; MEM-NEXT: # imm = 0x1E240
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xor32mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} xorl $123456, (%rdi), %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x81,0x37,0x40,0xe2,0x01,0x00]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: xorl $123456, %eax # EVEX TO LEGACY Compression encoding: [0x35,0x40,0xe2,0x01,0x00]
; NF-NEXT: # imm = 0x1E240
; NF-NEXT: retq # encoding: [0xc3]
entry:
@@ -407,15 +596,30 @@ entry:
}
define i64 @xor64mi(ptr %a) {
-; CHECK-LABEL: xor64mi:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x37,0x40,0xe2,0x01,0x00]
-; CHECK-NEXT: # imm = 0x1E240
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xor64mi:
+; NDD: # %bb.0: # %entry
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: xorq $123456, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x35,0x40,0xe2,0x01,0x00]
+; NDD-NEXT: # imm = 0x1E240
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xor64mi:
+; IMMONLY: # %bb.0: # %entry
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: xorq $123456, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x35,0x40,0xe2,0x01,0x00]
+; IMMONLY-NEXT: # imm = 0x1E240
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xor64mi:
+; MEM: # %bb.0: # %entry
+; MEM-NEXT: xorq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x37,0x40,0xe2,0x01,0x00]
+; MEM-NEXT: # imm = 0x1E240
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xor64mi:
; NF: # %bb.0: # %entry
-; NF-NEXT: {nf} xorq $123456, (%rdi), %rax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0xfc,0x1c,0x81,0x37,0x40,0xe2,0x01,0x00]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: xorq $123456, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x35,0x40,0xe2,0x01,0x00]
; NF-NEXT: # imm = 0x1E240
; NF-NEXT: retq # encoding: [0xc3]
entry:
@@ -521,18 +725,39 @@ define i1 @xorflag64rr(i64 %a, i64 %b) {
}
define i1 @xorflag8rm(ptr %ptr, i8 %b) {
-; CHECK-LABEL: xorflag8rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorb (%rdi), %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x32,0x37]
-; CHECK-NEXT: xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xorflag8rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NDD-NEXT: xorb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x30,0xf0]
+; NDD-NEXT: xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff]
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xorflag8rm:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; IMMONLY-NEXT: xorb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x30,0xf0]
+; IMMONLY-NEXT: xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff]
+; IMMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMMONLY-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; IMMONLY-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xorflag8rm:
+; MEM: # %bb.0:
+; MEM-NEXT: xorb (%rdi), %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x32,0x37]
+; MEM-NEXT: xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff]
+; MEM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEM-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
+; MEM-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xorflag8rm:
; NF: # %bb.0:
-; NF-NEXT: {nf} xorb (%rdi), %sil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x32,0x37]
+; NF-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07]
+; NF-NEXT: xorb %sil, %al # EVEX TO LEGACY Compression encoding: [0x40,0x30,0xf0]
; NF-NEXT: xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff]
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
@@ -547,18 +772,39 @@ define i1 @xorflag8rm(ptr %ptr, i8 %b) {
}
define i1 @xorflag16rm(ptr %ptr, i16 %b) {
-; CHECK-LABEL: xorflag16rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorw (%rdi), %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x33,0x37]
-; CHECK-NEXT: xorw $-1, %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf0,0xff]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xorflag16rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NDD-NEXT: xorw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x31,0xf0]
+; NDD-NEXT: xorw $-1, %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf0,0xff]
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xorflag16rm:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; IMMONLY-NEXT: xorw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x31,0xf0]
+; IMMONLY-NEXT: xorw $-1, %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf0,0xff]
+; IMMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMMONLY-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; IMMONLY-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xorflag16rm:
+; MEM: # %bb.0:
+; MEM-NEXT: xorw (%rdi), %si, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x33,0x37]
+; MEM-NEXT: xorw $-1, %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf0,0xff]
+; MEM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEM-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
+; MEM-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xorflag16rm:
; NF: # %bb.0:
-; NF-NEXT: {nf} xorw (%rdi), %si, %ax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7d,0x1c,0x33,0x37]
+; NF-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07]
+; NF-NEXT: xorw %si, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x31,0xf0]
; NF-NEXT: xorw $-1, %ax, %cx # encoding: [0x62,0xf4,0x75,0x18,0x83,0xf0,0xff]
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movw %cx, d64(%rip) # encoding: [0x66,0x89,0x0d,A,A,A,A]
@@ -573,17 +819,36 @@ define i1 @xorflag16rm(ptr %ptr, i16 %b) {
}
define i1 @xorflag32rm(ptr %ptr, i32 %b) {
-; CHECK-LABEL: xorflag32rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorl (%rdi), %esi # EVEX TO LEGACY Compression encoding: [0x33,0x37]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xorflag32rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NDD-NEXT: xorl %eax, %esi # EVEX TO LEGACY Compression encoding: [0x31,0xc6]
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xorflag32rm:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; IMMONLY-NEXT: xorl %eax, %esi # EVEX TO LEGACY Compression encoding: [0x31,0xc6]
+; IMMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMMONLY-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A]
+; IMMONLY-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xorflag32rm:
+; MEM: # %bb.0:
+; MEM-NEXT: xorl (%rdi), %esi # EVEX TO LEGACY Compression encoding: [0x33,0x37]
+; MEM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEM-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A]
+; MEM-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xorflag32rm:
; NF: # %bb.0:
-; NF-NEXT: xorl (%rdi), %esi # EVEX TO LEGACY Compression encoding: [0x33,0x37]
+; NF-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07]
+; NF-NEXT: xorl %eax, %esi # EVEX TO LEGACY Compression encoding: [0x31,0xc6]
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movl %esi, d64(%rip) # encoding: [0x89,0x35,A,A,A,A]
; NF-NEXT: # fixup A - offset: 2, value: d64, kind: reloc_riprel_4byte
@@ -596,17 +861,36 @@ define i1 @xorflag32rm(ptr %ptr, i32 %b) {
}
define i1 @xorflag64rm(ptr %ptr, i64 %b) {
-; CHECK-LABEL: xorflag64rm:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorq (%rdi), %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x33,0x37]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xorflag64rm:
+; NDD: # %bb.0:
+; NDD-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NDD-NEXT: xorq %rax, %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x31,0xc6]
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMMONLY-LABEL: xorflag64rm:
+; IMMONLY: # %bb.0:
+; IMMONLY-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; IMMONLY-NEXT: xorq %rax, %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x31,0xc6]
+; IMMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMMONLY-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A]
+; IMMONLY-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; IMMONLY-NEXT: retq # encoding: [0xc3]
+;
+; MEM-LABEL: xorflag64rm:
+; MEM: # %bb.0:
+; MEM-NEXT: xorq (%rdi), %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x33,0x37]
+; MEM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEM-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A]
+; MEM-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; MEM-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xorflag64rm:
; NF: # %bb.0:
-; NF-NEXT: xorq (%rdi), %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x33,0x37]
+; NF-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
+; NF-NEXT: xorq %rax, %rsi # EVEX TO LEGACY Compression encoding: [0x48,0x31,0xc6]
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movq %rsi, d64(%rip) # encoding: [0x48,0x89,0x35,A,A,A,A]
; NF-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
@@ -691,18 +975,36 @@ define i1 @xorflag32ri(i32 %a) {
}
define i1 @xorflag64ri(i64 %a) {
-; CHECK-LABEL: xorflag64ri:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xf7,0x40,0xe2,0x01,0x00]
-; CHECK-NEXT: # imm = 0x1E240
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xorflag64ri:
+; NDD: # %bb.0:
+; NDD-NEXT: xorq $123456, %rdi # encoding: [0x48,0x81,0xf7,0x40,0xe2,0x01,0x00]
+; NDD-NEXT: # imm = 0x1E240
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMM-LABEL: xorflag64ri:
+; IMM: # %bb.0:
+; IMM-NEXT: xorq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xf7,0x40,0xe2,0x01,0x00]
+; IMM-NEXT: # imm = 0x1E240
+; IMM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMM-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; IMM-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; IMM-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: xorflag64ri:
+; MEMONLY: # %bb.0:
+; MEMONLY-NEXT: xorq $123456, %rdi # encoding: [0x48,0x81,0xf7,0x40,0xe2,0x01,0x00]
+; MEMONLY-NEXT: # imm = 0x1E240
+; MEMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEMONLY-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; MEMONLY-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; MEMONLY-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xorflag64ri:
; NF: # %bb.0:
-; NF-NEXT: xorq $123456, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x81,0xf7,0x40,0xe2,0x01,0x00]
+; NF-NEXT: xorq $123456, %rdi # encoding: [0x48,0x81,0xf7,0x40,0xe2,0x01,0x00]
; NF-NEXT: # imm = 0x1E240
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
@@ -760,17 +1062,33 @@ define i1 @xorflag32ri8(i32 %a) {
}
define i1 @xorflag64ri8(i64 %a) {
-; CHECK-LABEL: xorflag64ri8:
-; CHECK: # %bb.0:
-; CHECK-NEXT: xorq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xf7,0x7b]
-; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
-; CHECK-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
-; CHECK-NEXT: retq # encoding: [0xc3]
+; NDD-LABEL: xorflag64ri8:
+; NDD: # %bb.0:
+; NDD-NEXT: xorq $123, %rdi # encoding: [0x48,0x83,0xf7,0x7b]
+; NDD-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; NDD-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; NDD-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; NDD-NEXT: retq # encoding: [0xc3]
+;
+; IMM-LABEL: xorflag64ri8:
+; IMM: # %bb.0:
+; IMM-NEXT: xorq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xf7,0x7b]
+; IMM-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; IMM-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; IMM-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; IMM-NEXT: retq # encoding: [0xc3]
+;
+; MEMONLY-LABEL: xorflag64ri8:
+; MEMONLY: # %bb.0:
+; MEMONLY-NEXT: xorq $123, %rdi # encoding: [0x48,0x83,0xf7,0x7b]
+; MEMONLY-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
+; MEMONLY-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
+; MEMONLY-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
+; MEMONLY-NEXT: retq # encoding: [0xc3]
;
; NF-LABEL: xorflag64ri8:
; NF: # %bb.0:
-; NF-NEXT: xorq $123, %rdi # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xf7,0x7b]
+; NF-NEXT: xorq $123, %rdi # encoding: [0x48,0x83,0xf7,0x7b]
; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0]
; NF-NEXT: movq %rdi, d64(%rip) # encoding: [0x48,0x89,0x3d,A,A,A,A]
; NF-NEXT: # fixup A - offset: 3, value: d64, kind: reloc_riprel_4byte
diff --git a/llvm/test/CodeGen/X86/asm-modifier-error.ll b/llvm/test/CodeGen/X86/asm-modifier-error.ll
index bea96a8253ddc..3f193dca5c804 100644
--- a/llvm/test/CodeGen/X86/asm-modifier-error.ll
+++ b/llvm/test/CodeGen/X86/asm-modifier-error.ll
@@ -7,6 +7,12 @@ entry:
ret void
}
+; CHECK: error: invalid operand in inline asm: '#TEST $0'
+define void @test_p_no_modifier(ptr %p) {
+ call void asm sideeffect "#TEST $0", "p,~{dirflag},~{fpsr},~{flags}"(ptr %p)
+ ret void
+}
+
; CHECK: error: invalid operand in inline asm: '#TEST ${0:a}'
define void @test_a_m(ptr %p) {
call void asm sideeffect "#TEST ${0:a}", "*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) %p)
diff --git a/llvm/test/CodeGen/X86/avx2-masked-gather.ll b/llvm/test/CodeGen/X86/avx2-masked-gather.ll
index 169c0313d6a45..2429536bdb15f 100644
--- a/llvm/test/CodeGen/X86/avx2-masked-gather.ll
+++ b/llvm/test/CodeGen/X86/avx2-masked-gather.ll
@@ -9,8 +9,7 @@ declare <2 x i32> @llvm.masked.gather.v2i32(<2 x ptr> %ptrs, i32 %align, <2 x i1
define <2 x i32> @masked_gather_v2i32(ptr %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
; X86-LABEL: masked_gather_v2i32:
; X86: # %bb.0: # %entry
-; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,3]
+; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; X86-NEXT: vpslld $31, %xmm0, %xmm0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
@@ -59,8 +58,7 @@ entry:
define <4 x i32> @masked_gather_v2i32_concat(ptr %ptr, <2 x i1> %masks, <2 x i32> %passthro) {
; X86-LABEL: masked_gather_v2i32_concat:
; X86: # %bb.0: # %entry
-; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,3]
+; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; X86-NEXT: vpslld $31, %xmm0, %xmm0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
@@ -112,8 +110,7 @@ declare <2 x float> @llvm.masked.gather.v2float(<2 x ptr> %ptrs, i32 %align, <2
define <2 x float> @masked_gather_v2float(ptr %ptr, <2 x i1> %masks, <2 x float> %passthro) {
; X86-LABEL: masked_gather_v2float:
; X86: # %bb.0: # %entry
-; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,3]
+; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; X86-NEXT: vpslld $31, %xmm0, %xmm0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
@@ -163,8 +160,7 @@ entry:
define <4 x float> @masked_gather_v2float_concat(ptr %ptr, <2 x i1> %masks, <2 x float> %passthro) {
; X86-LABEL: masked_gather_v2float_concat:
; X86: # %bb.0: # %entry
-; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[2,3]
+; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; X86-NEXT: vpslld $31, %xmm0, %xmm0
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
diff --git a/llvm/test/CodeGen/X86/byte-constants.ll b/llvm/test/CodeGen/X86/byte-constants.ll
new file mode 100644
index 0000000000000..ca0a6e974b512
--- /dev/null
+++ b/llvm/test/CodeGen/X86/byte-constants.ll
@@ -0,0 +1,63 @@
+; RUN: llc -mtriple=x86_64-linux < %s | FileCheck %s
+
+; CHECK: .globl x
+; CHECK: x:
+; CHECK: .quad 10
+
+@x = global b64 bitcast (i64 10 to b64)
+
+; CHECK: .globl b
+; CHECK: b:
+; CHECK: .byte 1
+
+@b = global b1 1
+
+; CHECK: .globl f
+; CHECK: f:
+; CHECK: .byte 31
+
+@f = global b5 31
+
+; CHECK: .globl r
+; CHECK: r:
+; CHECK: .long 42
+
+@r = global b32 42
+
+; CHECK: .globl w
+; CHECK: w:
+; CHECK: .quad -1
+; CHECK: .quad -1
+
+@w = global b128 -1
+
+; CHECK: .globl uw
+; CHECK: uw:
+; CHECK: .quad -1
+; CHECK: .quad -1
+; CHECK: .quad -1
+; CHECK: .quad -1
+
+@uw = global b256 -1
+
+; CHECK: .globl v
+; CHECK: v:
+; CHECK: .byte 1
+; CHECK: .byte 2
+; CHECK: .byte 3
+; CHECK: .byte 4
+
+@v = global <4 x b8> <b8 1, b8 2, b8 3, b8 4>
+
+; CHECK: .globl uv
+; CHECK: uv:
+; CHECK: .quad -1
+; CHECK: .quad -1
+; CHECK: .quad -1
+; CHECK: .quad -1
+; CHECK: .quad -1
+; CHECK: .quad -1
+; CHECK: .quad -1
+; CHECK: .quad -1
+
+@uv = global <4 x b128> <b128 -1, b128 -1, b128 -1, b128 -1>
diff --git a/llvm/test/CodeGen/X86/cmp.ll b/llvm/test/CodeGen/X86/cmp.ll
index ed3f0e0f0aa71..86610cc052ade 100644
--- a/llvm/test/CodeGen/X86/cmp.ll
+++ b/llvm/test/CodeGen/X86/cmp.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -show-mc-encoding | FileCheck %s --check-prefixes=CHECK,NO-NDD
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ndd -show-mc-encoding | FileCheck --check-prefixes=CHECK,NDD %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ndd,+prefer-ndd-imm -show-mc-encoding | FileCheck --check-prefixes=CHECK,NDD %s
@d = dso_local global i8 0, align 1
@d64 = dso_local global i64 0
diff --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index ac78136b9d8ea..f15697aaf2df2 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -730,9 +730,9 @@ define i64 @urem_i64_12(i64 %x) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrl $2, %eax
-; X86-NEXT: shldl $30, %esi, %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: shrdl $2, %ecx, %eax
+; X86-NEXT: shrl $2, %ecx
; X86-NEXT: addl %eax, %ecx
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl $-1431655765, %edx # imm = 0xAAAAAAAB
diff --git a/llvm/test/CodeGen/X86/divmod128.ll b/llvm/test/CodeGen/X86/divmod128.ll
index 3796dd796eaf9..af37be791d27f 100644
--- a/llvm/test/CodeGen/X86/divmod128.ll
+++ b/llvm/test/CodeGen/X86/divmod128.ll
@@ -425,8 +425,8 @@ entry:
define i128 @urem_i128_12(i128 %x) nounwind {
; X86-64-LABEL: urem_i128_12:
; X86-64: # %bb.0: # %entry
-; X86-64-NEXT: movq %rsi, %rcx
-; X86-64-NEXT: shldq $62, %rdi, %rcx
+; X86-64-NEXT: movq %rdi, %rcx
+; X86-64-NEXT: shrdq $2, %rsi, %rcx
; X86-64-NEXT: shrq $2, %rsi
; X86-64-NEXT: addq %rsi, %rcx
; X86-64-NEXT: adcq $0, %rcx
@@ -443,8 +443,8 @@ define i128 @urem_i128_12(i128 %x) nounwind {
;
; WIN64-LABEL: urem_i128_12:
; WIN64: # %bb.0: # %entry
-; WIN64-NEXT: movq %rdx, %r8
-; WIN64-NEXT: shldq $62, %rcx, %r8
+; WIN64-NEXT: movq %rcx, %r8
+; WIN64-NEXT: shrdq $2, %rdx, %r8
; WIN64-NEXT: shrq $2, %rdx
; WIN64-NEXT: addq %rdx, %r8
; WIN64-NEXT: adcq $0, %r8
diff --git a/llvm/test/CodeGen/X86/i1-fast-isel.ll b/llvm/test/CodeGen/X86/i1-fast-isel.ll
new file mode 100644
index 0000000000000..1f111129de13c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/i1-fast-isel.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --fast-isel < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+define i8 @test_direct_call(ptr %f) nounwind {
+; CHECK-LABEL: test_direct_call:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo@PLT
+; CHECK-NEXT: movzbl %al, %edi
+; CHECK-NEXT: callq bar@PLT
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+entry:
+ %call = call i1 @foo(ptr %f)
+ %call2 = call zeroext i8 @bar(i1 %call)
+ ret i8 %call2
+}
+
+define i8 @test_fast_direct_call(ptr %f) nounwind {
+; CHECK-LABEL: test_fast_direct_call:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: callq foo_fast@PLT
+; CHECK-NEXT: movzbl %al, %edi
+; CHECK-NEXT: callq bar@PLT
+; CHECK-NEXT: popq %rcx
+; CHECK-NEXT: retq
+entry:
+ %call = call fastcc i1 @foo_fast(ptr %f)
+ %call2 = call zeroext i8 @bar(i1 %call)
+ ret i8 %call2
+}
+
+define i8 @test_indirect_all(ptr %fptr, ptr %f) nounwind {
+; CHECK-LABEL: test_indirect_all:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movq %rsi, %rdi
+; CHECK-NEXT: callq foo@PLT
+; CHECK-NEXT: movzbl %al, %edi
+; CHECK-NEXT: callq *%rbx
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+entry:
+ %call = call i1 @foo(ptr %f)
+ %call2 = call zeroext i8 %fptr(i1 %call)
+ ret i8 %call2
+}
+
+define i8 @test_indirect_all2(ptr %fptr, ptr %f, i1 %cond) nounwind {
+; CHECK-LABEL: test_indirect_all2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbp
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: pushq %rax
+; CHECK-NEXT: movl %edx, %ebp
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movq %rsi, %rdi
+; CHECK-NEXT: callq foo@PLT
+; CHECK-NEXT: testb $1, %bpl
+; CHECK-NEXT: je .LBB3_2
+; CHECK-NEXT: # %bb.1: # %exit
+; CHECK-NEXT: movzbl %al, %edi
+; CHECK-NEXT: callq *%rbx
+; CHECK-NEXT: jmp .LBB3_3
+; CHECK-NEXT: .LBB3_2: # %exit2
+; CHECK-NEXT: movb $3, %al
+; CHECK-NEXT: .LBB3_3: # %exit2
+; CHECK-NEXT: addq $8, %rsp
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: popq %rbp
+; CHECK-NEXT: retq
+entry:
+ %call = call i1 @foo(ptr %f)
+ br i1 %cond, label %exit, label %exit2
+
+exit:
+ %call2 = call zeroext i8 %fptr(i1 %call)
+ ret i8 %call2
+
+exit2:
+ ret i8 3
+}
+
+
+define i8 @test_fast_indirect_all(ptr %fptr, ptr %f) nounwind {
+; CHECK-LABEL: test_fast_indirect_all:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT: movq %rsi, %rdi
+; CHECK-NEXT: callq foo@PLT
+; CHECK-NEXT: movzbl %al, %edi
+; CHECK-NEXT: callq *%rbx
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+entry:
+ %call = call fastcc i1 @foo(ptr %f)
+ %call2 = call zeroext i8 %fptr(i1 %call)
+ ret i8 %call2
+}
+
+declare i1 @foo(ptr %f)
+declare zeroext i8 @bar(i1)
+declare fastcc i1 @foo_fast(ptr %f)
diff --git a/llvm/test/CodeGen/X86/inline-asm-p-constraint.ll b/llvm/test/CodeGen/X86/inline-asm-p-constraint.ll
index 50185343662b7..ce354f93fb02b 100644
--- a/llvm/test/CodeGen/X86/inline-asm-p-constraint.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-p-constraint.ll
@@ -7,7 +7,7 @@ define ptr @foo(ptr %Ptr) {
%Ptr.addr = alloca ptr, align 8
store ptr %Ptr, ptr %Ptr.addr, align 8
; CHECK: movq %rdi, -8(%rsp)
- %1 = tail call ptr asm "mov $1, $0\0A\09lea $2, $0", "=r,p,*m,~{dirflag},~{fpsr},~{flags}"(ptr %Ptr, ptr elementtype(ptr) %Ptr.addr)
+ %1 = tail call ptr asm "mov ${1:a}, $0\0A\09lea $2, $0", "=r,p,*m,~{dirflag},~{fpsr},~{flags}"(ptr %Ptr, ptr elementtype(ptr) %Ptr.addr)
; CHECK-NEXT: #APP
; CHECK-NEXT: mov (%rdi), %rax
; CHECK-NEXT: lea -8(%rsp), %rax
@@ -17,10 +17,9 @@ define ptr @foo(ptr %Ptr) {
}
define void @intptr() {
-; Don't assert on a non-ptr operand, existing code & gcc accept these.
entry:
; CHECK-LABEL: intptr:
; CHECK: ud1l 49150(%eax), %eax
- call void asm "ud1l $0(%eax), %eax", "p,~{dirflag},~{fpsr},~{flags}"(i32 49150)
+ call void asm "ud1l ${0:a}(%eax), %eax", "p,~{dirflag},~{fpsr},~{flags}"(i32 49150)
unreachable
}
diff --git a/llvm/test/CodeGen/X86/interleave-load-fold.ll b/llvm/test/CodeGen/X86/interleave-load-fold.ll
index 28f313bf6a0fa..e2430ff5e1c03 100644
--- a/llvm/test/CodeGen/X86/interleave-load-fold.ll
+++ b/llvm/test/CodeGen/X86/interleave-load-fold.ll
@@ -5,8 +5,10 @@ define <16 x i8> @interleave_masked_select(ptr %mask, ptr %src) nounwind {
; X64-LABEL: interleave_masked_select:
; X64: # %bb.0:
; X64-NEXT: kmovw (%rdi), %k1
-; X64-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; X64-NEXT: vpunpcklbw {{.*#+}} xmm0 {%k1} {z} = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
+; X64-NEXT: vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
+; X64-NEXT: vmovdqu8 (%rsi), %xmm0 {%k1}
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT: vpunpcklbw {{.*#+}} xmm0 {%k1} {z} = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; X64-NEXT: retq
%mask_vec = load <16 x i1>, ptr %mask
%vec2 = load <16 x i8>, ptr %src
@@ -19,12 +21,13 @@ define <16 x i8> @interleave_masked_select(ptr %mask, ptr %src) nounwind {
define <16 x i1> @interleave_masked_blend(i16 %mask, ptr %src1, ptr %src2) nounwind {
; X64-LABEL: interleave_masked_blend:
; X64: # %bb.0:
-; X64-NEXT: vmovdqa (%rsi), %xmm0
; X64-NEXT: kmovd %edi, %k1
-; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X64-NEXT: vpunpcklbw {{.*#+}} xmm2 {%k1} {z} = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; X64-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X64-NEXT: vpcmpeqb %xmm0, %xmm2, %xmm0
+; X64-NEXT: vmovdqa (%rsi), %xmm0
+; X64-NEXT: vpblendmb (%rdx), %xmm0, %xmm1 {%k1}
+; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-NEXT: vpunpcklbw {{.*#+}} xmm1 {%k1} {z} = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; X64-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; X64-NEXT: vpcmpeqb %xmm0, %xmm1, %xmm0
; X64-NEXT: retq
%mask_vec = bitcast i16 %mask to <16 x i1>
%vec1 = load <16 x i8>, ptr %src1
diff --git a/llvm/test/CodeGen/X86/known-never-zero.ll b/llvm/test/CodeGen/X86/known-never-zero.ll
index 8327a90bdeda5..d98bb21bb1656 100644
--- a/llvm/test/CodeGen/X86/known-never-zero.ll
+++ b/llvm/test/CodeGen/X86/known-never-zero.ll
@@ -2704,6 +2704,55 @@ define i32 @sext_known_nonzero(i16 %xx) {
ret i32 %r
}
+define i32 @sext_known_nonzero_vec(<8 x i16> %xx, ptr %p) {
+; X86-LABEL: sext_known_nonzero_vec:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: pxor %xmm1, %xmm1
+; X86-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; X86-NEXT: pslld $23, %xmm0
+; X86-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: cvttps2dq %xmm0, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,2,4,5,6,7]
+; X86-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,4]
+; X86-NEXT: psrad $16, %xmm0
+; X86-NEXT: movdqa %xmm1, 16(%eax)
+; X86-NEXT: movdqa %xmm0, (%eax)
+; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
+; X86-NEXT: movd %xmm0, %eax
+; X86-NEXT: bsfl %eax, %ecx
+; X86-NEXT: movl $32, %eax
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: retl
+;
+; X64-LABEL: sext_known_nonzero_vec:
+; X64: # %bb.0:
+; X64-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-NEXT: vpslld $23, %xmm0, %xmm0
+; X64-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vcvttps2dq %xmm0, %xmm0
+; X64-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7]
+; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; X64-NEXT: vpmovsxwd %xmm0, %xmm0
+; X64-NEXT: vmovdqa %xmm1, 16(%rdi)
+; X64-NEXT: vmovdqa %xmm0, (%rdi)
+; X64-NEXT: vpextrd $1, %xmm0, %ecx
+; X64-NEXT: movl $32, %eax
+; X64-NEXT: rep bsfl %ecx, %eax
+; X64-NEXT: retq
+ %x = shl <8 x i16> <i16 0, i16 0, i16 1, i16 0, i16 0, i16 0, i16 0, i16 0>, %xx
+ %s = shufflevector <8 x i16> %x, <8 x i16> poison, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+ %z = sext <8 x i16> %s to <8 x i32>
+ store <8 x i32> %z, ptr %p
+ %e = extractelement <8 x i32> %z, i32 1
+ %r = call i32 @llvm.cttz.i32(i32 %e, i1 false)
+ ret i32 %r
+}
+
define i32 @sext_maybe_zero(i16 %x) {
; X86-LABEL: sext_maybe_zero:
; X86: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll
index 66ceb7f9508f1..a15d63cfa1912 100644
--- a/llvm/test/CodeGen/X86/known-pow2.ll
+++ b/llvm/test/CodeGen/X86/known-pow2.ll
@@ -164,7 +164,6 @@ define i1 @pow2_srl(i32 %x, i32 %y) {
define i32 @pow2_srl_vec(<4 x i32> %x, <4 x i32> %y, i32 %z, ptr %p) {
; CHECK-LABEL: pow2_srl_vec:
; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[2,3,3,3,4,5,6,7]
; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [1048576,4294967295,4294967295,0]
@@ -173,7 +172,7 @@ define i32 @pow2_srl_vec(<4 x i32> %x, <4 x i32> %y, i32 %z, ptr %p) {
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,1,4,5,6,7]
; CHECK-NEXT: movdqa %xmm2, %xmm4
; CHECK-NEXT: psrld %xmm0, %xmm4
-; CHECK-NEXT: movd %xmm4, %ecx
+; CHECK-NEXT: movd %xmm4, %eax
; CHECK-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
@@ -181,9 +180,8 @@ define i32 @pow2_srl_vec(<4 x i32> %x, <4 x i32> %y, i32 %z, ptr %p) {
; CHECK-NEXT: psrldq {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3]
; CHECK-NEXT: movaps %xmm4, (%rsi)
-; CHECK-NEXT: xorl %edx, %edx
-; CHECK-NEXT: divl %ecx
-; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: decl %eax
+; CHECK-NEXT: andl %edi, %eax
; CHECK-NEXT: retq
%yy = and <4 x i32> %y, splat (i32 7)
%d = lshr <4 x i32> <i32 1048576, i32 -1, i32 -1, i32 0>, %yy
@@ -351,6 +349,41 @@ define i1 @pow2_umin(i32 %x, i32 %y) {
ret i1 %r
}
+define i32 @pow2_umin_vec(<4 x i32> %x, <4 x i32> %y, i32 %z, ptr %p) {
+; CHECK-LABEL: pow2_umin_vec:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: pslld $23, %xmm0
+; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: cvttps2dq %xmm0, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-NEXT: pcmpeqd %xmm2, %xmm2
+; CHECK-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,4294967295,4294967295,4294967295]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-NEXT: pxor %xmm0, %xmm1
+; CHECK-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pandn %xmm0, %xmm2
+; CHECK-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-NEXT: por %xmm2, %xmm1
+; CHECK-NEXT: movdqa %xmm1, (%rsi)
+; CHECK-NEXT: movd %xmm1, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
+ %yy = shl <4 x i32> <i32 1, i32 -1, i32 -1, i32 -1>, %x
+ %d = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %yy, <4 x i32> splat (i32 256))
+ store <4 x i32> %d, ptr %p
+ %elt = extractelement <4 x i32> %d, i32 0
+ %r = urem i32 %z, %elt
+ ret i32 %r
+}
+
define i1 @pow2_umin_fail0(i32 %x, i32 %y) {
; CHECK-LABEL: pow2_umin_fail0:
; CHECK: # %bb.0:
@@ -416,6 +449,46 @@ define i1 @pow2_umax(i32 %x, i32 %y, i32 %z) {
ret i1 %r
}
+define i32 @pow2_umax_vec(<4 x i32> %x, <4 x i32> %y, i32 %z, ptr %p) {
+; CHECK-LABEL: pow2_umax_vec:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,3,3,3,4,5,6,7]
+; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4096,4294967295,4294967295,4294967295]
+; CHECK-NEXT: movdqa %xmm2, %xmm3
+; CHECK-NEXT: psrld %xmm1, %xmm3
+; CHECK-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,1,1,4,5,6,7]
+; CHECK-NEXT: movdqa %xmm2, %xmm4
+; CHECK-NEXT: psrld %xmm1, %xmm4
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; CHECK-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,3,3,3,4,5,6,7]
+; CHECK-NEXT: movdqa %xmm2, %xmm3
+; CHECK-NEXT: psrld %xmm1, %xmm3
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
+; CHECK-NEXT: psrld %xmm0, %xmm2
+; CHECK-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; CHECK-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3]
+; CHECK-NEXT: movaps {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
+; CHECK-NEXT: xorps %xmm4, %xmm0
+; CHECK-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: andps %xmm0, %xmm4
+; CHECK-NEXT: andnps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: orps %xmm4, %xmm0
+; CHECK-NEXT: movaps %xmm0, (%rsi)
+; CHECK-NEXT: movd %xmm0, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
+ %yy = lshr <4 x i32> <i32 4096, i32 -1, i32 -1, i32 -1>, %x
+ %d = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %yy, <4 x i32> splat (i32 256))
+ store <4 x i32> %d, ptr %p
+ %elt = extractelement <4 x i32> %d, i32 0
+ %r = urem i32 %z, %elt
+ ret i32 %r
+}
+
define i1 @pow2_umax_fail0(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: pow2_umax_fail0:
; CHECK: # %bb.0:
@@ -484,6 +557,40 @@ define i1 @pow2_smin(i32 %x, i32 %y) {
ret i1 %r
}
+define i32 @pow2_smin_vec(<4 x i32> %x, <4 x i32> %y, i32 %z, ptr %p) {
+; CHECK-LABEL: pow2_smin_vec:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: pslld $23, %xmm0
+; CHECK-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-NEXT: cvttps2dq %xmm0, %xmm0
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-NEXT: pcmpeqd %xmm2, %xmm2
+; CHECK-NEXT: pmuludq %xmm1, %xmm2
+; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; CHECK-NEXT: pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,4294967295,4294967295,4294967295]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [256,256,256,256]
+; CHECK-NEXT: movdqa %xmm1, %xmm2
+; CHECK-NEXT: pcmpgtd %xmm0, %xmm2
+; CHECK-NEXT: pand %xmm2, %xmm0
+; CHECK-NEXT: pandn %xmm1, %xmm2
+; CHECK-NEXT: por %xmm0, %xmm2
+; CHECK-NEXT: movdqa %xmm2, (%rsi)
+; CHECK-NEXT: movd %xmm2, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
+ %yy = shl <4 x i32> <i32 1, i32 -1, i32 -1, i32 -1>, %x
+ %d = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %yy, <4 x i32> splat (i32 256))
+ store <4 x i32> %d, ptr %p
+ %elt = extractelement <4 x i32> %d, i32 0
+ %r = urem i32 %z, %elt
+ ret i32 %r
+}
+
define i1 @pow2_smin_fail0(i32 %x, i32 %y) {
; CHECK-LABEL: pow2_smin_fail0:
; CHECK: # %bb.0:
@@ -549,6 +656,46 @@ define i1 @pow2_smax(i32 %x, i32 %y, i32 %z) {
ret i1 %r
}
+define i32 @pow2_smax_vec(<4 x i32> %x, <4 x i32> %y, i32 %z, ptr %p) {
+; CHECK-LABEL: pow2_smax_vec:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,3,3,3,4,5,6,7]
+; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [4096,4294967295,4294967295,4294967295]
+; CHECK-NEXT: movdqa %xmm2, %xmm3
+; CHECK-NEXT: psrld %xmm1, %xmm3
+; CHECK-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,1,1,1,4,5,6,7]
+; CHECK-NEXT: movdqa %xmm2, %xmm4
+; CHECK-NEXT: psrld %xmm1, %xmm4
+; CHECK-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; CHECK-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,3,3,3,4,5,6,7]
+; CHECK-NEXT: movdqa %xmm2, %xmm3
+; CHECK-NEXT: psrld %xmm1, %xmm3
+; CHECK-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,1,4,5,6,7]
+; CHECK-NEXT: psrld %xmm0, %xmm2
+; CHECK-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm3[1]
+; CHECK-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm2[0,3]
+; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [256,256,256,256]
+; CHECK-NEXT: movaps %xmm4, %xmm1
+; CHECK-NEXT: pcmpgtd %xmm0, %xmm1
+; CHECK-NEXT: andps %xmm1, %xmm4
+; CHECK-NEXT: pandn %xmm0, %xmm1
+; CHECK-NEXT: por %xmm4, %xmm1
+; CHECK-NEXT: movdqa %xmm1, (%rsi)
+; CHECK-NEXT: movd %xmm1, %ecx
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: retq
+ %yy = lshr <4 x i32> <i32 4096, i32 -1, i32 -1, i32 -1>, %x
+ %d = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %yy, <4 x i32> splat (i32 256))
+ store <4 x i32> %d, ptr %p
+ %elt = extractelement <4 x i32> %d, i32 0
+ %r = urem i32 %z, %elt
+ ret i32 %r
+}
+
define i1 @pow2_smax_fail0(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: pow2_smax_fail0:
; CHECK: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
index b4cab9a325c93..5b5280601ea71 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll
@@ -353,8 +353,7 @@ define <2 x i32> @test_gather_v2i32_data_index(ptr %base, <2 x i32> %ind, <2 x i
;
; WIDEN_AVX2-LABEL: test_gather_v2i32_data_index:
; WIDEN_AVX2: # %bb.0:
-; WIDEN_AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; WIDEN_AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[2,3]
+; WIDEN_AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; WIDEN_AVX2-NEXT: vpslld $31, %xmm1, %xmm1
; WIDEN_AVX2-NEXT: vpgatherdd %xmm1, (%rdi,%xmm0,4), %xmm2
; WIDEN_AVX2-NEXT: vmovdqa %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 651676ca7bee5..99a8918fef93f 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -1208,8 +1208,7 @@ define <2 x float> @load_v2f32_i2(i2 %trigger, ptr %addr, <2 x float> %dst) {
; AVX1OR2-NEXT: andb $1, %dil
; AVX1OR2-NEXT: vmovd %edi, %xmm1
; AVX1OR2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX1OR2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[2,3]
+; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; AVX1OR2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1OR2-NEXT: vmaskmovps (%rsi), %xmm1, %xmm2
; AVX1OR2-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/masked_packss.ll b/llvm/test/CodeGen/X86/masked_packss.ll
index d84eaeaae60b7..f4e9063a7baff 100644
--- a/llvm/test/CodeGen/X86/masked_packss.ll
+++ b/llvm/test/CodeGen/X86/masked_packss.ll
@@ -75,15 +75,8 @@ define <64 x i8> @_mm512_mask_packss_epi16_manual(<64 x i8> %src, i64 noundef %k
;
; AVX512-LABEL: _mm512_mask_packss_epi16_manual:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15]
-; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,9,2,3,10,11]
-; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm4
-; AVX512-NEXT: vpmovswb %zmm4, %ymm1
-; AVX512-NEXT: vpmovswb %zmm3, %ymm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512-NEXT: kmovq %rdi, %k1
-; AVX512-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpacksswb %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
%sh = shufflevector <32 x i16> %a, <32 x i16> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
%minv = tail call <64 x i16> @llvm.smax.v64i16(<64 x i16> %sh, <64 x i16> splat (i16 -128))
@@ -167,15 +160,8 @@ define <32 x i16> @_mm512_mask_packss_epi32_manual(<32 x i16> %src, i32 noundef
;
; AVX512-LABEL: _mm512_mask_packss_epi32_manual:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15]
-; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [0,1,8,9,2,3,10,11]
-; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm4
-; AVX512-NEXT: vpmovsdw %zmm4, %ymm1
-; AVX512-NEXT: vpmovsdw %zmm3, %ymm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512-NEXT: kmovd %edi, %k1
-; AVX512-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpackssdw %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
%sh = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
%minv = tail call <32 x i32> @llvm.smax.v32i32(<32 x i32> %sh, <32 x i32> splat (i32 -32768))
diff --git a/llvm/test/CodeGen/X86/masked_packus.ll b/llvm/test/CodeGen/X86/masked_packus.ll
index 52507542945c7..c0cb978f3a0dc 100644
--- a/llvm/test/CodeGen/X86/masked_packus.ll
+++ b/llvm/test/CodeGen/X86/masked_packus.ll
@@ -75,18 +75,8 @@ define <64 x i8> @_mm512_mask_packus_epi16_manual(<64 x i8> %src, i64 noundef %k
;
; AVX512-LABEL: _mm512_mask_packus_epi16_manual:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
-; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,5,12,13,6,7,14,15]
-; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm4
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpmaxsw %zmm1, %zmm4, %zmm2
-; AVX512-NEXT: vpmaxsw %zmm1, %zmm3, %zmm1
-; AVX512-NEXT: vpmovuswb %zmm1, %ymm1
-; AVX512-NEXT: vpmovuswb %zmm2, %ymm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512-NEXT: kmovq %rdi, %k1
-; AVX512-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpackuswb %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
%sh = shufflevector <32 x i16> %a, <32 x i16> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
%minv = tail call <64 x i16> @llvm.smax.v64i16(<64 x i16> %sh, <64 x i16> splat (i16 0))
@@ -170,18 +160,8 @@ define <32 x i16> @_mm512_mask_packus_epi32_manual(<32 x i16> %src, i32 noundef
;
; AVX512-LABEL: _mm512_mask_packus_epi32_manual:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
-; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm3
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm4 = [4,5,12,13,6,7,14,15]
-; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm4
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpmaxsd %zmm1, %zmm4, %zmm2
-; AVX512-NEXT: vpmaxsd %zmm1, %zmm3, %zmm1
-; AVX512-NEXT: vpmovusdw %zmm1, %ymm1
-; AVX512-NEXT: vpmovusdw %zmm2, %ymm2
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512-NEXT: kmovd %edi, %k1
-; AVX512-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; AVX512-NEXT: vpackusdw %zmm2, %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
%sh = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
%minv = tail call <32 x i32> @llvm.smax.v32i32(<32 x i32> %sh, <32 x i32> splat (i32 0))
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index 67507304d8bae..bdbb912a71d36 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -469,8 +469,7 @@ define void @store_v2f32_i2(i2 %trigger, ptr %addr, <2 x float> %val) nounwind {
; AVX1OR2-NEXT: andb $1, %dil
; AVX1OR2-NEXT: vmovd %edi, %xmm1
; AVX1OR2-NEXT: vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX1OR2-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX1OR2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[2,3]
+; AVX1OR2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; AVX1OR2-NEXT: vpslld $31, %xmm1, %xmm1
; AVX1OR2-NEXT: vmaskmovps %xmm0, %xmm1, (%rsi)
; AVX1OR2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index 3116649d85677..ecf4fbb603a8f 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -1808,9 +1808,9 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[2,3]
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
; AVX1-NEXT: retq
@@ -1819,9 +1819,9 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[2,3]
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi)
; AVX2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index b5dd7dc0ccaf4..f56dabc99f595 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -2607,9 +2607,9 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[2,3]
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [2147483647,2147483647]
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
@@ -2626,9 +2626,9 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[2,3]
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index e43ac65fd0e08..d214fb694252f 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -2284,9 +2284,9 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX1: # %bb.0:
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[2,3]
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -2298,9 +2298,9 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[2,3]
+; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/movbe.ll b/llvm/test/CodeGen/X86/movbe.ll
index 721823344867d..379cb8f31c3de 100644
--- a/llvm/test/CodeGen/X86/movbe.ll
+++ b/llvm/test/CodeGen/X86/movbe.ll
@@ -1,8 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=x86_64-linux -mcpu=atom < %s | FileCheck %s
; RUN: llc -mtriple=x86_64-linux -mcpu=slm < %s | FileCheck %s -check-prefix=SLM
-; RUN: llc -mtriple=x86_64-linux -mattr=+egpr,+ndd,+movbe --show-mc-encoding < %s | FileCheck %s -check-prefix=EGPR
-; RUN: llc -mtriple=x86_64-linux -mattr=+egpr,+ndd --show-mc-encoding < %s | FileCheck %s -check-prefix=NOMOVBE
+; RUN: llc -mtriple=x86_64-linux -mattr=+egpr,+ndd,+prefer-ndd-mem,+movbe < %s | FileCheck %s -check-prefix=EGPR
+; RUN: llc -mtriple=x86_64-linux -mattr=+egpr,+ndd,-prefer-ndd-mem,+movbe < %s | FileCheck %s -check-prefix=EGPR
+; RUN: llc -mtriple=x86_64-linux -mattr=+egpr,+ndd,+prefer-ndd-mem --show-mc-encoding < %s | FileCheck %s -check-prefix=NOMOVBE
declare i16 @llvm.bswap.i16(i16) nounwind readnone
declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone
@@ -26,8 +27,8 @@ define void @test1(ptr nocapture %x, i16 %y) nounwind {
;
; EGPR-LABEL: test1:
; EGPR: # %bb.0:
-; EGPR-NEXT: movbew %si, (%rdi) # EVEX TO LEGACY Compression encoding: [0x66,0x0f,0x38,0xf1,0x37]
-; EGPR-NEXT: retq # encoding: [0xc3]
+; EGPR-NEXT: movbew %si, (%rdi)
+; EGPR-NEXT: retq
;
; NOMOVBE-LABEL: test1:
; NOMOVBE: # %bb.0:
@@ -58,8 +59,8 @@ define i16 @test2(ptr %x) nounwind {
;
; EGPR-LABEL: test2:
; EGPR: # %bb.0:
-; EGPR-NEXT: movbew (%rdi), %ax # EVEX TO LEGACY Compression encoding: [0x66,0x0f,0x38,0xf0,0x07]
-; EGPR-NEXT: retq # encoding: [0xc3]
+; EGPR-NEXT: movbew (%rdi), %ax
+; EGPR-NEXT: retq
;
; NOMOVBE-LABEL: test2:
; NOMOVBE: # %bb.0:
@@ -89,8 +90,8 @@ define void @test3(ptr nocapture %x, i32 %y) nounwind {
;
; EGPR-LABEL: test3:
; EGPR: # %bb.0:
-; EGPR-NEXT: movbel %esi, (%rdi) # EVEX TO LEGACY Compression encoding: [0x0f,0x38,0xf1,0x37]
-; EGPR-NEXT: retq # encoding: [0xc3]
+; EGPR-NEXT: movbel %esi, (%rdi)
+; EGPR-NEXT: retq
;
; NOMOVBE-LABEL: test3:
; NOMOVBE: # %bb.0:
@@ -121,8 +122,8 @@ define i32 @test4(ptr %x) nounwind {
;
; EGPR-LABEL: test4:
; EGPR: # %bb.0:
-; EGPR-NEXT: movbel (%rdi), %eax # EVEX TO LEGACY Compression encoding: [0x0f,0x38,0xf0,0x07]
-; EGPR-NEXT: retq # encoding: [0xc3]
+; EGPR-NEXT: movbel (%rdi), %eax
+; EGPR-NEXT: retq
;
; NOMOVBE-LABEL: test4:
; NOMOVBE: # %bb.0:
@@ -153,8 +154,8 @@ define void @test5(ptr %x, i64 %y) nounwind {
;
; EGPR-LABEL: test5:
; EGPR: # %bb.0:
-; EGPR-NEXT: movbeq %rsi, (%rdi) # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x38,0xf1,0x37]
-; EGPR-NEXT: retq # encoding: [0xc3]
+; EGPR-NEXT: movbeq %rsi, (%rdi)
+; EGPR-NEXT: retq
;
; NOMOVBE-LABEL: test5:
; NOMOVBE: # %bb.0:
@@ -185,8 +186,8 @@ define i64 @test6(ptr %x) nounwind {
;
; EGPR-LABEL: test6:
; EGPR: # %bb.0:
-; EGPR-NEXT: movbeq (%rdi), %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0x38,0xf0,0x07]
-; EGPR-NEXT: retq # encoding: [0xc3]
+; EGPR-NEXT: movbeq (%rdi), %rax
+; EGPR-NEXT: retq
;
; NOMOVBE-LABEL: test6:
; NOMOVBE: # %bb.0:
@@ -217,8 +218,8 @@ define i64 @test7(i64 %x) nounwind {
;
; EGPR-LABEL: test7:
; EGPR: # %bb.0:
-; EGPR-NEXT: movbeq %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x08,0x61,0xf8]
-; EGPR-NEXT: retq # encoding: [0xc3]
+; EGPR-NEXT: movbeq %rdi, %rax
+; EGPR-NEXT: retq
;
; NOMOVBE-LABEL: test7:
; NOMOVBE: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/mul-lohi-no-implicit-copy.ll b/llvm/test/CodeGen/X86/mul-lohi-no-implicit-copy.ll
new file mode 100644
index 0000000000000..0135ec230cf28
--- /dev/null
+++ b/llvm/test/CodeGen/X86/mul-lohi-no-implicit-copy.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=-bmi2 | FileCheck %s
+;
+; For UMUL_LOHI lowering without BMI2, MUL/IMUL use RAX as an implicit source.
+; If one operand already lives in RAX (e.g. call result), avoid shuffling it out
+; and back before the multiply.
+
+declare i32 @foo32()
+declare i64 @foo64()
+
+define i32 @mul32_no_implicit_copy(i32 %a0) nounwind {
+; CHECK-LABEL: mul32_no_implicit_copy:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movl %edi, %ebx
+; CHECK-NEXT:    callq foo32@PLT
+; CHECK-NEXT: mull %ebx
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+ %a1 = call i32 @foo32()
+ %a2 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a0, i32 %a1)
+ %a3 = extractvalue { i32, i1 } %a2, 0
+ ret i32 %a3
+}
+
+define i64 @mul64_no_implicit_copy(i64 %a0) nounwind {
+; CHECK-LABEL: mul64_no_implicit_copy:
+; CHECK: # %bb.0:
+; CHECK-NEXT: pushq %rbx
+; CHECK-NEXT: movq %rdi, %rbx
+; CHECK-NEXT:    callq foo64@PLT
+; CHECK-NEXT: mulq %rbx
+; CHECK-NEXT: popq %rbx
+; CHECK-NEXT: retq
+ %a1 = call i64 @foo64()
+ %a2 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %a0, i64 %a1)
+ %a3 = extractvalue { i64, i1 } %a2, 0
+ ret i64 %a3
+}
diff --git a/llvm/test/CodeGen/X86/non-foldable-with-the-same-mask.mir b/llvm/test/CodeGen/X86/non-foldable-with-the-same-mask.mir
index 3281218848c0a..95cb460495512 100644
--- a/llvm/test/CodeGen/X86/non-foldable-with-the-same-mask.mir
+++ b/llvm/test/CodeGen/X86/non-foldable-with-the-same-mask.mir
@@ -21,9 +21,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQA32Z128rmkz:%[0-9]+]]:vr128x = VMOVDQA32Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VALIGNDZ128rmik:%[0-9]+]]:vr128 = VALIGNDZ128rmik [[AVX512_128_SET0_]], [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg, 1 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VALIGNDZ128rmik]]
+ ; CHECK-NEXT: [[VALIGNDZ128rrik:%[0-9]+]]:vr128 = VALIGNDZ128rrik [[AVX512_128_SET0_]], [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQA32Z128rmkz]], 1
+ ; CHECK-NEXT: $xmm0 = COPY [[VALIGNDZ128rrik]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -53,9 +54,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk16wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQU8Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU8Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPALIGNRZ128rmikz:%[0-9]+]]:vr128 = VPALIGNRZ128rmikz [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg, 4 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPALIGNRZ128rmikz]]
+ ; CHECK-NEXT: [[VPALIGNRZ128rrikz:%[0-9]+]]:vr128 = VPALIGNRZ128rrikz [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQU8Z128rmkz]], 4
+ ; CHECK-NEXT: $xmm0 = COPY [[VPALIGNRZ128rrikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk16wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -84,8 +86,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VBROADCASTSSZ128rmk:%[0-9]+]]:vr128 = VBROADCASTSSZ128rmk [[AVX512_128_SET0_]], [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VBROADCASTSSZ128rmk]]
+ ; CHECK-NEXT: [[VMOVAPSZ128rmkz:%[0-9]+]]:vr128x = VMOVAPSZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VBROADCASTSSZ128rrk:%[0-9]+]]:vr128 = VBROADCASTSSZ128rrk [[AVX512_128_SET0_]], [[COPY]], [[VMOVAPSZ128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VBROADCASTSSZ128rrk]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -113,8 +116,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk2wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VMOVDDUPZ128rmkz:%[0-9]+]]:vr128 = VMOVDDUPZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VMOVDDUPZ128rmkz]]
+ ; CHECK-NEXT: [[VMOVAPDZ128rmkz:%[0-9]+]]:vr128x = VMOVAPDZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VMOVDDUPZ128rrkz:%[0-9]+]]:vr128 = VMOVDDUPZ128rrkz [[COPY]], [[VMOVAPDZ128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VMOVDDUPZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk2wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -142,8 +146,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VMOVSHDUPZ128rmkz:%[0-9]+]]:vr128 = VMOVSHDUPZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VMOVSHDUPZ128rmkz]]
+ ; CHECK-NEXT: [[VMOVAPSZ128rmkz:%[0-9]+]]:vr128x = VMOVAPSZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VMOVSHDUPZ128rrkz:%[0-9]+]]:vr128 = VMOVSHDUPZ128rrkz [[COPY]], [[VMOVAPSZ128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VMOVSHDUPZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -171,8 +176,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPBROADCASTDZ128rmkz:%[0-9]+]]:vr128 = VPBROADCASTDZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPBROADCASTDZ128rmkz]]
+ ; CHECK-NEXT: [[VMOVDQA32Z128rmkz:%[0-9]+]]:vr128x = VMOVDQA32Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VPBROADCASTDZ128rrkz:%[0-9]+]]:vr128 = VPBROADCASTDZ128rrkz [[COPY]], [[VMOVDQA32Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPBROADCASTDZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -201,9 +207,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQU16Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU16Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VDBPSADBWZ128rmikz:%[0-9]+]]:vr128 = VDBPSADBWZ128rmikz [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg, 0 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VDBPSADBWZ128rmikz]]
+ ; CHECK-NEXT: [[VDBPSADBWZ128rrikz:%[0-9]+]]:vr128 = VDBPSADBWZ128rrikz [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQU16Z128rmkz]], 0
+ ; CHECK-NEXT: $xmm0 = COPY [[VDBPSADBWZ128rrikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk8wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -233,9 +240,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk16wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQU8Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU8Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VGF2P8AFFINEQBZ128rmikz:%[0-9]+]]:vr128 = VGF2P8AFFINEQBZ128rmikz [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg, 0 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VGF2P8AFFINEQBZ128rmikz]]
+ ; CHECK-NEXT: [[VGF2P8AFFINEQBZ128rrikz:%[0-9]+]]:vr128 = VGF2P8AFFINEQBZ128rrikz [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQU8Z128rmkz]], 0
+ ; CHECK-NEXT: $xmm0 = COPY [[VGF2P8AFFINEQBZ128rrikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk16wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -265,9 +273,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQU16Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU16Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VMPSADBWZ128rmikz:%[0-9]+]]:vr128 = VMPSADBWZ128rmikz [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg, 0 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VMPSADBWZ128rmikz]]
+ ; CHECK-NEXT: [[VMPSADBWZ128rrikz:%[0-9]+]]:vr128 = VMPSADBWZ128rrikz [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQU16Z128rmkz]], 0
+ ; CHECK-NEXT: $xmm0 = COPY [[VMPSADBWZ128rrikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk8wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -296,8 +305,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPCONFLICTDZ128rmk:%[0-9]+]]:vr128 = VPCONFLICTDZ128rmk [[AVX512_128_SET0_]], [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPCONFLICTDZ128rmk]]
+ ; CHECK-NEXT: [[VMOVDQA32Z128rmkz:%[0-9]+]]:vr128x = VMOVDQA32Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VPCONFLICTDZ128rrk:%[0-9]+]]:vr128 = VPCONFLICTDZ128rrk [[AVX512_128_SET0_]], [[COPY]], [[VMOVDQA32Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPCONFLICTDZ128rrk]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -326,9 +336,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk16wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQU8Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU8Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPMULTISHIFTQBZ128rmkz:%[0-9]+]]:vr128 = VPMULTISHIFTQBZ128rmkz [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPMULTISHIFTQBZ128rmkz]]
+ ; CHECK-NEXT: [[VPMULTISHIFTQBZ128rrkz:%[0-9]+]]:vr128 = VPMULTISHIFTQBZ128rrkz [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQU8Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPMULTISHIFTQBZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk16wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -357,8 +368,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VEXPANDPSZ128rmk:%[0-9]+]]:vr128 = VEXPANDPSZ128rmk [[AVX512_128_SET0_]], [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VEXPANDPSZ128rmk]]
+ ; CHECK-NEXT: [[VMOVAPSZ128rmkz:%[0-9]+]]:vr128x = VMOVAPSZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VEXPANDPSZ128rrk:%[0-9]+]]:vr128 = VEXPANDPSZ128rrk [[AVX512_128_SET0_]], [[COPY]], [[VMOVAPSZ128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VEXPANDPSZ128rrk]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -385,11 +397,12 @@ body: |
; CHECK-LABEL: name: test_vinserti32x4_same_mask
; CHECK: liveins: $rdi, $k1
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vk16wm = COPY $k1
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_256_SET0_:%[0-9]+]]:vr256x = AVX512_256_SET0
; CHECK-NEXT: [[AVX512_256_SET0_1:%[0-9]+]]:vr256x = AVX512_256_SET0
- ; CHECK-NEXT: [[VINSERTI32X4Z256rmikz:%[0-9]+]]:vr256 = VINSERTI32X4Z256rmikz [[COPY]], [[AVX512_256_SET0_1]], $rdi, 1, $noreg, 0, $noreg, 1 :: (load (s128))
- ; CHECK-NEXT: $ymm0 = COPY [[VINSERTI32X4Z256rmikz]]
+ ; CHECK-NEXT: [[VMOVDQA32Z128rmkz:%[0-9]+]]:vr128x = VMOVDQA32Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VINSERTI32X4Z256rrikz:%[0-9]+]]:vr256 = VINSERTI32X4Z256rrikz [[COPY]], [[AVX512_256_SET0_1]], [[VMOVDQA32Z128rmkz]], 1
+ ; CHECK-NEXT: $ymm0 = COPY [[VINSERTI32X4Z256rrikz]]
; CHECK-NEXT: RET 0, $ymm0
%0:vk4wm = COPY $k1
%1:vr256x = AVX512_256_SET0
@@ -419,9 +432,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQU16Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU16Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPACKSSDWZ128rmk:%[0-9]+]]:vr128 = VPACKSSDWZ128rmk [[AVX512_128_SET0_]], [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPACKSSDWZ128rmk]]
+ ; CHECK-NEXT: [[VPACKSSDWZ128rrk:%[0-9]+]]:vr128 = VPACKSSDWZ128rrk [[AVX512_128_SET0_]], [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQU16Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPACKSSDWZ128rrk]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk8wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -451,9 +465,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
; CHECK-NEXT: [[AVX512_256_SET0_:%[0-9]+]]:vr256x = AVX512_256_SET0
+ ; CHECK-NEXT: [[VMOVDQA32Z256rmkz:%[0-9]+]]:vr256x = VMOVDQA32Z256rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s256))
; CHECK-NEXT: [[AVX512_256_SET0_1:%[0-9]+]]:vr256x = AVX512_256_SET0
- ; CHECK-NEXT: [[VPERMDZ256rmk:%[0-9]+]]:vr256 = VPERMDZ256rmk [[AVX512_256_SET0_]], [[COPY]], [[AVX512_256_SET0_1]], $rdi, 1, $noreg, 0, $noreg :: (load (s256))
- ; CHECK-NEXT: $ymm0 = COPY [[VPERMDZ256rmk]]
+ ; CHECK-NEXT: [[VPERMDZ256rrk:%[0-9]+]]:vr256 = VPERMDZ256rrk [[AVX512_256_SET0_]], [[COPY]], [[AVX512_256_SET0_1]], [[VMOVDQA32Z256rmkz]]
+ ; CHECK-NEXT: $ymm0 = COPY [[VPERMDZ256rrk]]
; CHECK-NEXT: RET 0, $ymm0
%0:vk8wm = COPY $k1
%1:vr256x = AVX512_256_SET0
@@ -483,9 +498,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQA32Z128rmkz:%[0-9]+]]:vr128x = VMOVDQA32Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPERMI2DZ128rmkz:%[0-9]+]]:vr128x = VPERMI2DZ128rmkz [[AVX512_128_SET0_]], [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPERMI2DZ128rmkz]]
+ ; CHECK-NEXT: [[VPERMI2DZ128rrkz:%[0-9]+]]:vr128x = VPERMI2DZ128rrkz [[AVX512_128_SET0_]], [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQA32Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPERMI2DZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -512,8 +528,9 @@ body: |
; CHECK: liveins: $rdi, $k1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
- ; CHECK-NEXT: [[VPERMILPSZ128mikz:%[0-9]+]]:vr128 = VPERMILPSZ128mikz [[COPY]], $rdi, 1, $noreg, 0, $noreg, 27 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPERMILPSZ128mikz]]
+ ; CHECK-NEXT: [[VMOVAPSZ128rmkz:%[0-9]+]]:vr128x = VMOVAPSZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VPERMILPSZ128rikz:%[0-9]+]]:vr128 = VPERMILPSZ128rikz [[COPY]], [[VMOVAPSZ128rmkz]], 27
+ ; CHECK-NEXT: $xmm0 = COPY [[VPERMILPSZ128rikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = VMOVAPSZ128rmkz %0, $rdi, 1, $noreg, 0, $noreg :: (load (s128))
@@ -541,9 +558,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
; CHECK-NEXT: [[AVX512_256_SET0_:%[0-9]+]]:vr256x = AVX512_256_SET0
+ ; CHECK-NEXT: [[VMOVAPSZ256rmkz:%[0-9]+]]:vr256x = VMOVAPSZ256rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s256))
; CHECK-NEXT: [[AVX512_256_SET0_1:%[0-9]+]]:vr256x = AVX512_256_SET0
- ; CHECK-NEXT: [[VPERMPSZ256rmkz:%[0-9]+]]:vr256 = VPERMPSZ256rmkz [[COPY]], [[AVX512_256_SET0_1]], $rdi, 1, $noreg, 0, $noreg :: (load (s256))
- ; CHECK-NEXT: $ymm0 = COPY [[VPERMPSZ256rmkz]]
+ ; CHECK-NEXT: [[VPERMPSZ256rrkz:%[0-9]+]]:vr256 = VPERMPSZ256rrkz [[COPY]], [[AVX512_256_SET0_1]], [[VMOVAPSZ256rmkz]]
+ ; CHECK-NEXT: $ymm0 = COPY [[VPERMPSZ256rrkz]]
; CHECK-NEXT: RET 0, $ymm0
%0:vk8wm = COPY $k1
%1:vr256x = AVX512_256_SET0
@@ -573,9 +591,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQA32Z128rmkz:%[0-9]+]]:vr128x = VMOVDQA32Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPERMT2DZ128rmkz:%[0-9]+]]:vr128x = VPERMT2DZ128rmkz [[AVX512_128_SET0_]], [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPERMT2DZ128rmkz]]
+ ; CHECK-NEXT: [[VPERMT2DZ128rrkz:%[0-9]+]]:vr128x = VPERMT2DZ128rrkz [[AVX512_128_SET0_]], [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQA32Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPERMT2DZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -605,9 +624,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
+ ; CHECK-NEXT: [[VMOVDQU16Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU16Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
; CHECK-NEXT: [[AVX512_128_SET0_1:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPERMWZ128rmkz:%[0-9]+]]:vr128 = VPERMWZ128rmkz [[COPY]], [[AVX512_128_SET0_1]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPERMWZ128rmkz]]
+ ; CHECK-NEXT: [[VPERMWZ128rrkz:%[0-9]+]]:vr128 = VPERMWZ128rrkz [[COPY]], [[AVX512_128_SET0_1]], [[VMOVDQU16Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPERMWZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk8wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -634,8 +654,9 @@ body: |
; CHECK: liveins: $rdi, $k1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
- ; CHECK-NEXT: [[VPSHUFDZ128mikz:%[0-9]+]]:vr128 = VPSHUFDZ128mikz [[COPY]], $rdi, 1, $noreg, 0, $noreg, 27 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPSHUFDZ128mikz]]
+ ; CHECK-NEXT: [[VMOVDQA32Z128rmkz:%[0-9]+]]:vr128x = VMOVDQA32Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VPSHUFDZ128rikz:%[0-9]+]]:vr128 = VPSHUFDZ128rikz [[COPY]], [[VMOVDQA32Z128rmkz]], 27
+ ; CHECK-NEXT: $xmm0 = COPY [[VPSHUFDZ128rikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = VMOVDQA32Z128rmkz %0, $rdi, 1, $noreg, 0, $noreg :: (load (s128))
@@ -660,8 +681,9 @@ body: |
; CHECK: liveins: $rdi, $k1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
- ; CHECK-NEXT: [[VPSHUFHWZ128mikz:%[0-9]+]]:vr128 = VPSHUFHWZ128mikz [[COPY]], $rdi, 1, $noreg, 0, $noreg, 27 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPSHUFHWZ128mikz]]
+ ; CHECK-NEXT: [[VMOVDQU16Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU16Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VPSHUFHWZ128rikz:%[0-9]+]]:vr128 = VPSHUFHWZ128rikz [[COPY]], [[VMOVDQU16Z128rmkz]], 27
+ ; CHECK-NEXT: $xmm0 = COPY [[VPSHUFHWZ128rikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk8wm = COPY $k1
%1:vr128x = VMOVDQU16Z128rmkz %0, $rdi, 1, $noreg, 0, $noreg :: (load (s128))
@@ -686,8 +708,9 @@ body: |
; CHECK: liveins: $rdi, $k1
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
- ; CHECK-NEXT: [[VPSHUFLWZ128mikz:%[0-9]+]]:vr128 = VPSHUFLWZ128mikz [[COPY]], $rdi, 1, $noreg, 0, $noreg, 27 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPSHUFLWZ128mikz]]
+ ; CHECK-NEXT: [[VMOVDQU16Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU16Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VPSHUFLWZ128rikz:%[0-9]+]]:vr128 = VPSHUFLWZ128rikz [[COPY]], [[VMOVDQU16Z128rmkz]], 27
+ ; CHECK-NEXT: $xmm0 = COPY [[VPSHUFLWZ128rikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk8wm = COPY $k1
%1:vr128x = VMOVDQU16Z128rmkz %0, $rdi, 1, $noreg, 0, $noreg :: (load (s128))
@@ -715,9 +738,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk8wm = COPY $k1
; CHECK-NEXT: [[AVX512_256_SET0_:%[0-9]+]]:vr256x = AVX512_256_SET0
+ ; CHECK-NEXT: [[VMOVAPSZ256rmkz:%[0-9]+]]:vr256x = VMOVAPSZ256rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s256))
; CHECK-NEXT: [[AVX512_256_SET0_1:%[0-9]+]]:vr256x = AVX512_256_SET0
- ; CHECK-NEXT: [[VSHUFF32X4Z256rmikz:%[0-9]+]]:vr256 = VSHUFF32X4Z256rmikz [[COPY]], [[AVX512_256_SET0_1]], $rdi, 1, $noreg, 0, $noreg, 2 :: (load (s256))
- ; CHECK-NEXT: $ymm0 = COPY [[VSHUFF32X4Z256rmikz]]
+ ; CHECK-NEXT: [[VSHUFF32X4Z256rrikz:%[0-9]+]]:vr256 = VSHUFF32X4Z256rrikz [[COPY]], [[AVX512_256_SET0_1]], [[VMOVAPSZ256rmkz]], 2
+ ; CHECK-NEXT: $ymm0 = COPY [[VSHUFF32X4Z256rrikz]]
; CHECK-NEXT: RET 0, $ymm0
%0:vk8wm = COPY $k1
%1:vr256x = AVX512_256_SET0
@@ -746,8 +770,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk2wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VSHUFPDZ128rmikz:%[0-9]+]]:vr128 = VSHUFPDZ128rmikz [[COPY]], [[AVX512_128_SET0_]], $rdi, 1, $noreg, 0, $noreg, 1 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VSHUFPDZ128rmikz]]
+ ; CHECK-NEXT: [[VMOVAPDZ128rmkz:%[0-9]+]]:vr128x = VMOVAPDZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VSHUFPDZ128rrikz:%[0-9]+]]:vr128 = VSHUFPDZ128rrikz [[COPY]], [[AVX512_128_SET0_]], [[VMOVAPDZ128rmkz]], 1
+ ; CHECK-NEXT: $xmm0 = COPY [[VSHUFPDZ128rrikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk2wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -775,8 +800,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VSHUFPSZ128rmikz:%[0-9]+]]:vr128 = VSHUFPSZ128rmikz [[COPY]], [[AVX512_128_SET0_]], $rdi, 1, $noreg, 0, $noreg, 68 :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VSHUFPSZ128rmikz]]
+ ; CHECK-NEXT: [[VMOVAPSZ128rmkz:%[0-9]+]]:vr128x = VMOVAPSZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VSHUFPSZ128rrikz:%[0-9]+]]:vr128 = VSHUFPSZ128rrikz [[COPY]], [[AVX512_128_SET0_]], [[VMOVAPSZ128rmkz]], 68
+ ; CHECK-NEXT: $xmm0 = COPY [[VSHUFPSZ128rrikz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -804,8 +830,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk16wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPUNPCKHBWZ128rmkz:%[0-9]+]]:vr128 = VPUNPCKHBWZ128rmkz [[COPY]], [[AVX512_128_SET0_]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPUNPCKHBWZ128rmkz]]
+ ; CHECK-NEXT: [[VMOVDQU8Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU8Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VPUNPCKHBWZ128rrkz:%[0-9]+]]:vr128 = VPUNPCKHBWZ128rrkz [[COPY]], [[AVX512_128_SET0_]], [[VMOVDQU8Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPUNPCKHBWZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk16wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -833,8 +860,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk16wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VPUNPCKLBWZ128rmkz:%[0-9]+]]:vr128 = VPUNPCKLBWZ128rmkz [[COPY]], [[AVX512_128_SET0_]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VPUNPCKLBWZ128rmkz]]
+ ; CHECK-NEXT: [[VMOVDQU8Z128rmkz:%[0-9]+]]:vr128x = VMOVDQU8Z128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VPUNPCKLBWZ128rrkz:%[0-9]+]]:vr128 = VPUNPCKLBWZ128rrkz [[COPY]], [[AVX512_128_SET0_]], [[VMOVDQU8Z128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VPUNPCKLBWZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk16wm = COPY $k1
%1:vr128x = AVX512_128_SET0
@@ -862,8 +890,9 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:vk4wm = COPY $k1
; CHECK-NEXT: [[AVX512_128_SET0_:%[0-9]+]]:vr128x = AVX512_128_SET0
- ; CHECK-NEXT: [[VUNPCKLPSZ128rmkz:%[0-9]+]]:vr128 = VUNPCKLPSZ128rmkz [[COPY]], [[AVX512_128_SET0_]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
- ; CHECK-NEXT: $xmm0 = COPY [[VUNPCKLPSZ128rmkz]]
+ ; CHECK-NEXT: [[VMOVAPSZ128rmkz:%[0-9]+]]:vr128x = VMOVAPSZ128rmkz [[COPY]], $rdi, 1, $noreg, 0, $noreg :: (load (s128))
+ ; CHECK-NEXT: [[VUNPCKLPSZ128rrkz:%[0-9]+]]:vr128 = VUNPCKLPSZ128rrkz [[COPY]], [[AVX512_128_SET0_]], [[VMOVAPSZ128rmkz]]
+ ; CHECK-NEXT: $xmm0 = COPY [[VUNPCKLPSZ128rrkz]]
; CHECK-NEXT: RET 0, $xmm0
%0:vk4wm = COPY $k1
%1:vr128x = AVX512_128_SET0
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 55d386d0f0952..68ff419b0588c 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -70,6 +70,9 @@
; CHECK-NEXT: X86 Partial Reduction
; CHECK-NEXT: Expand indirectbr instructions
; CHECK-NEXT: Natural Loop Information
+; CHECK-NEXT: Post-Dominator Tree Construction
+; CHECK-NEXT: Branch Probability Analysis
+; CHECK-NEXT: Block Frequency Analysis
; CHECK-NEXT: CodeGen Prepare
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Exception handling preparation
diff --git a/llvm/test/CodeGen/X86/packss.ll b/llvm/test/CodeGen/X86/packss.ll
index da739dc277f68..91e4b9b463b0a 100644
--- a/llvm/test/CodeGen/X86/packss.ll
+++ b/llvm/test/CodeGen/X86/packss.ll
@@ -535,13 +535,7 @@ define <64 x i8> @_mm512_packss_epi16_manual(<32 x i16> %a, <32 x i16> %b) nounw
;
; AVX512-LABEL: _mm512_packss_epi16_manual:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
-; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
-; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512-NEXT: vpmovswb %zmm3, %ymm0
-; AVX512-NEXT: vpmovswb %zmm2, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpacksswb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: ret{{[l|q]}}
;
; X64-SSE-LABEL: _mm512_packss_epi16_manual:
@@ -688,13 +682,7 @@ define <32 x i16> @_mm512_packss_epi32_manual(<16 x i32> %a, <16 x i32> %b) noun
;
; AVX512-LABEL: _mm512_packss_epi32_manual:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [4,5,12,13,6,7,14,15]
-; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [0,1,8,9,2,3,10,11]
-; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512-NEXT: vpmovsdw %zmm3, %ymm0
-; AVX512-NEXT: vpmovsdw %zmm2, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpackssdw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: ret{{[l|q]}}
;
; X64-SSE-LABEL: _mm512_packss_epi32_manual:
diff --git a/llvm/test/CodeGen/X86/packus.ll b/llvm/test/CodeGen/X86/packus.ll
index 38fd914f1f947..e678991fcec78 100644
--- a/llvm/test/CodeGen/X86/packus.ll
+++ b/llvm/test/CodeGen/X86/packus.ll
@@ -649,16 +649,7 @@ define <64 x i8> @_mm512_packus_epi16_manual(<32 x i16> %a, <32 x i16> %b) nounw
;
; AVX512-LABEL: _mm512_packus_epi16_manual:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11]
-; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15]
-; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vpmaxsw %zmm0, %zmm3, %zmm1
-; AVX512-NEXT: vpmaxsw %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpmovuswb %zmm0, %ymm0
-; AVX512-NEXT: vpmovuswb %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpackuswb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: ret{{[l|q]}}
%sh = shufflevector <32 x i16> %a, <32 x i16> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
%minv = tail call <64 x i16> @llvm.smax.v64i16(<64 x i16> %sh, <64 x i16> splat (i16 0))
@@ -1169,16 +1160,7 @@ define <32 x i16> @_mm512_packus_epi32_manual(<16 x i32> %a, <16 x i32> %b) noun
;
; AVX512-LABEL: _mm512_packus_epi32_manual:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm2 = [0,1,8,9,2,3,10,11]
-; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512-NEXT: vpmovsxbq {{.*#+}} zmm3 = [4,5,12,13,6,7,14,15]
-; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
-; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
-; AVX512-NEXT: vpmaxsd %zmm0, %zmm3, %zmm1
-; AVX512-NEXT: vpmaxsd %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: vpmovusdw %zmm0, %ymm0
-; AVX512-NEXT: vpmovusdw %zmm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512-NEXT: vpackusdw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: ret{{[l|q]}}
%sh = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 20, i32 21, i32 22, i32 23, i32 8, i32 9, i32 10, i32 11, i32 24, i32 25, i32 26, i32 27, i32 12, i32 13, i32 14, i32 15, i32 28, i32 29, i32 30, i32 31>
%minv = tail call <32 x i32> @llvm.smax.v32i32(<32 x i32> %sh, <32 x i32> splat (i32 0))
diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll
index 01cd11b9e712c..bd73de7d50f1b 100644
--- a/llvm/test/CodeGen/X86/shift-i512.ll
+++ b/llvm/test/CodeGen/X86/shift-i512.ll
@@ -605,7 +605,6 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512F-NEXT: vmovdqu64 -128(%rsp,%rax), %zmm1
; AVX512F-NEXT: vpsrlq %xmm0, %zmm1, %zmm2
; AVX512F-NEXT: vpsraq $63, -72(%rsp,%rax){1to8}, %zmm3
-; AVX512F-NEXT: valignq {{.*#+}} zmm3 = zmm3[7,0,1,2,3,4,5,6]
; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7],zmm3[0]
; AVX512F-NEXT: vpaddq %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
@@ -639,20 +638,19 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: shrl $3, %ecx
-; AVX512VL-NEXT: andl $56, %ecx
-; AVX512VL-NEXT: vpsraq $63, -72(%rsp,%rcx){1to8}, %zmm0
-; AVX512VL-NEXT: vmovdqu64 -128(%rsp,%rcx), %zmm1
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,15]
-; AVX512VL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; AVX512VL-NEXT: vpaddq %zmm2, %zmm2, %zmm0
-; AVX512VL-NEXT: andl $63, %eax
-; AVX512VL-NEXT: vpbroadcastq %rax, %xmm2
-; AVX512VL-NEXT: vpandnq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm3
-; AVX512VL-NEXT: vpsllq %xmm3, %zmm0, %zmm0
-; AVX512VL-NEXT: vpsrlq %xmm2, %zmm1, %zmm1
+; AVX512VL-NEXT: andl $63, %ecx
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm0
+; AVX512VL-NEXT: shrl $3, %eax
+; AVX512VL-NEXT: andl $56, %eax
+; AVX512VL-NEXT: vmovdqu64 -128(%rsp,%rax), %zmm1
+; AVX512VL-NEXT: vpsrlq %xmm0, %zmm1, %zmm2
+; AVX512VL-NEXT: vpsraq $63, -72(%rsp,%rax){1to2}, %xmm3
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpandnq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm1, %zmm0
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512VL-NEXT: popq %rcx
@@ -663,34 +661,33 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: pushq %rax
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: sarq $63, %rdi
-; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vpbroadcastq %r10, %zmm0
-; AVX512VBMI-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10
-; AVX512VBMI-NEXT: shrl $3, %r10d
-; AVX512VBMI-NEXT: andl $56, %r10d
-; AVX512VBMI-NEXT: vpsraq $63, -72(%rsp,%r10){1to8}, %zmm1
-; AVX512VBMI-NEXT: vmovdqu64 -128(%rsp,%r10), %zmm2
-; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,2,3,4,5,6,7,15]
-; AVX512VBMI-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
-; AVX512VBMI-NEXT: vpshrdvq %zmm0, %zmm3, %zmm2
+; AVX512VBMI-NEXT: sarq $63, %r10
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vpbroadcastq %rdi, %zmm0
+; AVX512VBMI-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
+; AVX512VBMI-NEXT: shrl $3, %edi
+; AVX512VBMI-NEXT: andl $56, %edi
+; AVX512VBMI-NEXT: vpsraq $63, -72(%rsp,%rdi){1to2}, %xmm1
+; AVX512VBMI-NEXT: vmovdqu64 -128(%rsp,%rdi), %zmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm0, %zmm1, %zmm2
; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm2, 32(%rax)
; AVX512VBMI-NEXT: vmovdqu %ymm2, (%rax)
; AVX512VBMI-NEXT: popq %rcx
@@ -2247,64 +2244,51 @@ define i512 @shl_allbits_i512(i512 %a0) nounwind {
;
; AVX512F-LABEL: shl_allbits_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: movl %esi, %eax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
-; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512F-NEXT: shrl $6, %esi
-; AVX512F-NEXT: movl $-1, %eax
-; AVX512F-NEXT: shlxl %esi, %eax, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 {%k1} {z} = -1
-; AVX512F-NEXT: vpsllq %xmm2, %zmm3, %zmm2
-; AVX512F-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
-; AVX512F-NEXT: vpsrlq $1, %zmm1, %zmm1
-; AVX512F-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT: movq $-1, %rcx
+; AVX512F-NEXT: shlxq %rsi, %rcx, %rdx
+; AVX512F-NEXT: shrl $6, %esi
+; AVX512F-NEXT: movl $1, %edi
+; AVX512F-NEXT: shlxq %rsi, %rdi, %rdi
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: shlxq %rsi, %rcx, %rcx
+; AVX512F-NEXT: kmovw %ecx, %k2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k2} {z} = -1
+; AVX512F-NEXT: vpbroadcastq %rdx, %zmm0 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shl_allbits_i512:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movl %esi, %ecx
-; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm0
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
-; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT: movq $-1, %rcx
+; AVX512VL-NEXT: shlxq %rsi, %rcx, %rdx
; AVX512VL-NEXT: shrl $6, %esi
-; AVX512VL-NEXT: movl $-1, %ecx
-; AVX512VL-NEXT: shlxl %esi, %ecx, %ecx
-; AVX512VL-NEXT: kmovd %ecx, %k1
-; AVX512VL-NEXT: vpternlogq {{.*#+}} zmm3 {%k1} {z} = -1
-; AVX512VL-NEXT: vpsllq %xmm2, %zmm3, %zmm2
-; AVX512VL-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
-; AVX512VL-NEXT: vpsrlq $1, %zmm1, %zmm1
-; AVX512VL-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
-; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
-; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512VL-NEXT: movl $1, %edi
+; AVX512VL-NEXT: shlxq %rsi, %rdi, %rdi
+; AVX512VL-NEXT: kmovd %edi, %k1
+; AVX512VL-NEXT: shlxq %rsi, %rcx, %rcx
+; AVX512VL-NEXT: kmovd %ecx, %k2
+; AVX512VL-NEXT: vpternlogq {{.*#+}} zmm0 {%k2} {z} = -1
+; AVX512VL-NEXT: vpbroadcastq %rdx, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: shl_allbits_i512:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm0
-; AVX512VBMI-NEXT: movl %esi, %ecx
-; AVX512VBMI-NEXT: shrl $6, %ecx
-; AVX512VBMI-NEXT: movl $-1, %edx
-; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
-; AVX512VBMI-NEXT: kmovd %ecx, %k1
-; AVX512VBMI-NEXT: vpternlogq {{.*#+}} zmm1 {%k1} {z} = -1
-; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm1[0,1,2,3,4,5,6]
-; AVX512VBMI-NEXT: vpshldvq %zmm0, %zmm2, %zmm1
-; AVX512VBMI-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512VBMI-NEXT: movq $-1, %rcx
+; AVX512VBMI-NEXT: shlxq %rsi, %rcx, %rdx
+; AVX512VBMI-NEXT: shrl $6, %esi
+; AVX512VBMI-NEXT: movl $1, %edi
+; AVX512VBMI-NEXT: shlxq %rsi, %rdi, %rdi
+; AVX512VBMI-NEXT: kmovd %edi, %k1
+; AVX512VBMI-NEXT: shlxq %rsi, %rcx, %rcx
+; AVX512VBMI-NEXT: kmovd %ecx, %k2
+; AVX512VBMI-NEXT: vpternlogq {{.*#+}} zmm0 {%k2} {z} = -1
+; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm0 {%k1}
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%r = shl i512 -1, %a0
@@ -2418,48 +2402,35 @@ define i512 @lshr_allbits_i512(i512 %a0) nounwind {
;
; AVX512F-LABEL: lshr_allbits_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: movl %esi, %eax
-; AVX512F-NEXT: vmovq %rax, %xmm0
-; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
-; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX512F-NEXT: shrl $6, %esi
-; AVX512F-NEXT: movl $-1, %eax
-; AVX512F-NEXT: shlxl %esi, %eax, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1
-; AVX512F-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
-; AVX512F-NEXT: vpsrlq %xmm2, %zmm3, %zmm2
-; AVX512F-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
-; AVX512F-NEXT: vpaddq %zmm1, %zmm1, %zmm1
-; AVX512F-NEXT: vpsllq %xmm0, %zmm1, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: movq $-1, %rcx
+; AVX512F-NEXT: shrxq %rsi, %rcx, %rcx
+; AVX512F-NEXT: shrl $6, %esi
+; AVX512F-NEXT: movl $128, %edx
+; AVX512F-NEXT: shrxq %rsi, %rdx, %rdx
+; AVX512F-NEXT: kmovw %edx, %k1
+; AVX512F-NEXT: movl $255, %edx
+; AVX512F-NEXT: shrxq %rsi, %rdx, %rdx
+; AVX512F-NEXT: kmovw %edx, %k2
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 {%k2} {z} = -1
+; AVX512F-NEXT: vpbroadcastq %rcx, %zmm0 {%k1}
; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_allbits_i512:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movl %esi, %ecx
-; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm0
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT: movq $-1, %rcx
+; AVX512VL-NEXT: shrxq %rsi, %rcx, %rcx
; AVX512VL-NEXT: shrl $6, %esi
-; AVX512VL-NEXT: movl $-1, %ecx
-; AVX512VL-NEXT: shlxl %esi, %ecx, %ecx
-; AVX512VL-NEXT: kmovd %ecx, %k1
-; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
-; AVX512VL-NEXT: vpcompressq %zmm2, %zmm2 {%k1} {z}
-; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm3
-; AVX512VL-NEXT: vpsrlq %xmm3, %zmm2, %zmm3
-; AVX512VL-NEXT: vpandn %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
-; AVX512VL-NEXT: vpaddq %zmm1, %zmm1, %zmm1
-; AVX512VL-NEXT: vpsllq %xmm0, %zmm1, %zmm0
-; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: movl $128, %edx
+; AVX512VL-NEXT: shrxq %rsi, %rdx, %rdx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: movl $255, %edx
+; AVX512VL-NEXT: shrxq %rsi, %rdx, %rdx
+; AVX512VL-NEXT: kmovd %edx, %k2
+; AVX512VL-NEXT: vpternlogq {{.*#+}} zmm0 {%k2} {z} = -1
+; AVX512VL-NEXT: vpbroadcastq %rcx, %zmm0 {%k1}
; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
@@ -2467,17 +2438,17 @@ define i512 @lshr_allbits_i512(i512 %a0) nounwind {
; AVX512VBMI-LABEL: lshr_allbits_i512:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movl %esi, %ecx
-; AVX512VBMI-NEXT: shrl $6, %ecx
-; AVX512VBMI-NEXT: movl $-1, %edx
-; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
-; AVX512VBMI-NEXT: kmovd %ecx, %k1
-; AVX512VBMI-NEXT: vpternlogd {{.*#+}} zmm0 = -1
-; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
-; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm1
-; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm0[1,2,3,4,5,6,7],zmm2[0]
-; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT: movq $-1, %rcx
+; AVX512VBMI-NEXT: shrxq %rsi, %rcx, %rcx
+; AVX512VBMI-NEXT: shrl $6, %esi
+; AVX512VBMI-NEXT: movl $128, %edx
+; AVX512VBMI-NEXT: shrxq %rsi, %rdx, %rdx
+; AVX512VBMI-NEXT: kmovd %edx, %k1
+; AVX512VBMI-NEXT: movl $255, %edx
+; AVX512VBMI-NEXT: shrxq %rsi, %rdx, %rdx
+; AVX512VBMI-NEXT: kmovd %edx, %k2
+; AVX512VBMI-NEXT: vpternlogq {{.*#+}} zmm0 {%k2} {z} = -1
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm0 {%k1}
; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/tailcc-largecode.ll b/llvm/test/CodeGen/X86/tailcc-largecode.ll
index ac32f67fce9e6..a3762ce81030f 100644
--- a/llvm/test/CodeGen/X86/tailcc-largecode.ll
+++ b/llvm/test/CodeGen/X86/tailcc-largecode.ll
@@ -1,12 +1,20 @@
-; RUN: llc < %s -mtriple=x86_64-linux-gnu -code-model=large -enable-misched=false | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -code-model=large -enable-misched=false | FileCheck %s --check-prefixes=CHECK,JMP
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -code-model=large -enable-misched=false -mattr=jmpabs | FileCheck %s --check-prefixes=CHECK,JMPABS
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -code-model=large -enable-misched=false -relocation-model=pic | FileCheck %s --check-prefixes=CHECK,PIC
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -code-model=large -enable-misched=false -mattr=jmpabs -relocation-model=pic | FileCheck %s --check-prefixes=CHECK,PIC
declare tailcc i32 @callee(i32 %arg)
define tailcc i32 @directcall(i32 %arg) {
entry:
; This is the large code model, so &callee may not fit into the jmp
; instruction. Instead, stick it into a register.
-; CHECK: movabsq $callee, [[REGISTER:%r[a-z0-9]+]]
-; CHECK: jmpq *[[REGISTER]] # TAILCALL
+; JMP: movabsq $callee, [[REGISTER:%r[a-z0-9]+]]
+; JMP: jmpq *[[REGISTER]] # TAILCALL
+; JMPABS-NOT: movabsq
+; JMPABS: jmpabs $callee # TAILCALL
+; PIC: movabsq $_GLOBAL_OFFSET_TABLE_
+; PIC: movabsq $callee@GOT
+; PIC: jmpq *
%res = tail call tailcc i32 @callee(i32 %arg)
ret i32 %res
}
@@ -48,12 +56,15 @@ define tailcc i32 @direct_manyargs() {
; the stack argument and the return adjustment will change too.)
; CHECK: pushq
; Pass the stack argument.
+; PIC: movabsq $_GLOBAL_OFFSET_TABLE
; CHECK: movl $7, 16(%rsp)
; This is the large code model, so &manyargs_callee may not fit into
; the jmp instruction. Put it into a register which won't be clobbered
; while restoring callee-saved registers and won't be used for passing
; arguments.
-; CHECK: movabsq $manyargs_callee, %rax
+; JMP: movabsq $manyargs_callee, %rax
+; JMPABS-NOT: movabsq
+; PIC: movabsq $manyargs_callee@GOT
; Pass the register arguments, in the right registers.
; CHECK: movl $1, %edi
; CHECK: movl $2, %esi
@@ -64,7 +75,9 @@ define tailcc i32 @direct_manyargs() {
; Adjust the stack to "return".
; CHECK: popq
; And tail-call to the target.
-; CHECK: jmpq *%rax # TAILCALL
+; JMP: jmpq *%rax # TAILCALL
+; JMPABS: jmpabs $manyargs_callee # TAILCALL
+; PIC: jmpq *
%res = tail call tailcc i32 @manyargs_callee(i32 1, i32 2, i32 3, i32 4,
i32 5, i32 6, i32 7)
ret i32 %res
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
index 3d4af023de799..2cc65b111ace2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll
@@ -1083,6 +1083,17 @@ define <16 x i32> @combine_vcompressd_as_vmov(<16 x i32> %x) {
ret <16 x i32> %res
}
+; compress of repeated splat args
+define <16 x i32> @combine_vcompressd_splat(i16 %m) {
+; CHECK-LABEL: combine_vcompressd_splat:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpternlogd {{.*#+}} zmm0 = -1
+; CHECK-NEXT: ret{{[l|q]}}
+ %msk = bitcast i16 %m to <16 x i1>
+ %res = call <16 x i32> @llvm.x86.avx512.mask.compress.v16i32(<16 x i32> splat (i32 -1), <16 x i32> splat (i32 -1), <16 x i1> %msk)
+ ret <16 x i32> %res
+}
+
define <8 x i64> @PR179008(ptr %p0) {
; X86-AVX512F-LABEL: PR179008:
; X86-AVX512F: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index b012c7b7bea28..355e9c326c68e 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -517,18 +517,30 @@ define <4 x i32> @combine_bitwise_ops_test2c(<4 x i32> %a, <4 x i32> %b, <4 x i3
}
define <4 x i32> @combine_bitwise_ops_test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-; SSE-LABEL: combine_bitwise_ops_test3c:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm1, %xmm0
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
-; SSE-NEXT: retq
+; SSE2-LABEL: combine_bitwise_ops_test3c:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorps %xmm1, %xmm0
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test3c:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: xorps %xmm1, %xmm0
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test3c:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorps %xmm1, %xmm0
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; SSE41-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test3c:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
%shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
@@ -575,19 +587,32 @@ define <4 x i32> @combine_bitwise_ops_test5c(<4 x i32> %a, <4 x i32> %b, <4 x i3
}
define <4 x i32> @combine_bitwise_ops_test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
-; SSE-LABEL: combine_bitwise_ops_test6c:
-; SSE: # %bb.0:
-; SSE-NEXT: xorps %xmm1, %xmm0
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: combine_bitwise_ops_test6c:
+; SSE2: # %bb.0:
+; SSE2-NEXT: xorps %xmm1, %xmm0
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSSE3-LABEL: combine_bitwise_ops_test6c:
+; SSSE3: # %bb.0:
+; SSSE3-NEXT: xorps %xmm1, %xmm0
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[1,3]
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: combine_bitwise_ops_test6c:
+; SSE41: # %bb.0:
+; SSE41-NEXT: xorps %xmm1, %xmm0
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
+; SSE41-NEXT: retq
;
; AVX-LABEL: combine_bitwise_ops_test6c:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,1],xmm0[1,3]
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = zero,zero,xmm0[1,3]
; AVX-NEXT: retq
%shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
%shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
index 304508d9f86c4..a27e3318c59e8 100644
--- a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
+++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll
@@ -503,80 +503,6 @@ entry:
ret void
}
-declare float @llvm.sin.f32(float) readnone
-
-define float @call_sin_intrinsic() sanitize_numerical_stability {
-; CHECK-LABEL: @call_sin_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[R:%.*]] = call float @llvm.sin.f32(float 1.000000e+00)
-; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.sin.f64(double 1.000000e+00)
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
-; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
-; CHECK-NEXT: store i64 ptrtoint (ptr @call_sin_intrinsic to i64), ptr @__nsan_shadow_ret_tag, align 8
-; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
-; CHECK-NEXT: ret float [[R]]
-;
-entry:
- %r = call float @llvm.sin.f32(float 1.0)
- ret float %r
-}
-
-declare float @sinf(float)
-
-define float @call_sinf_libfunc() sanitize_numerical_stability {
-; CHECK-LABEL: @call_sinf_libfunc(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[R:%.*]] = call float @sinf(float 1.000000e+00) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.sin.f64(double 1.000000e+00)
-; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
-; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
-; CHECK-NEXT: store i64 ptrtoint (ptr @call_sinf_libfunc to i64), ptr @__nsan_shadow_ret_tag, align 8
-; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
-; CHECK-NEXT: ret float [[R]]
-;
-entry:
- %r = call float @sinf(float 1.0)
- ret float %r
-}
-
-declare double @sin(double)
-
-; FIXME: nsan uses `sin(double)` for fp128.
-define double @call_sin_libfunc() sanitize_numerical_stability {
-; DQQ-LABEL: @call_sin_libfunc(
-; DQQ-NEXT: entry:
-; DQQ-NEXT: [[R:%.*]] = call double @sin(double 1.000000e+00) #[[ATTR3]]
-; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.sin.f80(x86_fp80 0xK3FFF8000000000000000)
-; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
-; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
-; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
-; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
-; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
-; DQQ-NEXT: store i64 ptrtoint (ptr @call_sin_libfunc to i64), ptr @__nsan_shadow_ret_tag, align 8
-; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
-; DQQ-NEXT: ret double [[R]]
-;
-; DLQ-LABEL: @call_sin_libfunc(
-; DLQ-NEXT: entry:
-; DLQ-NEXT: [[R:%.*]] = call double @sin(double 1.000000e+00) #[[ATTR3]]
-; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.sin.f80(x86_fp80 0xK3FFF8000000000000000)
-; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
-; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
-; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
-; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
-; DLQ-NEXT: store i64 ptrtoint (ptr @call_sin_libfunc to i64), ptr @__nsan_shadow_ret_tag, align 8
-; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
-; DLQ-NEXT: ret double [[R]]
-;
-entry:
- %r = call double @sin(double 1.0)
- ret double %r
-}
-
declare double @frexp(double, i32*)
define double @call_frexp_libfunc_nointrinsic(double %0, i32* nocapture %1) sanitize_numerical_stability {
diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/intrinsics.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/intrinsics.ll
new file mode 100644
index 0000000000000..b2d67bc9816d8
--- /dev/null
+++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/intrinsics.ll
@@ -0,0 +1,1809 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=nsan -nsan-shadow-type-mapping=dqq -nsan-truncate-fcmp-eq=false -S %s | FileCheck %s --check-prefixes=CHECK,DQQ
+; RUN: opt -passes=nsan -nsan-shadow-type-mapping=dlq -nsan-truncate-fcmp-eq=false -S %s | FileCheck %s --check-prefixes=CHECK,DLQ
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; FIXME: nsan is currently using long double (e.g. sin.f80) instead of quad (sin.f128).
+
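Every function below follows the same instrumented shape: the original intrinsic call is kept, a shadow result is computed in the wider shadow type selected by the mapping (as the check prefixes show, float results get a double shadow under both mappings, while double results are checked against an fp128 shadow under DQQ via __nsan_internal_check_double_q and an x86_fp80 shadow under DLQ via __nsan_internal_check_double_l), the shadow is compared against the application value, resynchronized from it when the runtime check returns 1, and finally written to the shadow-return slot. Per the FIXME above, fp128 shadows are still computed through the f80 intrinsic and then widened. A minimal sketch of that shape for one f32 intrinsic, written out as plain IR rather than CHECK lines; the hypothetical function name and the types of the two shadow-return globals are assumptions made only so the snippet parses on its own:

  declare float @llvm.sqrt.f32(float)
  declare double @llvm.sqrt.f64(double)
  declare i32 @__nsan_internal_check_float_d(float, double, i32, i64)

  @__nsan_shadow_ret_tag = external global i64
  @__nsan_shadow_ret_ptr = external global [64 x i8]

  define float @instrumented_sketch() {
  entry:
    %r = call float @llvm.sqrt.f32(float 4.0)          ; application value
    %shadow = call double @llvm.sqrt.f64(double 4.0)   ; shadow value in the wider type
    %check = call i32 @__nsan_internal_check_float_d(float %r, double %shadow, i32 1, i64 0)
    %resync = icmp eq i32 %check, 1
    %wide = fpext float %r to double
    %final = select i1 %resync, double %wide, double %shadow ; resync shadow on mismatch
    store i64 ptrtoint (ptr @instrumented_sketch to i64), ptr @__nsan_shadow_ret_tag, align 8
    store double %final, ptr @__nsan_shadow_ret_ptr, align 8
    ret float %r
  }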
+;###############################################################
+;# sqrt #
+;###############################################################
+declare float @llvm.sqrt.f32(float) readnone
+declare double @llvm.sqrt.f64(double) readnone
+declare x86_fp80 @llvm.sqrt.f80(x86_fp80) readnone
+
+define float @call_sqrt_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_sqrt_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.sqrt.f32(float 4.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.sqrt.f64(double 4.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_sqrt_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.sqrt.f32(float 4.0)
+ ret float %r
+}
+
+define double @call_sqrt_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_sqrt_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.sqrt.f64(double 4.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.sqrt.f80(x86_fp80 0xK40018000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_sqrt_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_sqrt_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.sqrt.f64(double 4.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.sqrt.f80(x86_fp80 0xK40018000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_sqrt_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.sqrt.f64(double 4.0)
+ ret double %r
+}
+
+define x86_fp80 @call_sqrt_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_sqrt_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.sqrt.f80(x86_fp80 0xK40018000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.sqrt.f80(x86_fp80 0xK40018000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_sqrt_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.sqrt.f80(x86_fp80 0xK40018000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# powi #
+;###############################################################
+declare float @llvm.powi.f32(float, i32) readnone
+declare double @llvm.powi.f64(double, i32) readnone
+declare x86_fp80 @llvm.powi.f80(x86_fp80, i32) readnone
+
+define float @call_powi_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_powi_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.powi.f32.i32(float 2.000000e+00, i32 3)
+; CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.powi.f32.i32(float 2.000000e+00, i32 3)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext float [[TMP0]] to double
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], double [[TMP4]], double [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_powi_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.powi.f32(float 2.0, i32 3)
+ ret float %r
+}
+
+define double @call_powi_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_powi_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.powi.f64.i32(double 2.000000e+00, i32 3)
+; DQQ-NEXT: [[TMP0:%.*]] = call double @llvm.powi.f64.i32(double 2.000000e+00, i32 3)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext double [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_powi_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_powi_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.powi.f64.i32(double 2.000000e+00, i32 3)
+; DLQ-NEXT: [[TMP0:%.*]] = call double @llvm.powi.f64.i32(double 2.000000e+00, i32 3)
+; DLQ-NEXT: [[TMP1:%.*]] = fpext double [[TMP0]] to x86_fp80
+; DLQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP1]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DLQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], x86_fp80 [[TMP4]], x86_fp80 [[TMP1]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_powi_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.powi.f64(double 2.0, i32 3)
+ ret double %r
+}
+
+define x86_fp80 @call_powi_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_powi_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.powi.f80.i32(x86_fp80 0xK40008000000000000000, i32 3)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.powi.f80.i32(x86_fp80 0xK40008000000000000000, i32 3)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_powi_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.powi.f80(x86_fp80 0xK40008000000000000000, i32 3)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# sin #
+;###############################################################
+declare float @llvm.sin.f32(float) readnone
+declare double @llvm.sin.f64(double) readnone
+declare x86_fp80 @llvm.sin.f80(x86_fp80) readnone
+
+define float @call_sin_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_sin_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.sin.f32(float 1.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.sin.f64(double 1.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_sin_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.sin.f32(float 1.0)
+ ret float %r
+}
+
+define double @call_sin_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_sin_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.sin.f64(double 1.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.sin.f80(x86_fp80 0xK3FFF8000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_sin_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_sin_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.sin.f64(double 1.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.sin.f80(x86_fp80 0xK3FFF8000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_sin_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.sin.f64(double 1.0)
+ ret double %r
+}
+
+define x86_fp80 @call_sin_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_sin_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.sin.f80(x86_fp80 0xK3FFF8000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.sin.f80(x86_fp80 0xK3FFF8000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_sin_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.sin.f80(x86_fp80 0xK3FFF8000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# cos #
+;###############################################################
+declare float @llvm.cos.f32(float) readnone
+declare double @llvm.cos.f64(double) readnone
+declare x86_fp80 @llvm.cos.f80(x86_fp80) readnone
+
+define float @call_cos_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_cos_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.cos.f32(float 1.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.cos.f64(double 1.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_cos_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.cos.f32(float 1.0)
+ ret float %r
+}
+
+define double @call_cos_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_cos_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.cos.f64(double 1.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.cos.f80(x86_fp80 0xK3FFF8000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_cos_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_cos_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.cos.f64(double 1.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.cos.f80(x86_fp80 0xK3FFF8000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_cos_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.cos.f64(double 1.0)
+ ret double %r
+}
+
+define x86_fp80 @call_cos_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_cos_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.cos.f80(x86_fp80 0xK3FFF8000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.cos.f80(x86_fp80 0xK3FFF8000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_cos_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.cos.f80(x86_fp80 0xK3FFF8000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# pow #
+;###############################################################
+declare float @llvm.pow.f32(float, float) readnone
+declare double @llvm.pow.f64(double, double) readnone
+declare x86_fp80 @llvm.pow.f80(x86_fp80, x86_fp80) readnone
+
+define float @call_pow_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_pow_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.pow.f32(float 2.000000e+00, float 3.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.pow.f64(double 2.000000e+00, double 3.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_pow_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.pow.f32(float 2.0, float 3.0)
+ ret float %r
+}
+
+define double @call_pow_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_pow_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.pow.f64(double 2.000000e+00, double 3.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.pow.f80(x86_fp80 0xK40008000000000000000, x86_fp80 0xK4000C000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_pow_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_pow_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.pow.f64(double 2.000000e+00, double 3.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.pow.f80(x86_fp80 0xK40008000000000000000, x86_fp80 0xK4000C000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_pow_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.pow.f64(double 2.0, double 3.0)
+ ret double %r
+}
+
+define x86_fp80 @call_pow_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_pow_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.pow.f80(x86_fp80 0xK40008000000000000000, x86_fp80 0xK40018000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.pow.f80(x86_fp80 0xK40008000000000000000, x86_fp80 0xK40018000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_pow_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.pow.f80(x86_fp80 0xK40008000000000000000, x86_fp80 0xK40018000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# exp #
+;###############################################################
+declare float @llvm.exp.f32(float) readnone
+declare double @llvm.exp.f64(double) readnone
+declare x86_fp80 @llvm.exp.f80(x86_fp80) readnone
+
+define float @call_exp_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_exp_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.exp.f32(float 1.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.exp.f64(double 1.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_exp_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.exp.f32(float 1.0)
+ ret float %r
+}
+
+define double @call_exp_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_exp_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.exp.f64(double 1.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.exp.f80(x86_fp80 0xK3FFF8000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_exp_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_exp_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.exp.f64(double 1.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.exp.f80(x86_fp80 0xK3FFF8000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_exp_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.exp.f64(double 1.0)
+ ret double %r
+}
+
+define x86_fp80 @call_exp_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_exp_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.exp.f80(x86_fp80 0xK3FFF8000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.exp.f80(x86_fp80 0xK3FFF8000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_exp_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.exp.f80(x86_fp80 0xK3FFF8000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# exp2 #
+;###############################################################
+declare float @llvm.exp2.f32(float) readnone
+declare double @llvm.exp2.f64(double) readnone
+declare x86_fp80 @llvm.exp2.f80(x86_fp80) readnone
+
+define float @call_exp2_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_exp2_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.exp2.f32(float 1.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.exp2.f64(double 1.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_exp2_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.exp2.f32(float 1.0)
+ ret float %r
+}
+
+define double @call_exp2_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_exp2_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.exp2.f64(double 1.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.exp2.f80(x86_fp80 0xK3FFF8000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_exp2_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_exp2_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.exp2.f64(double 1.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.exp2.f80(x86_fp80 0xK3FFF8000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_exp2_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.exp2.f64(double 1.0)
+ ret double %r
+}
+
+define x86_fp80 @call_exp2_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_exp2_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.exp2.f80(x86_fp80 0xK3FFF8000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.exp2.f80(x86_fp80 0xK3FFF8000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_exp2_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.exp2.f80(x86_fp80 0xK3FFF8000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# log #
+;###############################################################
+declare float @llvm.log.f32(float) readnone
+declare double @llvm.log.f64(double) readnone
+declare x86_fp80 @llvm.log.f80(x86_fp80) readnone
+
+define float @call_log_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_log_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.log.f32(float 2.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.log.f64(double 2.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_log_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.log.f32(float 2.0)
+ ret float %r
+}
+
+define double @call_log_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_log_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.log.f64(double 2.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log.f80(x86_fp80 0xK40008000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_log_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_log_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.log.f64(double 2.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log.f80(x86_fp80 0xK40008000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_log_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.log.f64(double 2.0)
+ ret double %r
+}
+
+define x86_fp80 @call_log_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_log_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.log.f80(x86_fp80 0xK40008000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log.f80(x86_fp80 0xK40008000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_log_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.log.f80(x86_fp80 0xK40008000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# log10 #
+;###############################################################
+declare float @llvm.log10.f32(float) readnone
+declare double @llvm.log10.f64(double) readnone
+declare x86_fp80 @llvm.log10.f80(x86_fp80) readnone
+
+define float @call_log10_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_log10_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.log10.f32(float 1.000000e+02)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.log10.f64(double 1.000000e+02)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_log10_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.log10.f32(float 100.0)
+ ret float %r
+}
+
+define double @call_log10_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_log10_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.log10.f64(double 1.000000e+02)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log10.f80(x86_fp80 0xK4005C800000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_log10_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_log10_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.log10.f64(double 1.000000e+02)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log10.f80(x86_fp80 0xK4005C800000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_log10_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.log10.f64(double 100.0)
+ ret double %r
+}
+
+define x86_fp80 @call_log10_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_log10_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.log10.f80(x86_fp80 0xK7FFF0000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log10.f80(x86_fp80 0xK7FFF0000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_log10_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.log10.f80(x86_fp80 0xK40590000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# log2 #
+;###############################################################
+declare float @llvm.log2.f32(float) readnone
+declare double @llvm.log2.f64(double) readnone
+declare x86_fp80 @llvm.log2.f80(x86_fp80) readnone
+
+define float @call_log2_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_log2_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.log2.f32(float 8.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.log2.f64(double 8.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_log2_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.log2.f32(float 8.0)
+ ret float %r
+}
+
+define double @call_log2_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_log2_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.log2.f64(double 8.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log2.f80(x86_fp80 0xK40028000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_log2_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_log2_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.log2.f64(double 8.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log2.f80(x86_fp80 0xK40028000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_log2_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.log2.f64(double 8.0)
+ ret double %r
+}
+
+define x86_fp80 @call_log2_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_log2_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.log2.f80(x86_fp80 0xK40018000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log2.f80(x86_fp80 0xK40018000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_log2_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.log2.f80(x86_fp80 0xK40018000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# fma #
+;###############################################################
+declare float @llvm.fma.f32(float, float, float) readnone
+declare double @llvm.fma.f64(double, double, double) readnone
+declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) readnone
+
+define float @call_fma_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_fma_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.fma.f32(float 1.000000e+00, float 2.000000e+00, float 3.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fma.f64(double 1.000000e+00, double 2.000000e+00, double 3.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_fma_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.fma.f32(float 1.0, float 2.0, float 3.0)
+ ret float %r
+}
+
+define double @call_fma_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_fma_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.fma.f64(double 1.000000e+00, double 2.000000e+00, double 3.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.fma.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000, x86_fp80 0xK4000C000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_fma_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_fma_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.fma.f64(double 1.000000e+00, double 2.000000e+00, double 3.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.fma.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000, x86_fp80 0xK4000C000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_fma_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.fma.f64(double 1.0, double 2.0, double 3.0)
+ ret double %r
+}
+
+define x86_fp80 @call_fma_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_fma_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.fma.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000, x86_fp80 0xK40018000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.fma.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000, x86_fp80 0xK40018000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_fma_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.fma.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000, x86_fp80 0xK40018000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# fmuladd #
+;###############################################################
+declare float @llvm.fmuladd.f32(float, float, float) readnone
+declare double @llvm.fmuladd.f64(double, double, double) readnone
+declare x86_fp80 @llvm.fmuladd.f80(x86_fp80, x86_fp80, x86_fp80) readnone
+
+define float @call_fmuladd_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_fmuladd_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.fmuladd.f32(float 1.000000e+00, float 2.000000e+00, float 3.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fmuladd.f64(double 1.000000e+00, double 2.000000e+00, double 3.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_fmuladd_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.fmuladd.f32(float 1.0, float 2.0, float 3.0)
+ ret float %r
+}
+
+define double @call_fmuladd_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_fmuladd_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.fmuladd.f64(double 1.000000e+00, double 2.000000e+00, double 3.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.fmuladd.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000, x86_fp80 0xK4000C000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_fmuladd_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_fmuladd_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.fmuladd.f64(double 1.000000e+00, double 2.000000e+00, double 3.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.fmuladd.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000, x86_fp80 0xK4000C000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_fmuladd_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.fmuladd.f64(double 1.0, double 2.0, double 3.0)
+ ret double %r
+}
+
+define x86_fp80 @call_fmuladd_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_fmuladd_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.fmuladd.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000, x86_fp80 0xK40018000000000000000)
+; CHECK-NEXT: [[TMP6:%.*]] = call x86_fp80 @llvm.fmuladd.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000, x86_fp80 0xK40018000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP6]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[TMP0]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_fmuladd_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[TMP0]]
+;
+entry:
+ %r = call x86_fp80 @llvm.fmuladd.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000, x86_fp80 0xK40018000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# fabs #
+;###############################################################
+declare float @llvm.fabs.f32(float) readnone
+declare double @llvm.fabs.f64(double) readnone
+declare x86_fp80 @llvm.fabs.f80(x86_fp80) readnone
+
+define float @call_fabs_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_fabs_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.fabs.f32(float -1.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fabs.f64(double -1.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_fabs_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.fabs.f32(float -1.0)
+ ret float %r
+}
+
+define double @call_fabs_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_fabs_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.fabs.f64(double -1.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.fabs.f80(x86_fp80 0xKBFFF8000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_fabs_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_fabs_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.fabs.f64(double -1.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.fabs.f80(x86_fp80 0xKBFFF8000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_fabs_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.fabs.f64(double -1.0)
+ ret double %r
+}
+
+define x86_fp80 @call_fabs_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_fabs_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.fabs.f80(x86_fp80 0xKBFFF8000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.fabs.f80(x86_fp80 0xKBFFF8000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_fabs_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.fabs.f80(x86_fp80 0xKBFFF8000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# minnum #
+;###############################################################
+declare float @llvm.minnum.f32(float, float) readnone
+declare double @llvm.minnum.f64(double, double) readnone
+declare x86_fp80 @llvm.minnum.f80(x86_fp80, x86_fp80) readnone
+
+define float @call_minnum_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_minnum_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.minnum.f32(float 1.000000e+00, float 2.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.minnum.f64(double 1.000000e+00, double 2.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_minnum_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.minnum.f32(float 1.0, float 2.0)
+ ret float %r
+}
+
+define double @call_minnum_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_minnum_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.minnum.f64(double 1.000000e+00, double 2.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.minnum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_minnum_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_minnum_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.minnum.f64(double 1.000000e+00, double 2.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.minnum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_minnum_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.minnum.f64(double 1.0, double 2.0)
+ ret double %r
+}
+
+define x86_fp80 @call_minnum_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_minnum_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.minnum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.minnum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_minnum_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.minnum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# maxnum #
+;###############################################################
+declare float @llvm.maxnum.f32(float, float) readnone
+declare double @llvm.maxnum.f64(double, double) readnone
+declare x86_fp80 @llvm.maxnum.f80(x86_fp80, x86_fp80) readnone
+
+define float @call_maxnum_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_maxnum_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.maxnum.f32(float 1.000000e+00, float 2.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.maxnum.f64(double 1.000000e+00, double 2.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_maxnum_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.maxnum.f32(float 1.0, float 2.0)
+ ret float %r
+}
+
+define double @call_maxnum_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_maxnum_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.maxnum.f64(double 1.000000e+00, double 2.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.maxnum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_maxnum_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_maxnum_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.maxnum.f64(double 1.000000e+00, double 2.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.maxnum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_maxnum_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.maxnum.f64(double 1.0, double 2.0)
+ ret double %r
+}
+
+define x86_fp80 @call_maxnum_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_maxnum_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.maxnum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.maxnum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_maxnum_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.maxnum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# minimum #
+;###############################################################
+declare float @llvm.minimum.f32(float, float) readnone
+declare double @llvm.minimum.f64(double, double) readnone
+declare x86_fp80 @llvm.minimum.f80(x86_fp80, x86_fp80) readnone
+
+define float @call_minimum_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_minimum_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.minimum.f32(float 1.000000e+00, float 2.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.minimum.f64(double 1.000000e+00, double 2.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_minimum_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.minimum.f32(float 1.0, float 2.0)
+ ret float %r
+}
+
+define double @call_minimum_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_minimum_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.minimum.f64(double 1.000000e+00, double 2.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.minimum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_minimum_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_minimum_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.minimum.f64(double 1.000000e+00, double 2.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.minimum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_minimum_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.minimum.f64(double 1.0, double 2.0)
+ ret double %r
+}
+
+define x86_fp80 @call_minimum_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_minimum_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.minimum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.minimum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_minimum_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.minimum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# maximum #
+;###############################################################
+declare float @llvm.maximum.f32(float, float) readnone
+declare double @llvm.maximum.f64(double, double) readnone
+declare x86_fp80 @llvm.maximum.f80(x86_fp80, x86_fp80) readnone
+
+define float @call_maximum_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_maximum_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.maximum.f32(float 1.000000e+00, float 2.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.maximum.f64(double 1.000000e+00, double 2.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_maximum_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.maximum.f32(float 1.0, float 2.0)
+ ret float %r
+}
+
+define double @call_maximum_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_maximum_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.maximum.f64(double 1.000000e+00, double 2.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.maximum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_maximum_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_maximum_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.maximum.f64(double 1.000000e+00, double 2.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.maximum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_maximum_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.maximum.f64(double 1.0, double 2.0)
+ ret double %r
+}
+
+define x86_fp80 @call_maximum_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_maximum_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.maximum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.maximum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_maximum_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.maximum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# copysign #
+;###############################################################
+declare float @llvm.copysign.f32(float, float) readnone
+declare double @llvm.copysign.f64(double, double) readnone
+declare x86_fp80 @llvm.copysign.f80(x86_fp80, x86_fp80) readnone
+
+define float @call_copysign_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_copysign_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.copysign.f32(float 1.000000e+00, float -2.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.copysign.f64(double 1.000000e+00, double -2.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_copysign_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.copysign.f32(float 1.0, float -2.0)
+ ret float %r
+}
+
+define double @call_copysign_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_copysign_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.copysign.f64(double 1.000000e+00, double -2.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.copysign.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xKC0008000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_copysign_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_copysign_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.copysign.f64(double 1.000000e+00, double -2.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.copysign.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xKC0008000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_copysign_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.copysign.f64(double 1.0, double -2.0)
+ ret double %r
+}
+
+define x86_fp80 @call_copysign_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_copysign_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.copysign.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xKBFFF8000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.copysign.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xKBFFF8000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_copysign_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.copysign.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xKBFFF8000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# floor #
+;###############################################################
+declare float @llvm.floor.f32(float) readnone
+declare double @llvm.floor.f64(double) readnone
+declare x86_fp80 @llvm.floor.f80(x86_fp80) readnone
+
+define float @call_floor_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_floor_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.floor.f32(float 1.500000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.floor.f64(double 1.500000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_floor_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.floor.f32(float 1.5)
+ ret float %r
+}
+
+define double @call_floor_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_floor_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.floor.f64(double 1.500000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.floor.f80(x86_fp80 0xK3FFFC000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_floor_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_floor_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.floor.f64(double 1.500000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.floor.f80(x86_fp80 0xK3FFFC000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_floor_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.floor.f64(double 1.5)
+ ret double %r
+}
+
+define x86_fp80 @call_floor_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_floor_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.floor.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.floor.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_floor_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.floor.f80(x86_fp80 0xK3FFFC000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# ceil #
+;###############################################################
+declare float @llvm.ceil.f32(float) readnone
+declare double @llvm.ceil.f64(double) readnone
+declare x86_fp80 @llvm.ceil.f80(x86_fp80) readnone
+
+define float @call_ceil_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_ceil_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.ceil.f32(float 1.500000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.ceil.f64(double 1.500000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_ceil_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.ceil.f32(float 1.5)
+ ret float %r
+}
+
+define double @call_ceil_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_ceil_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.ceil.f64(double 1.500000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.ceil.f80(x86_fp80 0xK3FFFC000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_ceil_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_ceil_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.ceil.f64(double 1.500000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.ceil.f80(x86_fp80 0xK3FFFC000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_ceil_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.ceil.f64(double 1.5)
+ ret double %r
+}
+
+define x86_fp80 @call_ceil_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_ceil_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.ceil.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.ceil.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_ceil_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.ceil.f80(x86_fp80 0xK3FFFC000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# trunc #
+;###############################################################
+declare float @llvm.trunc.f32(float) readnone
+declare double @llvm.trunc.f64(double) readnone
+declare x86_fp80 @llvm.trunc.f80(x86_fp80) readnone
+
+define float @call_trunc_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_trunc_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.trunc.f32(float 1.500000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.trunc.f64(double 1.500000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_trunc_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.trunc.f32(float 1.5)
+ ret float %r
+}
+
+define double @call_trunc_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_trunc_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.trunc.f64(double 1.500000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.trunc.f80(x86_fp80 0xK3FFFC000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_trunc_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_trunc_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.trunc.f64(double 1.500000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.trunc.f80(x86_fp80 0xK3FFFC000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_trunc_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.trunc.f64(double 1.5)
+ ret double %r
+}
+
+define x86_fp80 @call_trunc_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_trunc_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.trunc.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.trunc.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_trunc_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.trunc.f80(x86_fp80 0xK3FFFC000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# rint #
+;###############################################################
+declare float @llvm.rint.f32(float) readnone
+declare double @llvm.rint.f64(double) readnone
+declare x86_fp80 @llvm.rint.f80(x86_fp80) readnone
+
+define float @call_rint_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_rint_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.rint.f32(float 1.500000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.rint.f64(double 1.500000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_rint_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.rint.f32(float 1.5)
+ ret float %r
+}
+
+define double @call_rint_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_rint_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.rint.f64(double 1.500000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.rint.f80(x86_fp80 0xK3FFFC000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_rint_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_rint_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.rint.f64(double 1.500000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.rint.f80(x86_fp80 0xK3FFFC000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_rint_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.rint.f64(double 1.5)
+ ret double %r
+}
+
+define x86_fp80 @call_rint_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_rint_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.rint.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.rint.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_rint_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.rint.f80(x86_fp80 0xK3FFFC000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# nearbyint #
+;###############################################################
+declare float @llvm.nearbyint.f32(float) readnone
+declare double @llvm.nearbyint.f64(double) readnone
+declare x86_fp80 @llvm.nearbyint.f80(x86_fp80) readnone
+
+define float @call_nearbyint_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_nearbyint_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.nearbyint.f32(float 1.500000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.nearbyint.f64(double 1.500000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_nearbyint_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.nearbyint.f32(float 1.5)
+ ret float %r
+}
+
+define double @call_nearbyint_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_nearbyint_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.nearbyint.f64(double 1.500000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.nearbyint.f80(x86_fp80 0xK3FFFC000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_nearbyint_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_nearbyint_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.nearbyint.f64(double 1.500000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.nearbyint.f80(x86_fp80 0xK3FFFC000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_nearbyint_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.nearbyint.f64(double 1.5)
+ ret double %r
+}
+
+define x86_fp80 @call_nearbyint_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_nearbyint_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.nearbyint.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.nearbyint.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_nearbyint_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.nearbyint.f80(x86_fp80 0xK3FFFC000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# round #
+;###############################################################
+declare float @llvm.round.f32(float) readnone
+declare double @llvm.round.f64(double) readnone
+declare x86_fp80 @llvm.round.f80(x86_fp80) readnone
+
+define float @call_round_f32() sanitize_numerical_stability {
+; CHECK-LABEL: @call_round_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.round.f32(float 1.500000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.round.f64(double 1.500000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_round_f32 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.round.f32(float 1.5)
+ ret float %r
+}
+
+define double @call_round_f64() sanitize_numerical_stability {
+; DQQ-LABEL: @call_round_f64(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @llvm.round.f64(double 1.500000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.round.f80(x86_fp80 0xK3FFFC000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_round_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_round_f64(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @llvm.round.f64(double 1.500000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.round.f80(x86_fp80 0xK3FFFC000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_round_f64 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @llvm.round.f64(double 1.5)
+ ret double %r
+}
+
+define x86_fp80 @call_round_f80() sanitize_numerical_stability {
+; CHECK-LABEL: @call_round_f80(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @llvm.round.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.round.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_round_f80 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @llvm.round.f80(x86_fp80 0xK3FFFC000000000000000)
+ ret x86_fp80 %r
+}
diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/libfuncs.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/libfuncs.ll
new file mode 100644
index 0000000000000..e9ef2104fd370
--- /dev/null
+++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/libfuncs.ll
@@ -0,0 +1,1432 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=nsan -nsan-shadow-type-mapping=dqq -nsan-truncate-fcmp-eq=false -S %s | FileCheck %s --check-prefixes=CHECK,DQQ
+; RUN: opt -passes=nsan -nsan-shadow-type-mapping=dlq -nsan-truncate-fcmp-eq=false -S %s | FileCheck %s --check-prefixes=CHECK,DLQ
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; FIXME: nsan is currently using long double (e.g. sin.f80) instead of quad (sin.f128).
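+;
+; Note (illustrative summary, inferred from the CHECK lines below; not part of
+; the autogenerated assertions): -nsan-shadow-type-mapping selects the shadow
+; type used for each original floating-point type. With dqq, a double value is
+; shadowed by fp128 and verified via @__nsan_internal_check_double_q; with dlq
+; it is shadowed by x86_fp80 and verified via @__nsan_internal_check_double_l.
+; In both mappings float is shadowed by double and x86_fp80 by fp128, which is
+; why only the *_f64 tests need separate DQQ/DLQ check prefixes while the f32
+; and f80 tests share the common CHECK prefix.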
+
+;###############################################################
+;# sqrt #
+;###############################################################
+declare float @sqrtf(float)
+declare double @sqrt(double)
+declare x86_fp80 @sqrtl(x86_fp80)
+
+define float @call_sqrtf() sanitize_numerical_stability {
+; CHECK-LABEL: @call_sqrtf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @sqrtf(float 4.000000e+00) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.sqrt.f64(double 4.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_sqrtf to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @sqrtf(float 4.0)
+ ret float %r
+}
+
+define double @call_sqrt() sanitize_numerical_stability {
+; DQQ-LABEL: @call_sqrt(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @sqrt(double 4.000000e+00) #[[ATTR3]]
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.sqrt.f80(x86_fp80 0xK40018000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_sqrt to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_sqrt(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @sqrt(double 4.000000e+00) #[[ATTR3]]
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.sqrt.f80(x86_fp80 0xK40018000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_sqrt to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @sqrt(double 4.0)
+ ret double %r
+}
+
+define x86_fp80 @call_sqrtl() sanitize_numerical_stability {
+; CHECK-LABEL: @call_sqrtl(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @sqrtl(x86_fp80 0xK40018000000000000000) #[[ATTR3]]
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.sqrt.f80(x86_fp80 0xK40018000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_sqrtl to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @sqrtl(x86_fp80 0xK40018000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# sin #
+;###############################################################
+declare float @sinf(float)
+declare double @sin(double)
+declare x86_fp80 @sinl(x86_fp80)
+
+define float @call_sinf() sanitize_numerical_stability {
+; CHECK-LABEL: @call_sinf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @sinf(float 1.000000e+00) #[[ATTR3]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.sin.f64(double 1.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_sinf to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @sinf(float 1.0)
+ ret float %r
+}
+
+define double @call_sin() sanitize_numerical_stability {
+; DQQ-LABEL: @call_sin(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @sin(double 1.000000e+00) #[[ATTR3]]
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.sin.f80(x86_fp80 0xK3FFF8000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_sin to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_sin(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @sin(double 1.000000e+00) #[[ATTR3]]
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.sin.f80(x86_fp80 0xK3FFF8000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_sin to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @sin(double 1.0)
+ ret double %r
+}
+
+define x86_fp80 @call_sinl() sanitize_numerical_stability {
+; CHECK-LABEL: @call_sinl(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @sinl(x86_fp80 0xK3FFF8000000000000000) #[[ATTR3]]
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.sin.f80(x86_fp80 0xK3FFF8000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_sinl to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @sinl(x86_fp80 0xK3FFF8000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# cos #
+;###############################################################
+declare float @cosf(float)
+declare double @cos(double)
+declare x86_fp80 @cosl(x86_fp80)
+
+define float @call_cosf() sanitize_numerical_stability {
+; CHECK-LABEL: @call_cosf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @cosf(float 1.000000e+00) #[[ATTR3]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.cos.f64(double 1.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_cosf to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @cosf(float 1.0)
+ ret float %r
+}
+
+define double @call_cos() sanitize_numerical_stability {
+; DQQ-LABEL: @call_cos(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @cos(double 1.000000e+00) #[[ATTR3]]
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.cos.f80(x86_fp80 0xK3FFF8000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_cos to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_cos(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @cos(double 1.000000e+00) #[[ATTR3]]
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.cos.f80(x86_fp80 0xK3FFF8000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_cos to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @cos(double 1.0)
+ ret double %r
+}
+
+define x86_fp80 @call_cosl() sanitize_numerical_stability {
+; CHECK-LABEL: @call_cosl(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @cosl(x86_fp80 0xK3FFF8000000000000000) #[[ATTR3]]
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.cos.f80(x86_fp80 0xK3FFF8000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_cosl to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @cosl(x86_fp80 0xK3FFF8000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# pow #
+;###############################################################
+declare float @powf(float, float)
+declare double @pow(double, double)
+declare x86_fp80 @powl(x86_fp80, x86_fp80)
+
+define float @call_powf() sanitize_numerical_stability {
+; CHECK-LABEL: @call_powf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @powf(float 2.000000e+00, float 3.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.pow.f64(double 2.000000e+00, double 3.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_powf to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @powf(float 2.0, float 3.0)
+ ret float %r
+}
+
+define double @call_pow() sanitize_numerical_stability {
+; DQQ-LABEL: @call_pow(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @pow(double 2.000000e+00, double 3.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.pow.f80(x86_fp80 0xK40008000000000000000, x86_fp80 0xK4000C000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_pow to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_pow(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @pow(double 2.000000e+00, double 3.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.pow.f80(x86_fp80 0xK40008000000000000000, x86_fp80 0xK4000C000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_pow to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @pow(double 2.0, double 3.0)
+ ret double %r
+}
+
+define x86_fp80 @call_powl() sanitize_numerical_stability {
+; CHECK-LABEL: @call_powl(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @powl(x86_fp80 0xK40008000000000000000, x86_fp80 0xK40018000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.pow.f80(x86_fp80 0xK40008000000000000000, x86_fp80 0xK40018000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_powl to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @powl(x86_fp80 0xK40008000000000000000, x86_fp80 0xK40018000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# exp #
+;###############################################################
+declare float @expf(float)
+declare double @exp(double)
+declare x86_fp80 @expl(x86_fp80)
+
+define float @call_expf() sanitize_numerical_stability {
+; CHECK-LABEL: @call_expf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @expf(float 1.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.exp.f64(double 1.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_expf to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @expf(float 1.0)
+ ret float %r
+}
+
+define double @call_exp() sanitize_numerical_stability {
+; DQQ-LABEL: @call_exp(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @exp(double 1.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.exp.f80(x86_fp80 0xK3FFF8000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_exp to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_exp(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @exp(double 1.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.exp.f80(x86_fp80 0xK3FFF8000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_exp to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @exp(double 1.0)
+ ret double %r
+}
+
+define x86_fp80 @call_expl() sanitize_numerical_stability {
+; CHECK-LABEL: @call_expl(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @expl(x86_fp80 0xK3FFF8000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.exp.f80(x86_fp80 0xK3FFF8000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_expl to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @expl(x86_fp80 0xK3FFF8000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# exp2 #
+;###############################################################
+declare float @exp2f(float)
+declare double @exp2(double)
+declare x86_fp80 @exp2l(x86_fp80)
+
+define float @call_exp2f() sanitize_numerical_stability {
+; CHECK-LABEL: @call_exp2f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @exp2f(float 1.000000e+00) #[[ATTR3]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.exp2.f64(double 1.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_exp2f to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @exp2f(float 1.0)
+ ret float %r
+}
+
+define double @call_exp2() sanitize_numerical_stability {
+; DQQ-LABEL: @call_exp2(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @exp2(double 1.000000e+00) #[[ATTR3]]
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.exp2.f80(x86_fp80 0xK3FFF8000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_exp2 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_exp2(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @exp2(double 1.000000e+00) #[[ATTR3]]
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.exp2.f80(x86_fp80 0xK3FFF8000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_exp2 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @exp2(double 1.0)
+ ret double %r
+}
+
+define x86_fp80 @call_exp2l() sanitize_numerical_stability {
+; CHECK-LABEL: @call_exp2l(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @exp2l(x86_fp80 0xK3FFF8000000000000000) #[[ATTR3]]
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.exp2.f80(x86_fp80 0xK3FFF8000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_exp2l to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @exp2l(x86_fp80 0xK3FFF8000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# log #
+;###############################################################
+declare float @logf(float)
+declare double @log(double)
+declare x86_fp80 @logl(x86_fp80)
+
+define float @call_logf() sanitize_numerical_stability {
+; CHECK-LABEL: @call_logf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @logf(float 2.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.log.f64(double 2.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_logf to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @logf(float 2.0)
+ ret float %r
+}
+
+define double @call_log() sanitize_numerical_stability {
+; DQQ-LABEL: @call_log(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @log(double 2.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log.f80(x86_fp80 0xK40008000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_log to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_log(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @log(double 2.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log.f80(x86_fp80 0xK40008000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_log to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @log(double 2.0)
+ ret double %r
+}
+
+define x86_fp80 @call_logl() sanitize_numerical_stability {
+; CHECK-LABEL: @call_logl(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @logl(x86_fp80 0xK40008000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log.f80(x86_fp80 0xK40008000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_logl to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @logl(x86_fp80 0xK40008000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# log10 #
+;###############################################################
+declare float @log10f(float)
+declare double @log10(double)
+declare x86_fp80 @log10l(x86_fp80)
+
+define float @call_log10f() sanitize_numerical_stability {
+; CHECK-LABEL: @call_log10f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @log10f(float 1.000000e+02)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.log10.f64(double 1.000000e+02)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_log10f to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @log10f(float 100.0)
+ ret float %r
+}
+
+define double @call_log10() sanitize_numerical_stability {
+; DQQ-LABEL: @call_log10(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @log10(double 1.000000e+02)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log10.f80(x86_fp80 0xK4005C800000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_log10 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_log10(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @log10(double 1.000000e+02)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log10.f80(x86_fp80 0xK4005C800000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_log10 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @log10(double 100.0)
+ ret double %r
+}
+
+define x86_fp80 @call_log10l() sanitize_numerical_stability {
+; CHECK-LABEL: @call_log10l(
+; CHECK-NEXT: entry:
+; CHECK-NEXT:    [[R:%.*]] = call x86_fp80 @log10l(x86_fp80 0xK4005C800000000000000)
+; CHECK-NEXT:    [[TMP0:%.*]] = call x86_fp80 @llvm.log10.f80(x86_fp80 0xK4005C800000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_log10l to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+  %r = call x86_fp80 @log10l(x86_fp80 0xK4005C800000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# log2 #
+;###############################################################
+declare float @log2f(float)
+declare double @log2(double)
+declare x86_fp80 @log2l(x86_fp80)
+
+define float @call_log2f() sanitize_numerical_stability {
+; CHECK-LABEL: @call_log2f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @log2f(float 8.000000e+00) #[[ATTR3]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.log2.f64(double 8.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_log2f to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @log2f(float 8.0)
+ ret float %r
+}
+
+define double @call_log2() sanitize_numerical_stability {
+; DQQ-LABEL: @call_log2(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @log2(double 8.000000e+00) #[[ATTR3]]
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log2.f80(x86_fp80 0xK40028000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_log2 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_log2(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @log2(double 8.000000e+00) #[[ATTR3]]
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log2.f80(x86_fp80 0xK40028000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_log2 to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @log2(double 8.0)
+ ret double %r
+}
+
+define x86_fp80 @call_log2l() sanitize_numerical_stability {
+; CHECK-LABEL: @call_log2l(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @log2l(x86_fp80 0xK40018000000000000000) #[[ATTR3]]
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.log2.f80(x86_fp80 0xK40018000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_log2l to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @log2l(x86_fp80 0xK40018000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# fabs #
+;###############################################################
+declare float @fabsf(float)
+declare double @fabs(double)
+declare x86_fp80 @fabsl(x86_fp80)
+
+define float @call_fabsf() sanitize_numerical_stability {
+; CHECK-LABEL: @call_fabsf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @fabsf(float -1.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.fabs.f64(double -1.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_fabsf to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @fabsf(float -1.0)
+ ret float %r
+}
+
+define double @call_fabs() sanitize_numerical_stability {
+; DQQ-LABEL: @call_fabs(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @fabs(double -1.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.fabs.f80(x86_fp80 0xKBFFF8000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_fabs to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_fabs(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @fabs(double -1.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.fabs.f80(x86_fp80 0xKBFFF8000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_fabs to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @fabs(double -1.0)
+ ret double %r
+}
+
+define x86_fp80 @call_fabsl() sanitize_numerical_stability {
+; CHECK-LABEL: @call_fabsl(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @fabsl(x86_fp80 0xKBFFF8000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.fabs.f80(x86_fp80 0xKBFFF8000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_fabsl to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @fabsl(x86_fp80 0xKBFFF8000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# copysign #
+;###############################################################
+declare float @copysignf(float, float)
+declare double @copysign(double, double)
+declare x86_fp80 @copysignl(x86_fp80, x86_fp80)
+
+define float @call_copysignf() sanitize_numerical_stability {
+; CHECK-LABEL: @call_copysignf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @copysignf(float 1.000000e+00, float -2.000000e+00) #[[ATTR3]]
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.copysign.f64(double 1.000000e+00, double -2.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_copysignf to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @copysignf(float 1.0, float -2.0)
+ ret float %r
+}
+
+define double @call_copysign() sanitize_numerical_stability {
+; DQQ-LABEL: @call_copysign(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @copysign(double 1.000000e+00, double -2.000000e+00) #[[ATTR3]]
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.copysign.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xKC0008000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_copysign to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_copysign(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @copysign(double 1.000000e+00, double -2.000000e+00) #[[ATTR3]]
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.copysign.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xKC0008000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_copysign to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @copysign(double 1.0, double -2.0)
+ ret double %r
+}
+
+define x86_fp80 @call_copysignl() sanitize_numerical_stability {
+; CHECK-LABEL: @call_copysignl(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @copysignl(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xKBFFF8000000000000000) #[[ATTR3]]
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.copysign.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xKBFFF8000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_copysignl to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @copysignl(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xKBFFF8000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# floor #
+;###############################################################
+declare float @floorf(float)
+declare double @floor(double)
+declare x86_fp80 @floorl(x86_fp80)
+
+define float @call_floorf() sanitize_numerical_stability {
+; CHECK-LABEL: @call_floorf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @floorf(float 1.500000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.floor.f64(double 1.500000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_floorf to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @floorf(float 1.5)
+ ret float %r
+}
+
+define double @call_floor() sanitize_numerical_stability {
+; DQQ-LABEL: @call_floor(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @floor(double 1.500000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.floor.f80(x86_fp80 0xK3FFFC000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_floor to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_floor(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @floor(double 1.500000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.floor.f80(x86_fp80 0xK3FFFC000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_floor to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @floor(double 1.5)
+ ret double %r
+}
+
+define x86_fp80 @call_floorl() sanitize_numerical_stability {
+; CHECK-LABEL: @call_floorl(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @floorl(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.floor.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_floorl to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @floorl(x86_fp80 0xK3FFFC000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# fmax #
+;###############################################################
+declare float @fmaxf(float, float)
+declare double @fmax(double, double)
+declare x86_fp80 @fmaxl(x86_fp80, x86_fp80)
+
+define float @call_fmaxf() sanitize_numerical_stability {
+; CHECK-LABEL: @call_fmaxf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @fmaxf(float 1.000000e+00, float 2.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.maxnum.f64(double 1.000000e+00, double 2.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_fmaxf to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @fmaxf(float 1.0, float 2.0)
+ ret float %r
+}
+
+define double @call_fmax() sanitize_numerical_stability {
+; DQQ-LABEL: @call_fmax(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @fmax(double 1.000000e+00, double 2.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.maxnum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_fmax to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_fmax(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @fmax(double 1.000000e+00, double 2.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.maxnum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_fmax to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @fmax(double 1.0, double 2.0)
+ ret double %r
+}
+
+define x86_fp80 @call_fmaxl() sanitize_numerical_stability {
+; CHECK-LABEL: @call_fmaxl(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @fmaxl(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.maxnum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_fmaxl to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @fmaxl(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# fmin #
+;###############################################################
+declare float @fminf(float, float)
+declare double @fmin(double, double)
+declare x86_fp80 @fminl(x86_fp80, x86_fp80)
+
+define float @call_fminf() sanitize_numerical_stability {
+; CHECK-LABEL: @call_fminf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @fminf(float 1.000000e+00, float 2.000000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.minnum.f64(double 1.000000e+00, double 2.000000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_fminf to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @fminf(float 1.0, float 2.0)
+ ret float %r
+}
+
+define double @call_fmin() sanitize_numerical_stability {
+; DQQ-LABEL: @call_fmin(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @fmin(double 1.000000e+00, double 2.000000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.minnum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_fmin to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_fmin(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @fmin(double 1.000000e+00, double 2.000000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.minnum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_fmin to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @fmin(double 1.0, double 2.0)
+ ret double %r
+}
+
+define x86_fp80 @call_fminl() sanitize_numerical_stability {
+; CHECK-LABEL: @call_fminl(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @fminl(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.minnum.f80(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_fminl to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @fminl(x86_fp80 0xK3FFF8000000000000000, x86_fp80 0xK40008000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# ceil #
+;###############################################################
+declare float @ceilf(float)
+declare double @ceil(double)
+declare x86_fp80 @ceill(x86_fp80)
+
+define float @call_ceilf() sanitize_numerical_stability {
+; CHECK-LABEL: @call_ceilf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @ceilf(float 1.500000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.ceil.f64(double 1.500000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_ceilf to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @ceilf(float 1.5)
+ ret float %r
+}
+
+define double @call_ceil() sanitize_numerical_stability {
+; DQQ-LABEL: @call_ceil(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @ceil(double 1.500000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.ceil.f80(x86_fp80 0xK3FFFC000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_ceil to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_ceil(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @ceil(double 1.500000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.ceil.f80(x86_fp80 0xK3FFFC000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_ceil to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @ceil(double 1.5)
+ ret double %r
+}
+
+define x86_fp80 @call_ceill() sanitize_numerical_stability {
+; CHECK-LABEL: @call_ceill(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @ceill(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.ceil.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_ceill to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @ceill(x86_fp80 0xK3FFFC000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# trunc #
+;###############################################################
+declare float @truncf(float)
+declare double @trunc(double)
+declare x86_fp80 @truncl(x86_fp80)
+
+define float @call_truncf() sanitize_numerical_stability {
+; CHECK-LABEL: @call_truncf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @truncf(float 1.500000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.trunc.f64(double 1.500000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_truncf to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @truncf(float 1.5)
+ ret float %r
+}
+
+define double @call_trunc() sanitize_numerical_stability {
+; DQQ-LABEL: @call_trunc(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @trunc(double 1.500000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.trunc.f80(x86_fp80 0xK3FFFC000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_trunc to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_trunc(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @trunc(double 1.500000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.trunc.f80(x86_fp80 0xK3FFFC000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_trunc to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @trunc(double 1.5)
+ ret double %r
+}
+
+define x86_fp80 @call_truncl() sanitize_numerical_stability {
+; CHECK-LABEL: @call_truncl(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @truncl(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.trunc.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_truncl to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @truncl(x86_fp80 0xK3FFFC000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# rint #
+;###############################################################
+declare float @rintf(float)
+declare double @rint(double)
+declare x86_fp80 @rintl(x86_fp80)
+
+define float @call_rintf() sanitize_numerical_stability {
+; CHECK-LABEL: @call_rintf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @rintf(float 1.500000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.rint.f64(double 1.500000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_rintf to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @rintf(float 1.5)
+ ret float %r
+}
+
+define double @call_rint() sanitize_numerical_stability {
+; DQQ-LABEL: @call_rint(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @rint(double 1.500000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.rint.f80(x86_fp80 0xK3FFFC000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_rint to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_rint(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @rint(double 1.500000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.rint.f80(x86_fp80 0xK3FFFC000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_rint to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @rint(double 1.5)
+ ret double %r
+}
+
+define x86_fp80 @call_rintl() sanitize_numerical_stability {
+; CHECK-LABEL: @call_rintl(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @rintl(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.rint.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_rintl to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @rintl(x86_fp80 0xK3FFFC000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# nearbyint #
+;###############################################################
+declare float @nearbyintf(float)
+declare double @nearbyint(double)
+declare x86_fp80 @nearbyintl(x86_fp80)
+
+define float @call_nearbyintf() sanitize_numerical_stability {
+; CHECK-LABEL: @call_nearbyintf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @nearbyintf(float 1.500000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.nearbyint.f64(double 1.500000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_nearbyintf to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @nearbyintf(float 1.5)
+ ret float %r
+}
+
+define double @call_nearbyint() sanitize_numerical_stability {
+; DQQ-LABEL: @call_nearbyint(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @nearbyint(double 1.500000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.nearbyint.f80(x86_fp80 0xK3FFFC000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_nearbyint to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_nearbyint(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @nearbyint(double 1.500000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.nearbyint.f80(x86_fp80 0xK3FFFC000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_nearbyint to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @nearbyint(double 1.5)
+ ret double %r
+}
+
+define x86_fp80 @call_nearbyintl() sanitize_numerical_stability {
+; CHECK-LABEL: @call_nearbyintl(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @nearbyintl(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.nearbyint.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_nearbyintl to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @nearbyintl(x86_fp80 0xK3FFFC000000000000000)
+ ret x86_fp80 %r
+}
+
+;###############################################################
+;# round #
+;###############################################################
+declare float @roundf(float)
+declare double @round(double)
+declare x86_fp80 @roundl(x86_fp80)
+
+define float @call_roundf() sanitize_numerical_stability {
+; CHECK-LABEL: @call_roundf(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @roundf(float 1.500000e+00)
+; CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.round.f64(double 1.500000e+00)
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_float_d(float [[R]], double [[TMP0]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = fpext float [[R]] to double
+; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], double [[TMP3]], double [[TMP0]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_roundf to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store double [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 8
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @roundf(float 1.5)
+ ret float %r
+}
+
+define double @call_round() sanitize_numerical_stability {
+; DQQ-LABEL: @call_round(
+; DQQ-NEXT: entry:
+; DQQ-NEXT: [[R:%.*]] = call double @round(double 1.500000e+00)
+; DQQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.round.f80(x86_fp80 0xK3FFFC000000000000000)
+; DQQ-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; DQQ-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_double_q(double [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; DQQ-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; DQQ-NEXT: [[TMP4:%.*]] = fpext double [[R]] to fp128
+; DQQ-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; DQQ-NEXT: store i64 ptrtoint (ptr @call_round to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DQQ-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; DQQ-NEXT: ret double [[R]]
+;
+; DLQ-LABEL: @call_round(
+; DLQ-NEXT: entry:
+; DLQ-NEXT: [[R:%.*]] = call double @round(double 1.500000e+00)
+; DLQ-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.round.f80(x86_fp80 0xK3FFFC000000000000000)
+; DLQ-NEXT: [[TMP1:%.*]] = call i32 @__nsan_internal_check_double_l(double [[R]], x86_fp80 [[TMP0]], i32 1, i64 0)
+; DLQ-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 1
+; DLQ-NEXT: [[TMP3:%.*]] = fpext double [[R]] to x86_fp80
+; DLQ-NEXT: [[TMP4:%.*]] = select i1 [[TMP2]], x86_fp80 [[TMP3]], x86_fp80 [[TMP0]]
+; DLQ-NEXT: store i64 ptrtoint (ptr @call_round to i64), ptr @__nsan_shadow_ret_tag, align 8
+; DLQ-NEXT: store x86_fp80 [[TMP4]], ptr @__nsan_shadow_ret_ptr, align 16
+; DLQ-NEXT: ret double [[R]]
+;
+entry:
+ %r = call double @round(double 1.5)
+ ret double %r
+}
+
+define x86_fp80 @call_roundl() sanitize_numerical_stability {
+; CHECK-LABEL: @call_roundl(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call x86_fp80 @roundl(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP0:%.*]] = call x86_fp80 @llvm.round.f80(x86_fp80 0xK3FFFC000000000000000)
+; CHECK-NEXT: [[TMP1:%.*]] = fpext x86_fp80 [[TMP0]] to fp128
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @__nsan_internal_check_longdouble_q(x86_fp80 [[R]], fp128 [[TMP1]], i32 1, i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 1
+; CHECK-NEXT: [[TMP4:%.*]] = fpext x86_fp80 [[R]] to fp128
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP3]], fp128 [[TMP4]], fp128 [[TMP1]]
+; CHECK-NEXT: store i64 ptrtoint (ptr @call_roundl to i64), ptr @__nsan_shadow_ret_tag, align 8
+; CHECK-NEXT: store fp128 [[TMP5]], ptr @__nsan_shadow_ret_ptr, align 16
+; CHECK-NEXT: ret x86_fp80 [[R]]
+;
+entry:
+ %r = call x86_fp80 @roundl(x86_fp80 0xK3FFFC000000000000000)
+ ret x86_fp80 %r
+}
diff --git a/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-entry-exit.ll b/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-entry-exit.ll
new file mode 100644
index 0000000000000..7766674d2e19a
--- /dev/null
+++ b/llvm/test/Instrumentation/SanitizerCoverage/trace-pc-entry-exit.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=2 -sanitizer-coverage-trace-pc-entry-exit -S | FileCheck %s --check-prefix=CHECK
+; RUN: opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=2 -sanitizer-coverage-trace-pc-entry-exit -sanitizer-coverage-trace-pc -S | FileCheck %s --check-prefixes=CHECK,CHECK_BBCOV
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i32 @callee_a(i32)
+declare i32 @callee_b()
+
+define i32 @func_multi_return(i32 noundef %a) #0 {
+entry:
+ %tobool.not = icmp eq i32 %a, 0
+ br i1 %tobool.not, label %if.else, label %if.then
+
+if.then:
+ %call = call i32 @callee_a(i32 noundef %a)
+ ret i32 %call
+
+if.else:
+ %call1 = call i32 @callee_b()
+ ret i32 %call1
+}
+
+; CHECK: define i32 @func_multi_return
+; CHECK-NOT: call void @__sanitizer_cov_trace_pc()
+; CHECK: call void @__sanitizer_cov_trace_pc_entry()
+; CHECK-NOT: call void @__sanitizer_cov_trace_pc()
+; CHECK: br
+; CHECK_BBCOV: call void @__sanitizer_cov_trace_pc()
+; CHECK: call i32 @callee_a
+; CHECK: notail call void @__sanitizer_cov_trace_pc_exit()
+; CHECK: ret
+; CHECK_BBCOV: call void @__sanitizer_cov_trace_pc()
+; CHECK: call i32 @callee_b
+; CHECK: notail call void @__sanitizer_cov_trace_pc_exit()
+; CHECK: ret
+; CHECK: }
+
+
+define i32 @func_musttail(i32 noundef %a) #0 {
+ %call = musttail call i32 @callee_a(i32 noundef %a)
+ ret i32 %call
+}
+
+; CHECK: define i32 @func_musttail
+; CHECK: call void @__sanitizer_cov_trace_pc_entry()
+; CHECK: notail call void @__sanitizer_cov_trace_pc_exit()
+; CHECK: musttail call i32 @callee_a
+; CHECK: ret
+; CHECK: }
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/MC/AArch64/cyclone-movi-bug.s b/llvm/test/MC/AArch64/cyclone-movi-bug.s
index d49aea3926910..def2c8fe0f678 100644
--- a/llvm/test/MC/AArch64/cyclone-movi-bug.s
+++ b/llvm/test/MC/AArch64/cyclone-movi-bug.s
@@ -1,8 +1,8 @@
; RUN: llvm-mc -triple aarch64-apple-ios -mcpu=cyclone %s 2> %t.log | FileCheck %s
; RUN: FileCheck %s --check-prefix=CHECK-ERR < %t.log
- ; CHECK: movi v3.16b, #0
- ; CHECK: movi v7.16b, #0
+ ; CHECK: movi.16b v3, #0
+ ; CHECK: movi.16b v7, #0
; CHECK-ERR: warning: instruction movi.2d with immediate #0 may not function correctly on this CPU, converting to equivalent movi.16b
; CHECK-ERR: warning: instruction movi.2d with immediate #0 may not function correctly on this CPU, converting to equivalent movi.16b
movi.2d v3, #0
diff --git a/llvm/test/MC/AsmParser/directive_seh.s b/llvm/test/MC/AsmParser/directive_seh.s
index 072d76e357abe..6390a9a9cf9a0 100644
--- a/llvm/test/MC/AsmParser/directive_seh.s
+++ b/llvm/test/MC/AsmParser/directive_seh.s
@@ -2,7 +2,7 @@
# Round trip via intel syntax printing and back.
# RUN: llvm-mc -triple x86_64-pc-win32 %s -output-asm-variant=1 | \
-# RUN: llvm-mc -triple x86_64-pc-win32 -x86-asm-syntax=intel | FileCheck %s
+# RUN: llvm-mc -triple x86_64-pc-win32 -x86-asm-syntax=intel --output-asm-variant=0 | FileCheck %s
.text
.globl func
diff --git a/llvm/test/MC/AsmParser/invalid-asm-variant.s b/llvm/test/MC/AsmParser/invalid-asm-variant.s
index a96bc9abb0305..f29a13431b78b 100644
--- a/llvm/test/MC/AsmParser/invalid-asm-variant.s
+++ b/llvm/test/MC/AsmParser/invalid-asm-variant.s
@@ -1,3 +1,3 @@
//RUN: not llvm-mc --disassemble -triple=x86_64 --output-asm-variant=2 %s -o - 2>&1 | FileCheck %s
-//CHECK: error: unable to create instruction printer for target triple 'x86_64' with assembly variant 2.
+//CHECK: error: unable to create instruction printer for target triple 'x86_64' with assembly variant 2
diff --git a/llvm/test/MC/Disassembler/AArch64/arm64-crypto.txt b/llvm/test/MC/Disassembler/AArch64/arm64-crypto.txt
index b905b92c636c2..0d4ba2d90532d 100644
--- a/llvm/test/MC/Disassembler/AArch64/arm64-crypto.txt
+++ b/llvm/test/MC/Disassembler/AArch64/arm64-crypto.txt
@@ -1,4 +1,4 @@
-# RUN: llvm-mc -triple arm64-apple-darwin -mattr=crypto --disassemble < %s | FileCheck %s
+# RUN: llvm-mc -triple arm64-apple-darwin -mattr=crypto --output-asm-variant=0 --disassemble < %s | FileCheck %s
# RUN: llvm-mc -triple arm64-apple-darwin -mattr=crypto -output-asm-variant=1 --disassemble < %s | FileCheck %s --check-prefix=CHECK-APPLE
0x20 0x48 0x28 0x4e
diff --git a/llvm/test/MC/SystemZ/insn-good-zos-pcrel.s b/llvm/test/MC/SystemZ/insn-good-zos-pcrel.s
index 07bf680f48b1d..ce3b7b5e6d410 100644
--- a/llvm/test/MC/SystemZ/insn-good-zos-pcrel.s
+++ b/llvm/test/MC/SystemZ/insn-good-zos-pcrel.s
@@ -1,5 +1,5 @@
* For z10 and above.
-* RUN: llvm-mc -triple s390x-ibm-zos -show-encoding -emit-gnuas-syntax-on-zos=1 %s | FileCheck %s
+* RUN: llvm-mc -triple s390x-ibm-zos -show-encoding -emit-gnuas-syntax-on-zos=1 --output-asm-variant=0 %s | FileCheck %s
*CHECK: brcl 0, FOO * encoding: [0xc0,0x04,A,A,A,A]
*CHECK: fixup A - offset: 2, value: FOO+2, kind: FK_390_PC32DBL
diff --git a/llvm/test/MC/X86/intel-syntax-32.s b/llvm/test/MC/X86/intel-syntax-32.s
index a503a256ce213..aca7511c6a834 100644
--- a/llvm/test/MC/X86/intel-syntax-32.s
+++ b/llvm/test/MC/X86/intel-syntax-32.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple i686-unknown-unknown -x86-asm-syntax=intel %s | FileCheck %s
+// RUN: llvm-mc -triple i686-unknown-unknown -x86-asm-syntax=intel --output-asm-variant=0 %s | FileCheck %s
// CHECK: leaw (%bp,%si), %ax
lea ax, [bp+si]
diff --git a/llvm/test/MC/X86/intel-syntax-branch.s b/llvm/test/MC/X86/intel-syntax-branch.s
index c8dcc9613cc1a..a5e5d31a3adc5 100644
--- a/llvm/test/MC/X86/intel-syntax-branch.s
+++ b/llvm/test/MC/X86/intel-syntax-branch.s
@@ -1,7 +1,7 @@
-// RUN: llvm-mc -triple i686-unknown-unknown -x86-asm-syntax=intel %s | FileCheck %s --check-prefixes=CHECK-32,CHECK
-// RUN: llvm-mc -triple x86_64-unknown-unknown --defsym X64=1 -x86-asm-syntax=intel %s | FileCheck %s --check-prefixes=CHECK-64,CHECK
+// RUN: llvm-mc -triple i686-unknown-unknown -x86-asm-syntax=intel --output-asm-variant=0 %s | FileCheck %s --check-prefixes=CHECK-32,CHECK
+// RUN: llvm-mc -triple x86_64-unknown-unknown --defsym X64=1 -x86-asm-syntax=intel --output-asm-variant=0 %s | FileCheck %s --check-prefixes=CHECK-64,CHECK
-// RUN: not llvm-mc -triple i686-unknown-unknown --defsym ERR=1 -x86-asm-syntax=intel %s 2>&1 | FileCheck %s --check-prefixes=ERR-32
+// RUN: not llvm-mc -triple i686-unknown-unknown --defsym ERR=1 -x86-asm-syntax=intel --output-asm-variant=0 %s 2>&1 | FileCheck %s --check-prefixes=ERR-32
t0:
call direct_branch
diff --git a/llvm/test/MC/X86/intel-syntax-directional-label.s b/llvm/test/MC/X86/intel-syntax-directional-label.s
index c1aa90f1923bf..4dbeee1d9f9d3 100644
--- a/llvm/test/MC/X86/intel-syntax-directional-label.s
+++ b/llvm/test/MC/X86/intel-syntax-directional-label.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple x86_64-apple-darwin -x86-asm-syntax=intel %s | FileCheck %s
+// RUN: llvm-mc -triple x86_64-apple-darwin -x86-asm-syntax=intel --output-asm-variant=0 %s | FileCheck %s
// rdar://14961158
.text
.align 16
diff --git a/llvm/test/MC/X86/intel-syntax-hex.s b/llvm/test/MC/X86/intel-syntax-hex.s
index cb73ca9f5017b..64c95a60c54e4 100644
--- a/llvm/test/MC/X86/intel-syntax-hex.s
+++ b/llvm/test/MC/X86/intel-syntax-hex.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -masm-integers -triple x86_64-unknown-unknown -x86-asm-syntax=intel %s | FileCheck %s
+// RUN: llvm-mc -masm-integers -triple x86_64-unknown-unknown -x86-asm-syntax=intel --output-asm-variant=0 %s | FileCheck %s
// rdar://12470373
// Checks to make sure we parse the hexadecimal suffix properly.
diff --git a/llvm/test/MC/X86/intel-syntax-movabs-large.s b/llvm/test/MC/X86/intel-syntax-movabs-large.s
index eb4353dbaab17..5e9582c5995c7 100644
--- a/llvm/test/MC/X86/intel-syntax-movabs-large.s
+++ b/llvm/test/MC/X86/intel-syntax-movabs-large.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple x86_64- -x86-asm-syntax=intel --show-encoding %s | FileCheck %s
+// RUN: llvm-mc -triple x86_64- -x86-asm-syntax=intel --output-asm-variant=0 --show-encoding %s | FileCheck %s
// These should map mov -> movabs
diff --git a/llvm/test/MC/X86/intel-syntax-unsized-memory.s b/llvm/test/MC/X86/intel-syntax-unsized-memory.s
index c938d76498eed..1ec36910d4cb0 100644
--- a/llvm/test/MC/X86/intel-syntax-unsized-memory.s
+++ b/llvm/test/MC/X86/intel-syntax-unsized-memory.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel %s | FileCheck %s
+// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel --output-asm-variant=0 %s | FileCheck %s
// Check that we deduce unsized memory operands in the general, unambiguous, case.
// We can't deduce xword memory operands, because there is no instruction
diff --git a/llvm/test/MC/X86/intel-syntax.s b/llvm/test/MC/X86/intel-syntax.s
index 13616a00b249e..024af5db6106d 100644
--- a/llvm/test/MC/X86/intel-syntax.s
+++ b/llvm/test/MC/X86/intel-syntax.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel %s > %t 2> %t.err
+// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel --output-asm-variant=0 %s > %t 2> %t.err
// RUN: FileCheck < %t %s
// RUN: FileCheck --check-prefix=CHECK-STDERR < %t.err %s
diff --git a/llvm/test/MC/X86/pr32530.s b/llvm/test/MC/X86/pr32530.s
index 328c329a418a9..e436df55589d9 100644
--- a/llvm/test/MC/X86/pr32530.s
+++ b/llvm/test/MC/X86/pr32530.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel %s | FileCheck %s
+// RUN: llvm-mc -triple x86_64-unknown-unknown -x86-asm-syntax=intel --output-asm-variant=0 %s | FileCheck %s
.text
// CHECK: movq $msg, %rsi
diff --git a/llvm/test/MC/X86/x86-32-ms-inline-asm.s b/llvm/test/MC/X86/x86-32-ms-inline-asm.s
index 3169033f2ffcf..5cce75a40e4ea 100644
--- a/llvm/test/MC/X86/x86-32-ms-inline-asm.s
+++ b/llvm/test/MC/X86/x86-32-ms-inline-asm.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -x86-asm-syntax=intel -triple i386-unknown-unknown --show-encoding %s | FileCheck %s
+// RUN: llvm-mc -x86-asm-syntax=intel --output-asm-variant=0 -triple i386-unknown-unknown --show-encoding %s | FileCheck %s
mov eax, [ebx].0
mov [ebx].4, ecx
diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc
index 1ddffacc322f2..b99b6ef60eac7 100644
--- a/llvm/test/TableGen/x86-fold-tables.inc
+++ b/llvm/test/TableGen/x86-fold-tables.inc
@@ -10753,3 +10753,450 @@ static const X86FoldTableEntry BroadcastTable4[] = {
{X86::VXORPSZrrk, X86::VXORPSZrmbk, TB_BCAST_SS},
};
+static const unsigned NonFoldableWithSameMaskTable[] = {
+ X86::VALIGNDZ128rrik,
+ X86::VALIGNDZ128rrikz,
+ X86::VALIGNDZ256rrik,
+ X86::VALIGNDZ256rrikz,
+ X86::VALIGNDZrrik,
+ X86::VALIGNDZrrikz,
+ X86::VALIGNQZ128rrik,
+ X86::VALIGNQZ128rrikz,
+ X86::VALIGNQZ256rrik,
+ X86::VALIGNQZ256rrikz,
+ X86::VALIGNQZrrik,
+ X86::VALIGNQZrrikz,
+ X86::VBROADCASTF32X2Z256rrk,
+ X86::VBROADCASTF32X2Z256rrkz,
+ X86::VBROADCASTF32X2Zrrk,
+ X86::VBROADCASTF32X2Zrrkz,
+ X86::VBROADCASTI32X2Z128rrk,
+ X86::VBROADCASTI32X2Z128rrkz,
+ X86::VBROADCASTI32X2Z256rrk,
+ X86::VBROADCASTI32X2Z256rrkz,
+ X86::VBROADCASTI32X2Zrrk,
+ X86::VBROADCASTI32X2Zrrkz,
+ X86::VBROADCASTSDZ256rrk,
+ X86::VBROADCASTSDZ256rrkz,
+ X86::VBROADCASTSDZrrk,
+ X86::VBROADCASTSDZrrkz,
+ X86::VBROADCASTSSZ128rrk,
+ X86::VBROADCASTSSZ128rrkz,
+ X86::VBROADCASTSSZ256rrk,
+ X86::VBROADCASTSSZ256rrkz,
+ X86::VBROADCASTSSZrrk,
+ X86::VBROADCASTSSZrrkz,
+ X86::VDBPSADBWZ128rrik,
+ X86::VDBPSADBWZ128rrikz,
+ X86::VDBPSADBWZ256rrik,
+ X86::VDBPSADBWZ256rrikz,
+ X86::VDBPSADBWZrrik,
+ X86::VDBPSADBWZrrikz,
+ X86::VEXPANDPDZ128rrk,
+ X86::VEXPANDPDZ128rrkz,
+ X86::VEXPANDPDZ256rrk,
+ X86::VEXPANDPDZ256rrkz,
+ X86::VEXPANDPDZrrk,
+ X86::VEXPANDPDZrrkz,
+ X86::VEXPANDPSZ128rrk,
+ X86::VEXPANDPSZ128rrkz,
+ X86::VEXPANDPSZ256rrk,
+ X86::VEXPANDPSZ256rrkz,
+ X86::VEXPANDPSZrrk,
+ X86::VEXPANDPSZrrkz,
+ X86::VGF2P8AFFINEINVQBZ128rrik,
+ X86::VGF2P8AFFINEINVQBZ128rrikz,
+ X86::VGF2P8AFFINEINVQBZ256rrik,
+ X86::VGF2P8AFFINEINVQBZ256rrikz,
+ X86::VGF2P8AFFINEINVQBZrrik,
+ X86::VGF2P8AFFINEINVQBZrrikz,
+ X86::VGF2P8AFFINEQBZ128rrik,
+ X86::VGF2P8AFFINEQBZ128rrikz,
+ X86::VGF2P8AFFINEQBZ256rrik,
+ X86::VGF2P8AFFINEQBZ256rrikz,
+ X86::VGF2P8AFFINEQBZrrik,
+ X86::VGF2P8AFFINEQBZrrikz,
+ X86::VINSERTF32X4Z256rrik,
+ X86::VINSERTF32X4Z256rrikz,
+ X86::VINSERTF32X4Zrrik,
+ X86::VINSERTF32X4Zrrikz,
+ X86::VINSERTF32X8Zrrik,
+ X86::VINSERTF32X8Zrrikz,
+ X86::VINSERTF64X2Z256rrik,
+ X86::VINSERTF64X2Z256rrikz,
+ X86::VINSERTF64X2Zrrik,
+ X86::VINSERTF64X2Zrrikz,
+ X86::VINSERTF64X4Zrrik,
+ X86::VINSERTF64X4Zrrikz,
+ X86::VINSERTI32X4Z256rrik,
+ X86::VINSERTI32X4Z256rrikz,
+ X86::VINSERTI32X4Zrrik,
+ X86::VINSERTI32X4Zrrikz,
+ X86::VINSERTI32X8Zrrik,
+ X86::VINSERTI32X8Zrrikz,
+ X86::VINSERTI64X2Z256rrik,
+ X86::VINSERTI64X2Z256rrikz,
+ X86::VINSERTI64X2Zrrik,
+ X86::VINSERTI64X2Zrrikz,
+ X86::VINSERTI64X4Zrrik,
+ X86::VINSERTI64X4Zrrikz,
+ X86::VMOVDDUPZ128rrk,
+ X86::VMOVDDUPZ128rrkz,
+ X86::VMOVDDUPZ256rrk,
+ X86::VMOVDDUPZ256rrkz,
+ X86::VMOVDDUPZrrk,
+ X86::VMOVDDUPZrrkz,
+ X86::VMOVSHDUPZ128rrk,
+ X86::VMOVSHDUPZ128rrkz,
+ X86::VMOVSHDUPZ256rrk,
+ X86::VMOVSHDUPZ256rrkz,
+ X86::VMOVSHDUPZrrk,
+ X86::VMOVSHDUPZrrkz,
+ X86::VMOVSLDUPZ128rrk,
+ X86::VMOVSLDUPZ128rrkz,
+ X86::VMOVSLDUPZ256rrk,
+ X86::VMOVSLDUPZ256rrkz,
+ X86::VMOVSLDUPZrrk,
+ X86::VMOVSLDUPZrrkz,
+ X86::VMPSADBWZ128rrik,
+ X86::VMPSADBWZ128rrikz,
+ X86::VMPSADBWZ256rrik,
+ X86::VMPSADBWZ256rrikz,
+ X86::VMPSADBWZrrik,
+ X86::VMPSADBWZrrikz,
+ X86::VPACKSSDWZ128rrk,
+ X86::VPACKSSDWZ128rrkz,
+ X86::VPACKSSDWZ256rrk,
+ X86::VPACKSSDWZ256rrkz,
+ X86::VPACKSSDWZrrk,
+ X86::VPACKSSDWZrrkz,
+ X86::VPACKSSWBZ128rrk,
+ X86::VPACKSSWBZ128rrkz,
+ X86::VPACKSSWBZ256rrk,
+ X86::VPACKSSWBZ256rrkz,
+ X86::VPACKSSWBZrrk,
+ X86::VPACKSSWBZrrkz,
+ X86::VPACKUSDWZ128rrk,
+ X86::VPACKUSDWZ128rrkz,
+ X86::VPACKUSDWZ256rrk,
+ X86::VPACKUSDWZ256rrkz,
+ X86::VPACKUSDWZrrk,
+ X86::VPACKUSDWZrrkz,
+ X86::VPACKUSWBZ128rrk,
+ X86::VPACKUSWBZ128rrkz,
+ X86::VPACKUSWBZ256rrk,
+ X86::VPACKUSWBZ256rrkz,
+ X86::VPACKUSWBZrrk,
+ X86::VPACKUSWBZrrkz,
+ X86::VPALIGNRZ128rrik,
+ X86::VPALIGNRZ128rrikz,
+ X86::VPALIGNRZ256rrik,
+ X86::VPALIGNRZ256rrikz,
+ X86::VPALIGNRZrrik,
+ X86::VPALIGNRZrrikz,
+ X86::VPBROADCASTBZ128rrk,
+ X86::VPBROADCASTBZ128rrkz,
+ X86::VPBROADCASTBZ256rrk,
+ X86::VPBROADCASTBZ256rrkz,
+ X86::VPBROADCASTBZrrk,
+ X86::VPBROADCASTBZrrkz,
+ X86::VPBROADCASTDZ128rrk,
+ X86::VPBROADCASTDZ128rrkz,
+ X86::VPBROADCASTDZ256rrk,
+ X86::VPBROADCASTDZ256rrkz,
+ X86::VPBROADCASTDZrrk,
+ X86::VPBROADCASTDZrrkz,
+ X86::VPBROADCASTQZ128rrk,
+ X86::VPBROADCASTQZ128rrkz,
+ X86::VPBROADCASTQZ256rrk,
+ X86::VPBROADCASTQZ256rrkz,
+ X86::VPBROADCASTQZrrk,
+ X86::VPBROADCASTQZrrkz,
+ X86::VPBROADCASTWZ128rrk,
+ X86::VPBROADCASTWZ128rrkz,
+ X86::VPBROADCASTWZ256rrk,
+ X86::VPBROADCASTWZ256rrkz,
+ X86::VPBROADCASTWZrrk,
+ X86::VPBROADCASTWZrrkz,
+ X86::VPCONFLICTDZ128rrk,
+ X86::VPCONFLICTDZ128rrkz,
+ X86::VPCONFLICTDZ256rrk,
+ X86::VPCONFLICTDZ256rrkz,
+ X86::VPCONFLICTDZrrk,
+ X86::VPCONFLICTDZrrkz,
+ X86::VPCONFLICTQZ128rrk,
+ X86::VPCONFLICTQZ128rrkz,
+ X86::VPCONFLICTQZ256rrk,
+ X86::VPCONFLICTQZ256rrkz,
+ X86::VPCONFLICTQZrrk,
+ X86::VPCONFLICTQZrrkz,
+ X86::VPERMBZ128rrk,
+ X86::VPERMBZ128rrkz,
+ X86::VPERMBZ256rrk,
+ X86::VPERMBZ256rrkz,
+ X86::VPERMBZrrk,
+ X86::VPERMBZrrkz,
+ X86::VPERMDZ256rrk,
+ X86::VPERMDZ256rrkz,
+ X86::VPERMDZrrk,
+ X86::VPERMDZrrkz,
+ X86::VPERMI2BZ128rrk,
+ X86::VPERMI2BZ128rrkz,
+ X86::VPERMI2BZ256rrk,
+ X86::VPERMI2BZ256rrkz,
+ X86::VPERMI2BZrrk,
+ X86::VPERMI2BZrrkz,
+ X86::VPERMI2DZ128rrk,
+ X86::VPERMI2DZ128rrkz,
+ X86::VPERMI2DZ256rrk,
+ X86::VPERMI2DZ256rrkz,
+ X86::VPERMI2DZrrk,
+ X86::VPERMI2DZrrkz,
+ X86::VPERMI2PDZ128rrk,
+ X86::VPERMI2PDZ128rrkz,
+ X86::VPERMI2PDZ256rrk,
+ X86::VPERMI2PDZ256rrkz,
+ X86::VPERMI2PDZrrk,
+ X86::VPERMI2PDZrrkz,
+ X86::VPERMI2PSZ128rrk,
+ X86::VPERMI2PSZ128rrkz,
+ X86::VPERMI2PSZ256rrk,
+ X86::VPERMI2PSZ256rrkz,
+ X86::VPERMI2PSZrrk,
+ X86::VPERMI2PSZrrkz,
+ X86::VPERMI2QZ128rrk,
+ X86::VPERMI2QZ128rrkz,
+ X86::VPERMI2QZ256rrk,
+ X86::VPERMI2QZ256rrkz,
+ X86::VPERMI2QZrrk,
+ X86::VPERMI2QZrrkz,
+ X86::VPERMI2WZ128rrk,
+ X86::VPERMI2WZ128rrkz,
+ X86::VPERMI2WZ256rrk,
+ X86::VPERMI2WZ256rrkz,
+ X86::VPERMI2WZrrk,
+ X86::VPERMI2WZrrkz,
+ X86::VPERMILPDZ128rik,
+ X86::VPERMILPDZ128rikz,
+ X86::VPERMILPDZ256rik,
+ X86::VPERMILPDZ256rikz,
+ X86::VPERMILPDZrik,
+ X86::VPERMILPDZrikz,
+ X86::VPERMILPSZ128rik,
+ X86::VPERMILPSZ128rikz,
+ X86::VPERMILPSZ256rik,
+ X86::VPERMILPSZ256rikz,
+ X86::VPERMILPSZrik,
+ X86::VPERMILPSZrikz,
+ X86::VPERMPDZ256rik,
+ X86::VPERMPDZ256rikz,
+ X86::VPERMPDZ256rrk,
+ X86::VPERMPDZ256rrkz,
+ X86::VPERMPDZrik,
+ X86::VPERMPDZrikz,
+ X86::VPERMPDZrrk,
+ X86::VPERMPDZrrkz,
+ X86::VPERMPSZ256rrk,
+ X86::VPERMPSZ256rrkz,
+ X86::VPERMPSZrrk,
+ X86::VPERMPSZrrkz,
+ X86::VPERMQZ256rik,
+ X86::VPERMQZ256rikz,
+ X86::VPERMQZ256rrk,
+ X86::VPERMQZ256rrkz,
+ X86::VPERMQZrik,
+ X86::VPERMQZrikz,
+ X86::VPERMQZrrk,
+ X86::VPERMQZrrkz,
+ X86::VPERMT2BZ128rrk,
+ X86::VPERMT2BZ128rrkz,
+ X86::VPERMT2BZ256rrk,
+ X86::VPERMT2BZ256rrkz,
+ X86::VPERMT2BZrrk,
+ X86::VPERMT2BZrrkz,
+ X86::VPERMT2DZ128rrk,
+ X86::VPERMT2DZ128rrkz,
+ X86::VPERMT2DZ256rrk,
+ X86::VPERMT2DZ256rrkz,
+ X86::VPERMT2DZrrk,
+ X86::VPERMT2DZrrkz,
+ X86::VPERMT2PDZ128rrk,
+ X86::VPERMT2PDZ128rrkz,
+ X86::VPERMT2PDZ256rrk,
+ X86::VPERMT2PDZ256rrkz,
+ X86::VPERMT2PDZrrk,
+ X86::VPERMT2PDZrrkz,
+ X86::VPERMT2PSZ128rrk,
+ X86::VPERMT2PSZ128rrkz,
+ X86::VPERMT2PSZ256rrk,
+ X86::VPERMT2PSZ256rrkz,
+ X86::VPERMT2PSZrrk,
+ X86::VPERMT2PSZrrkz,
+ X86::VPERMT2QZ128rrk,
+ X86::VPERMT2QZ128rrkz,
+ X86::VPERMT2QZ256rrk,
+ X86::VPERMT2QZ256rrkz,
+ X86::VPERMT2QZrrk,
+ X86::VPERMT2QZrrkz,
+ X86::VPERMT2WZ128rrk,
+ X86::VPERMT2WZ128rrkz,
+ X86::VPERMT2WZ256rrk,
+ X86::VPERMT2WZ256rrkz,
+ X86::VPERMT2WZrrk,
+ X86::VPERMT2WZrrkz,
+ X86::VPERMWZ128rrk,
+ X86::VPERMWZ128rrkz,
+ X86::VPERMWZ256rrk,
+ X86::VPERMWZ256rrkz,
+ X86::VPERMWZrrk,
+ X86::VPERMWZrrkz,
+ X86::VPEXPANDBZ128rrk,
+ X86::VPEXPANDBZ128rrkz,
+ X86::VPEXPANDBZ256rrk,
+ X86::VPEXPANDBZ256rrkz,
+ X86::VPEXPANDBZrrk,
+ X86::VPEXPANDBZrrkz,
+ X86::VPEXPANDDZ128rrk,
+ X86::VPEXPANDDZ128rrkz,
+ X86::VPEXPANDDZ256rrk,
+ X86::VPEXPANDDZ256rrkz,
+ X86::VPEXPANDDZrrk,
+ X86::VPEXPANDDZrrkz,
+ X86::VPEXPANDQZ128rrk,
+ X86::VPEXPANDQZ128rrkz,
+ X86::VPEXPANDQZ256rrk,
+ X86::VPEXPANDQZ256rrkz,
+ X86::VPEXPANDQZrrk,
+ X86::VPEXPANDQZrrkz,
+ X86::VPEXPANDWZ128rrk,
+ X86::VPEXPANDWZ128rrkz,
+ X86::VPEXPANDWZ256rrk,
+ X86::VPEXPANDWZ256rrkz,
+ X86::VPEXPANDWZrrk,
+ X86::VPEXPANDWZrrkz,
+ X86::VPMULTISHIFTQBZ128rrk,
+ X86::VPMULTISHIFTQBZ128rrkz,
+ X86::VPMULTISHIFTQBZ256rrk,
+ X86::VPMULTISHIFTQBZ256rrkz,
+ X86::VPMULTISHIFTQBZrrk,
+ X86::VPMULTISHIFTQBZrrkz,
+ X86::VPSHUFDZ128rik,
+ X86::VPSHUFDZ128rikz,
+ X86::VPSHUFDZ256rik,
+ X86::VPSHUFDZ256rikz,
+ X86::VPSHUFDZrik,
+ X86::VPSHUFDZrikz,
+ X86::VPSHUFHWZ128rik,
+ X86::VPSHUFHWZ128rikz,
+ X86::VPSHUFHWZ256rik,
+ X86::VPSHUFHWZ256rikz,
+ X86::VPSHUFHWZrik,
+ X86::VPSHUFHWZrikz,
+ X86::VPSHUFLWZ128rik,
+ X86::VPSHUFLWZ128rikz,
+ X86::VPSHUFLWZ256rik,
+ X86::VPSHUFLWZ256rikz,
+ X86::VPSHUFLWZrik,
+ X86::VPSHUFLWZrikz,
+ X86::VPUNPCKHBWZ128rrk,
+ X86::VPUNPCKHBWZ128rrkz,
+ X86::VPUNPCKHBWZ256rrk,
+ X86::VPUNPCKHBWZ256rrkz,
+ X86::VPUNPCKHBWZrrk,
+ X86::VPUNPCKHBWZrrkz,
+ X86::VPUNPCKHDQZ128rrk,
+ X86::VPUNPCKHDQZ128rrkz,
+ X86::VPUNPCKHDQZ256rrk,
+ X86::VPUNPCKHDQZ256rrkz,
+ X86::VPUNPCKHDQZrrk,
+ X86::VPUNPCKHDQZrrkz,
+ X86::VPUNPCKHQDQZ128rrk,
+ X86::VPUNPCKHQDQZ128rrkz,
+ X86::VPUNPCKHQDQZ256rrk,
+ X86::VPUNPCKHQDQZ256rrkz,
+ X86::VPUNPCKHQDQZrrk,
+ X86::VPUNPCKHQDQZrrkz,
+ X86::VPUNPCKHWDZ128rrk,
+ X86::VPUNPCKHWDZ128rrkz,
+ X86::VPUNPCKHWDZ256rrk,
+ X86::VPUNPCKHWDZ256rrkz,
+ X86::VPUNPCKHWDZrrk,
+ X86::VPUNPCKHWDZrrkz,
+ X86::VPUNPCKLBWZ128rrk,
+ X86::VPUNPCKLBWZ128rrkz,
+ X86::VPUNPCKLBWZ256rrk,
+ X86::VPUNPCKLBWZ256rrkz,
+ X86::VPUNPCKLBWZrrk,
+ X86::VPUNPCKLBWZrrkz,
+ X86::VPUNPCKLDQZ128rrk,
+ X86::VPUNPCKLDQZ128rrkz,
+ X86::VPUNPCKLDQZ256rrk,
+ X86::VPUNPCKLDQZ256rrkz,
+ X86::VPUNPCKLDQZrrk,
+ X86::VPUNPCKLDQZrrkz,
+ X86::VPUNPCKLQDQZ128rrk,
+ X86::VPUNPCKLQDQZ128rrkz,
+ X86::VPUNPCKLQDQZ256rrk,
+ X86::VPUNPCKLQDQZ256rrkz,
+ X86::VPUNPCKLQDQZrrk,
+ X86::VPUNPCKLQDQZrrkz,
+ X86::VPUNPCKLWDZ128rrk,
+ X86::VPUNPCKLWDZ128rrkz,
+ X86::VPUNPCKLWDZ256rrk,
+ X86::VPUNPCKLWDZ256rrkz,
+ X86::VPUNPCKLWDZrrk,
+ X86::VPUNPCKLWDZrrkz,
+ X86::VSHUFF32X4Z256rrik,
+ X86::VSHUFF32X4Z256rrikz,
+ X86::VSHUFF32X4Zrrik,
+ X86::VSHUFF32X4Zrrikz,
+ X86::VSHUFF64X2Z256rrik,
+ X86::VSHUFF64X2Z256rrikz,
+ X86::VSHUFF64X2Zrrik,
+ X86::VSHUFF64X2Zrrikz,
+ X86::VSHUFI32X4Z256rrik,
+ X86::VSHUFI32X4Z256rrikz,
+ X86::VSHUFI32X4Zrrik,
+ X86::VSHUFI32X4Zrrikz,
+ X86::VSHUFI64X2Z256rrik,
+ X86::VSHUFI64X2Z256rrikz,
+ X86::VSHUFI64X2Zrrik,
+ X86::VSHUFI64X2Zrrikz,
+ X86::VSHUFPDZ128rrik,
+ X86::VSHUFPDZ128rrikz,
+ X86::VSHUFPDZ256rrik,
+ X86::VSHUFPDZ256rrikz,
+ X86::VSHUFPDZrrik,
+ X86::VSHUFPDZrrikz,
+ X86::VSHUFPSZ128rrik,
+ X86::VSHUFPSZ128rrikz,
+ X86::VSHUFPSZ256rrik,
+ X86::VSHUFPSZ256rrikz,
+ X86::VSHUFPSZrrik,
+ X86::VSHUFPSZrrikz,
+ X86::VUNPCKHPDZ128rrk,
+ X86::VUNPCKHPDZ128rrkz,
+ X86::VUNPCKHPDZ256rrk,
+ X86::VUNPCKHPDZ256rrkz,
+ X86::VUNPCKHPDZrrk,
+ X86::VUNPCKHPDZrrkz,
+ X86::VUNPCKHPSZ128rrk,
+ X86::VUNPCKHPSZ128rrkz,
+ X86::VUNPCKHPSZ256rrk,
+ X86::VUNPCKHPSZ256rrkz,
+ X86::VUNPCKHPSZrrk,
+ X86::VUNPCKHPSZrrkz,
+ X86::VUNPCKLPDZ128rrk,
+ X86::VUNPCKLPDZ128rrkz,
+ X86::VUNPCKLPDZ256rrk,
+ X86::VUNPCKLPDZ256rrkz,
+ X86::VUNPCKLPDZrrk,
+ X86::VUNPCKLPDZrrkz,
+ X86::VUNPCKLPSZ128rrk,
+ X86::VUNPCKLPSZ128rrkz,
+ X86::VUNPCKLPSZ256rrk,
+ X86::VUNPCKLPSZ256rrkz,
+ X86::VUNPCKLPSZrrk,
+ X86::VUNPCKLPSZrrkz,
+};
+
diff --git a/llvm/test/Transforms/AggressiveInstCombine/AArch64/lower-table-based-log2-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/AArch64/lower-table-based-log2-basics.ll
new file mode 100644
index 0000000000000..4585af5dcf314
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/AArch64/lower-table-based-log2-basics.ll
@@ -0,0 +1,206 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=aggressive-instcombine -mtriple=aarch64 -S < %s | FileCheck %s
+
+;; These cases test lowering of various implementations of table-based log2
+;; algorithms to the llvm.ctlz instruction.
+
+;; C reproducers:
+;; int log2(unsigned v) {
+;; static const unsigned char table[] = {
+;; 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
+;; 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
+;; };
+;;
+;; v |= v >> 1;
+;; v |= v >> 2;
+;; v |= v >> 4;
+;; v |= v >> 8;
+;; v |= v >> 16;
+;;
+;; return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
+;; }
+;; int log2_64(unsigned long long v) {
+;; static const unsigned char table[] = {
+;; 0, 47, 1, 56, 48, 27, 2, 60, 57, 49, 41, 37, 28, 16, 3, 61,
+;; 54, 58, 35, 52, 50, 42, 21, 44, 38, 32, 29, 23, 17, 11, 4, 62,
+;; 46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43, 31, 22, 10, 45,
+;; 25, 39, 14, 33, 19, 30, 9, 24, 13, 18, 8, 12, 7, 6, 5, 63
+;; };
+;;
+;; v |= v >> 1;
+;; v |= v >> 2;
+;; v |= v >> 4;
+;; v |= v >> 8;
+;; v |= v >> 16;
+;; v |= v >> 32;
+;;
+;; return table[(v * 0x03F79D71B4CB0A89ULL) >> 58];
+;; }
+
+ at log2.table = internal unnamed_addr constant [32 x i8] c"\00\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+define i32 @log2_32(i32 %v) !prof !0 {
+; CHECK-LABEL: @log2_32(
+; CHECK: !prof [[PROF0:![0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 true)
+; CHECK-NEXT: [[TMP1:%.*]] = sub i32 31, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[V]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]], !prof [[PROF1:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 16
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329821
+ %shr9 = lshr i32 %mul, 27
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr inbounds [32 x i8], ptr @log2.table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+define i32 @log2_32_nusw(i32 %v) {
+; CHECK-LABEL: @log2_32_nusw(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 true)
+; CHECK-NEXT: [[TMP1:%.*]] = sub i32 31, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[V]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]], !prof [[PROF1]]
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 16
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329821
+ %shr9 = lshr i32 %mul, 27
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr nusw [32 x i8], ptr @log2.table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+ at log2_64.table = internal unnamed_addr constant [64 x i8] c"\00/\0180\1B\02<91)%\1C\10\03=6:#42*\15,& \1D\17\11\0B\04>.7\1A;($\0F5\223\14+\1F\16\0A-\19'\0E!\13\1E\09\18\0D\12\08\0C\07\06\05?", align 1
+
+define i32 @log2_64(i64 noundef %v) {
+; CHECK-LABEL: @log2_64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[V:%.*]], i1 true)
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 63, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[V]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP1]], !prof [[PROF1]]
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i8
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i64 %v, 1
+ %or = or i64 %shr, %v
+ %shr1 = lshr i64 %or, 2
+ %or2 = or i64 %shr1, %or
+ %shr3 = lshr i64 %or2, 4
+ %or4 = or i64 %shr3, %or2
+ %shr5 = lshr i64 %or4, 8
+ %or6 = or i64 %shr5, %or4
+ %shr7 = lshr i64 %or6, 16
+ %or8 = or i64 %shr7, %or6
+ %shr9 = lshr i64 %or8, 32
+ %or10 = or i64 %shr9, %or8
+ %mul = mul i64 %or10, 285870213051386505
+ %shr11 = lshr i64 %mul, 58
+ %arrayidx = getelementptr inbounds i8, ptr @log2_64.table, i64 %shr11
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+ at log2_16.table = internal unnamed_addr constant [16 x i8] c"\00\07\01\0D\08\0A\02\0E\06\0C\09\05\0B\04\03\0F", align 1
+
+define i32 @log2_16(i16 noundef %0) {
+; CHECK-LABEL: @log2_16(
+; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.ctlz.i16(i16 [[TMP0:%.*]], i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = sub i16 15, [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i16 [[TMP0]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i16 0, i16 [[TMP3]], !prof [[PROF1]]
+; CHECK-NEXT: [[TMP14:%.*]] = trunc i16 [[TMP5]] to i8
+; CHECK-NEXT: [[TMP15:%.*]] = zext i8 [[TMP14]] to i32
+; CHECK-NEXT: ret i32 [[TMP15]]
+;
+ %2 = lshr i16 %0, 1
+ %3 = or i16 %2, %0
+ %4 = lshr i16 %3, 2
+ %5 = or i16 %4, %3
+ %6 = lshr i16 %5, 4
+ %7 = or i16 %6, %5
+ %8 = lshr i16 %7, 8
+ %9 = or i16 %8, %7
+ %10 = mul i16 %9, 3885
+ %11 = lshr i16 %10, 12
+ %12 = zext nneg i16 %11 to i64
+ %13 = getelementptr inbounds nuw i8, ptr @log2_16.table, i64 %12
+ %14 = load i8, ptr %13, align 1
+ %15 = zext i8 %14 to i32
+ ret i32 %15
+}
+
+
+ at log2_128.table = internal unnamed_addr constant [128 x i8] c"\00\0D\01\0E\13&\02\0F\1A\14A!'H\03\10\1E\1B9\15.B[\225(c<Ij\04}\11\18\1F\1C,3:\161/NCPU\\#E6g)Rdp=WJs^kw\05~\0C\12%\19@ G\1D8-Z4b;i|\17+20MOTDfQoVr]v\0B$?F7Yah{*LSenqu\0A>X`zKmt\09_yl\08x\07\06\7F", align 1
+
+define i32 @log2_128(i128 noundef %0) {
+; CHECK-LABEL: @log2_128(
+; CHECK-NEXT: [[TMP2:%.*]] = call i128 @llvm.ctlz.i128(i128 [[TMP0:%.*]], i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = sub i128 127, [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i128 [[TMP0]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i128 0, i128 [[TMP3]], !prof [[PROF1]]
+; CHECK-NEXT: [[TMP20:%.*]] = trunc i128 [[TMP5]] to i8
+; CHECK-NEXT: [[TMP21:%.*]] = zext i8 [[TMP20]] to i32
+; CHECK-NEXT: ret i32 [[TMP21]]
+;
+ %2 = lshr i128 %0, 1
+ %3 = or i128 %2, %0
+ %4 = lshr i128 %3, 2
+ %5 = or i128 %4, %3
+ %6 = lshr i128 %5, 4
+ %7 = or i128 %6, %5
+ %8 = lshr i128 %7, 8
+ %9 = or i128 %8, %7
+ %10 = lshr i128 %9, 16
+ %11 = or i128 %10, %9
+ %12 = lshr i128 %11, 32
+ %13 = or i128 %12, %11
+ %14 = lshr i128 %13, 64
+ %15 = or i128 %14, %13
+ %16 = mul i128 %15, 2638024179347461332462726661865453437
+ %17 = lshr i128 %16, 121
+ %18 = trunc nuw nsw i128 %17 to i64
+ %19 = getelementptr inbounds nuw i8, ptr @log2_128.table, i64 %18
+ %20 = load i8, ptr %19, align 1
+ %21 = zext i8 %20 to i32
+ ret i32 %21
+}
+
+!0 = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
new file mode 100644
index 0000000000000..2b340faab1300
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-basics.ll
@@ -0,0 +1,220 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=aggressive-instcombine -mtriple=x86_64 -mattr=+lzcnt -S < %s | FileCheck %s
+
+;; These cases test lowering of various implementations of table-based log2
+;; algorithms to the llvm.ctlz instruction.
+
+;; C reproducers:
+;; int log2(unsigned v) {
+;; static const unsigned char table[] = {
+;; 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
+;; 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
+;; };
+;;
+;; v |= v >> 1;
+;; v |= v >> 2;
+;; v |= v >> 4;
+;; v |= v >> 8;
+;; v |= v >> 16;
+;;
+;; return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
+;; }
+;; int log2_64(unsigned long long v) {
+;; static const unsigned char table[] = {
+;; 0, 47, 1, 56, 48, 27, 2, 60, 57, 49, 41, 37, 28, 16, 3, 61,
+;; 54, 58, 35, 52, 50, 42, 21, 44, 38, 32, 29, 23, 17, 11, 4, 62,
+;; 46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43, 31, 22, 10, 45,
+;; 25, 39, 14, 33, 19, 30, 9, 24, 13, 18, 8, 12, 7, 6, 5, 63
+;; };
+;;
+;; v |= v >> 1;
+;; v |= v >> 2;
+;; v |= v >> 4;
+;; v |= v >> 8;
+;; v |= v >> 16;
+;; v |= v >> 32;
+;;
+;; return table[(v * 0x03F79D71B4CB0A89ULL) >> 58];
+;; }
+
+ at log2.table = internal unnamed_addr constant [32 x i8] c"\00\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+define i32 @log2_32(i32 %v) !prof !0 {
+; CHECK-LABEL: @log2_32(
+; CHECK: !prof [[PROF0:![0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 true)
+; CHECK-NEXT: [[TMP1:%.*]] = sub i32 31, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[V]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]], !prof [[PROF1:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 16
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329821
+ %shr9 = lshr i32 %mul, 27
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr inbounds [32 x i8], ptr @log2.table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+define i32 @log2_32_nusw(i32 %v) {
+; CHECK-LABEL: @log2_32_nusw(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[V:%.*]], i1 true)
+; CHECK-NEXT: [[TMP1:%.*]] = sub i32 31, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[V]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i32 0, i32 [[TMP1]], !prof [[PROF1]]
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 16
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329821
+ %shr9 = lshr i32 %mul, 27
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr nusw [32 x i8], ptr @log2.table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+ at log2_64.table = internal unnamed_addr constant [64 x i8] c"\00/\0180\1B\02<91)%\1C\10\03=6:#42*\15,& \1D\17\11\0B\04>.7\1A;($\0F5\223\14+\1F\16\0A-\19'\0E!\13\1E\09\18\0D\12\08\0C\07\06\05?", align 1
+
+define i32 @log2_64(i64 noundef %v) {
+; CHECK-LABEL: @log2_64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[V:%.*]], i1 true)
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 63, [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[V]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 0, i64 [[TMP1]], !prof [[PROF1]]
+; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i8
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i64 %v, 1
+ %or = or i64 %shr, %v
+ %shr1 = lshr i64 %or, 2
+ %or2 = or i64 %shr1, %or
+ %shr3 = lshr i64 %or2, 4
+ %or4 = or i64 %shr3, %or2
+ %shr5 = lshr i64 %or4, 8
+ %or6 = or i64 %shr5, %or4
+ %shr7 = lshr i64 %or6, 16
+ %or8 = or i64 %shr7, %or6
+ %shr9 = lshr i64 %or8, 32
+ %or10 = or i64 %shr9, %or8
+ %mul = mul i64 %or10, 285870213051386505
+ %shr11 = lshr i64 %mul, 58
+ %arrayidx = getelementptr inbounds i8, ptr @log2_64.table, i64 %shr11
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+ at log2_16.table = internal unnamed_addr constant [16 x i8] c"\00\07\01\0D\08\0A\02\0E\06\0C\09\05\0B\04\03\0F", align 1
+
+define i32 @log2_16(i16 noundef %0) {
+; CHECK-LABEL: @log2_16(
+; CHECK-NEXT: [[TMP2:%.*]] = call i16 @llvm.ctlz.i16(i16 [[TMP0:%.*]], i1 true)
+; CHECK-NEXT: [[TMP3:%.*]] = sub i16 15, [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i16 [[TMP0]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i16 0, i16 [[TMP3]], !prof [[PROF1]]
+; CHECK-NEXT: [[TMP14:%.*]] = trunc i16 [[TMP5]] to i8
+; CHECK-NEXT: [[TMP15:%.*]] = zext i8 [[TMP14]] to i32
+; CHECK-NEXT: ret i32 [[TMP15]]
+;
+ %2 = lshr i16 %0, 1
+ %3 = or i16 %2, %0
+ %4 = lshr i16 %3, 2
+ %5 = or i16 %4, %3
+ %6 = lshr i16 %5, 4
+ %7 = or i16 %6, %5
+ %8 = lshr i16 %7, 8
+ %9 = or i16 %8, %7
+ %10 = mul i16 %9, 3885
+ %11 = lshr i16 %10, 12
+ %12 = zext nneg i16 %11 to i64
+ %13 = getelementptr inbounds nuw i8, ptr @log2_16.table, i64 %12
+ %14 = load i8, ptr %13, align 1
+ %15 = zext i8 %14 to i32
+ ret i32 %15
+}
+
+
+ at log2_128.table = internal unnamed_addr constant [128 x i8] c"\00\0D\01\0E\13&\02\0F\1A\14A!'H\03\10\1E\1B9\15.B[\225(c<Ij\04}\11\18\1F\1C,3:\161/NCPU\\#E6g)Rdp=WJs^kw\05~\0C\12%\19@ G\1D8-Z4b;i|\17+20MOTDfQoVr]v\0B$?F7Yah{*LSenqu\0A>X`zKmt\09_yl\08x\07\06\7F", align 1
+
+define i32 @log2_128(i128 noundef %0) {
+; CHECK-LABEL: @log2_128(
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i128 [[TMP0:%.*]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = or i128 [[TMP2]], [[TMP0]]
+; CHECK-NEXT: [[TMP4:%.*]] = lshr i128 [[TMP3]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = or i128 [[TMP4]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = lshr i128 [[TMP5]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = or i128 [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = lshr i128 [[TMP7]], 8
+; CHECK-NEXT: [[TMP9:%.*]] = or i128 [[TMP8]], [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = lshr i128 [[TMP9]], 16
+; CHECK-NEXT: [[TMP11:%.*]] = or i128 [[TMP10]], [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = lshr i128 [[TMP11]], 32
+; CHECK-NEXT: [[TMP13:%.*]] = or i128 [[TMP12]], [[TMP11]]
+; CHECK-NEXT: [[TMP14:%.*]] = lshr i128 [[TMP13]], 64
+; CHECK-NEXT: [[TMP15:%.*]] = or i128 [[TMP14]], [[TMP13]]
+; CHECK-NEXT: [[TMP16:%.*]] = mul i128 [[TMP15]], 2638024179347461332462726661865453437
+; CHECK-NEXT: [[TMP17:%.*]] = lshr i128 [[TMP16]], 121
+; CHECK-NEXT: [[TMP18:%.*]] = trunc nuw nsw i128 [[TMP17]] to i64
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds nuw i8, ptr @log2_128.table, i64 [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = load i8, ptr [[TMP19]], align 1
+; CHECK-NEXT: [[TMP21:%.*]] = zext i8 [[TMP20]] to i32
+; CHECK-NEXT: ret i32 [[TMP21]]
+;
+ %2 = lshr i128 %0, 1
+ %3 = or i128 %2, %0
+ %4 = lshr i128 %3, 2
+ %5 = or i128 %4, %3
+ %6 = lshr i128 %5, 4
+ %7 = or i128 %6, %5
+ %8 = lshr i128 %7, 8
+ %9 = or i128 %8, %7
+ %10 = lshr i128 %9, 16
+ %11 = or i128 %10, %9
+ %12 = lshr i128 %11, 32
+ %13 = or i128 %12, %11
+ %14 = lshr i128 %13, 64
+ %15 = or i128 %14, %13
+ %16 = mul i128 %15, 2638024179347461332462726661865453437
+ %17 = lshr i128 %16, 121
+ %18 = trunc nuw nsw i128 %17 to i64
+ %19 = getelementptr inbounds nuw i8, ptr @log2_128.table, i64 %18
+ %20 = load i8, ptr %19, align 1
+ %21 = zext i8 %20 to i32
+ ret i32 %21
+}
+
+!0 = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF0]] = !{!"function_entry_count", i64 1000}
+; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 1048575}
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-negative.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-negative.ll
new file mode 100644
index 0000000000000..4968b01eceee1
--- /dev/null
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/lower-table-based-log2-negative.ll
@@ -0,0 +1,264 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=aggressive-instcombine -mtriple=x86_64 -mattr=+lzcnt -S < %s | FileCheck %s
+
+ at log2.table = internal unnamed_addr constant [32 x i8] c"\05\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+;; This is a negative test with a wrong table constant.
+
+define i32 @log2_32(i32 %v) {
+; CHECK-LABEL: @log2_32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT: [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT: [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT: [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT: [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT: [[SHR7:%.*]] = lshr i32 [[OR6]], 16
+; CHECK-NEXT: [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT: [[SHR9:%.*]] = lshr i32 [[MUL]], 27
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP4]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 16
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329821
+ %shr9 = lshr i32 %mul, 27
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr inbounds [32 x i8], ptr @log2.table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+ at log2_2.table = internal unnamed_addr constant [32 x i8] c"\00\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+;; These are some negative tests with wrong instruction sequences.
+
+define i32 @log2_32_2(i32 %v) {
+; CHECK-LABEL: @log2_32_2(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT: [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT: [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT: [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT: [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT: [[SHR7:%.*]] = lshr i32 [[OR6]], 15
+; CHECK-NEXT: [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT: [[SHR9:%.*]] = lshr i32 [[MUL]], 27
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 15
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329821
+ %shr9 = lshr i32 %mul, 27
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+define i32 @log2_32_3(i32 %v) {
+; CHECK-LABEL: @log2_32_3(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT: [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT: [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT: [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT: [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT: [[SHR7:%.*]] = lshr i32 [[OR6]], 16
+; CHECK-NEXT: [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[OR8]], 130329822
+; CHECK-NEXT: [[SHR9:%.*]] = lshr i32 [[MUL]], 27
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 16
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329822
+ %shr9 = lshr i32 %mul, 27
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+define i32 @log2_32_4(i32 %v) {
+; CHECK-LABEL: @log2_32_4(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT: [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT: [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT: [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT: [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT: [[SHR7:%.*]] = lshr i32 [[OR6]], 15
+; CHECK-NEXT: [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT: [[SHR9:%.*]] = lshr i32 [[MUL]], 26
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 15
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329821
+ %shr9 = lshr i32 %mul, 26
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr inbounds [32 x i8], ptr @log2_2.table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+;; A test with an external global variable representing the table.
+
+ at table = external global [32 x i8], align 1
+define i32 @log2_32_5(i32 %v) {
+; CHECK-LABEL: @log2_32_5(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT: [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT: [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT: [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT: [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT: [[SHR7:%.*]] = lshr i32 [[OR6]], 15
+; CHECK-NEXT: [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT: [[SHR9:%.*]] = lshr i32 [[MUL]], 26
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 15
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329821
+ %shr9 = lshr i32 %mul, 26
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr inbounds [32 x i8], ptr @table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
+
+;; Only constant tables should be recognized as log2 tables.
+ at log2_3.table = global [32 x i8] c"\00\09\01\0A\0D\15\02\1D\0B\0E\10\12\16\19\03\1E\08\0C\14\1C\0F\11\18\07\13\1B\17\06\1A\05\04\1F", align 1
+
+;; These are some negative tests with wrong instruction sequences.
+
+define i32 @log2_32_6(i32 %v) {
+; CHECK-LABEL: @log2_32_6(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[V:%.*]], 1
+; CHECK-NEXT: [[OR:%.*]] = or i32 [[SHR]], [[V]]
+; CHECK-NEXT: [[SHR1:%.*]] = lshr i32 [[OR]], 2
+; CHECK-NEXT: [[OR2:%.*]] = or i32 [[SHR1]], [[OR]]
+; CHECK-NEXT: [[SHR3:%.*]] = lshr i32 [[OR2]], 4
+; CHECK-NEXT: [[OR4:%.*]] = or i32 [[SHR3]], [[OR2]]
+; CHECK-NEXT: [[SHR5:%.*]] = lshr i32 [[OR4]], 8
+; CHECK-NEXT: [[OR6:%.*]] = or i32 [[SHR5]], [[OR4]]
+; CHECK-NEXT: [[SHR7:%.*]] = lshr i32 [[OR6]], 15
+; CHECK-NEXT: [[OR8:%.*]] = or i32 [[SHR7]], [[OR6]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[OR8]], 130329821
+; CHECK-NEXT: [[SHR9:%.*]] = lshr i32 [[MUL]], 27
+; CHECK-NEXT: [[IDXPROM:%.*]] = zext nneg i32 [[SHR9]] to i64
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], ptr @log2_3.table, i64 0, i64 [[IDXPROM]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32
+; CHECK-NEXT: ret i32 [[CONV]]
+;
+entry:
+ %shr = lshr i32 %v, 1
+ %or = or i32 %shr, %v
+ %shr1 = lshr i32 %or, 2
+ %or2 = or i32 %shr1, %or
+ %shr3 = lshr i32 %or2, 4
+ %or4 = or i32 %shr3, %or2
+ %shr5 = lshr i32 %or4, 8
+ %or6 = or i32 %shr5, %or4
+ %shr7 = lshr i32 %or6, 15
+ %or8 = or i32 %shr7, %or6
+ %mul = mul i32 %or8, 130329821
+ %shr9 = lshr i32 %mul, 27
+ %idxprom = zext nneg i32 %shr9 to i64
+ %arrayidx = getelementptr inbounds [32 x i8], ptr @log2_3.table, i64 0, i64 %idxprom
+ %0 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %0 to i32
+ ret i32 %conv
+}
diff --git a/llvm/test/Transforms/Attributor/nofpclass-frem.ll b/llvm/test/Transforms/Attributor/nofpclass-frem.ll
index 42d9ac7da1cc9..aab6113eab5c9 100644
--- a/llvm/test/Transforms/Attributor/nofpclass-frem.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass-frem.ll
@@ -2,7 +2,7 @@
; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -S < %s | FileCheck %s --check-prefixes=CHECK,TUNIT
define float @ret_frem_ieee(float %arg0, float %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_ieee
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_ieee
; CHECK-SAME: (float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -12,7 +12,7 @@ define float @ret_frem_ieee(float %arg0, float %arg1) #0 {
}
define float @ret_frem_daz(float %arg0, float %arg1) #1 {
-; CHECK-LABEL: define float @ret_frem_daz
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_daz
; CHECK-SAME: (float [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -22,7 +22,7 @@ define float @ret_frem_daz(float %arg0, float %arg1) #1 {
}
define float @ret_frem_ieee_noinf_nozero__all(float nofpclass(inf zero) %arg0, float %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_ieee_noinf_nozero__all
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_ieee_noinf_nozero__all
; CHECK-SAME: (float nofpclass(inf zero) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -32,7 +32,7 @@ define float @ret_frem_ieee_noinf_nozero__all(float nofpclass(inf zero) %arg0, f
}
define float @ret_frem_ieee_nonan_noinf_nozero__all(float nofpclass(nan inf zero) %arg0, float %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_ieee_nonan_noinf_nozero__all
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_ieee_nonan_noinf_nozero__all
; CHECK-SAME: (float nofpclass(nan inf zero) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -42,7 +42,7 @@ define float @ret_frem_ieee_nonan_noinf_nozero__all(float nofpclass(nan inf zero
}
define float @ret_frem_ieee_all__nonan_noinf_nozero(float %arg0, float nofpclass(nan inf zero) %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_ieee_all__nonan_noinf_nozero
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_ieee_all__nonan_noinf_nozero
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan inf zero) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -52,7 +52,7 @@ define float @ret_frem_ieee_all__nonan_noinf_nozero(float %arg0, float nofpclass
}
define float @ret_frem_ieee_nonan_noinf_nozero__nonan(float nofpclass(nan inf zero) %arg0, float nofpclass(nan) %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_ieee_nonan_noinf_nozero__nonan
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_ieee_nonan_noinf_nozero__nonan
; CHECK-SAME: (float nofpclass(nan inf zero) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -62,7 +62,7 @@ define float @ret_frem_ieee_nonan_noinf_nozero__nonan(float nofpclass(nan inf ze
}
define float @ret_frem_ieee_nonan__nonan_noinf_nozero(float nofpclass(nan) %arg0, float nofpclass(nan inf zero) %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_ieee_nonan__nonan_noinf_nozero
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_ieee_nonan__nonan_noinf_nozero
; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(nan inf zero) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -72,7 +72,7 @@ define float @ret_frem_ieee_nonan__nonan_noinf_nozero(float nofpclass(nan) %arg0
}
define float @ret_frem_ieee_nonan_nozero__nonan_noinf(float nofpclass(nan zero) %arg0, float nofpclass(nan inf) %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_ieee_nonan_nozero__nonan_noinf
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_ieee_nonan_nozero__nonan_noinf
; CHECK-SAME: (float nofpclass(nan zero) [[ARG0:%.*]], float nofpclass(nan inf) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -82,7 +82,7 @@ define float @ret_frem_ieee_nonan_nozero__nonan_noinf(float nofpclass(nan zero)
}
define float @ret_frem_ieee_nonan_noinf__nonan_nozero(float nofpclass(nan inf) %arg0, float nofpclass(nan zero) %arg1) #0 {
-; CHECK-LABEL: define nofpclass(nan) float @ret_frem_ieee_nonan_noinf__nonan_nozero
+; CHECK-LABEL: define nofpclass(nan inf) float @ret_frem_ieee_nonan_noinf__nonan_nozero
; CHECK-SAME: (float nofpclass(nan inf) [[ARG0:%.*]], float nofpclass(nan zero) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -92,7 +92,7 @@ define float @ret_frem_ieee_nonan_noinf__nonan_nozero(float nofpclass(nan inf) %
}
define float @ret_frem_daz_nonan_nozero__nonan_noinf(float nofpclass(nan zero) %arg0, float nofpclass(nan inf) %arg1) #1 {
-; CHECK-LABEL: define float @ret_frem_daz_nonan_nozero__nonan_noinf
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_daz_nonan_nozero__nonan_noinf
; CHECK-SAME: (float nofpclass(nan zero) [[ARG0:%.*]], float nofpclass(nan inf) [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -102,7 +102,7 @@ define float @ret_frem_daz_nonan_nozero__nonan_noinf(float nofpclass(nan zero) %
}
define float @ret_frem_daz_nonan_noinf__nonan_nozero(float nofpclass(nan inf) %arg0, float nofpclass(nan zero) %arg1) #1 {
-; CHECK-LABEL: define float @ret_frem_daz_nonan_noinf__nonan_nozero
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_daz_nonan_noinf__nonan_nozero
; CHECK-SAME: (float nofpclass(nan inf) [[ARG0:%.*]], float nofpclass(nan zero) [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -112,7 +112,7 @@ define float @ret_frem_daz_nonan_noinf__nonan_nozero(float nofpclass(nan inf) %a
}
define float @ret_frem_daz_nonan_nozero_nosub__nonan_noinf(float nofpclass(nan zero sub) %arg0, float nofpclass(nan inf) %arg1) #1 {
-; CHECK-LABEL: define float @ret_frem_daz_nonan_nozero_nosub__nonan_noinf
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_daz_nonan_nozero_nosub__nonan_noinf
; CHECK-SAME: (float nofpclass(nan zero sub) [[ARG0:%.*]], float nofpclass(nan inf) [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -122,7 +122,7 @@ define float @ret_frem_daz_nonan_nozero_nosub__nonan_noinf(float nofpclass(nan z
}
define float @ret_frem_daz_nonan_noinf__nonan_nozero_nosub(float nofpclass(nan inf) %arg0, float nofpclass(nan zero sub) %arg1) #1 {
-; CHECK-LABEL: define nofpclass(nan) float @ret_frem_daz_nonan_noinf__nonan_nozero_nosub
+; CHECK-LABEL: define nofpclass(nan inf) float @ret_frem_daz_nonan_noinf__nonan_nozero_nosub
; CHECK-SAME: (float nofpclass(nan inf) [[ARG0:%.*]], float nofpclass(nan zero sub) [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -132,7 +132,7 @@ define float @ret_frem_daz_nonan_noinf__nonan_nozero_nosub(float nofpclass(nan i
}
define float @ret_frem_ieee_nonan_noinf__nonan_noinf(float nofpclass(nan) %arg0, float nofpclass(nan) %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_ieee_nonan_noinf__nonan_noinf
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_ieee_nonan_noinf__nonan_noinf
; CHECK-SAME: (float nofpclass(nan) [[ARG0:%.*]], float nofpclass(nan) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -142,7 +142,7 @@ define float @ret_frem_ieee_nonan_noinf__nonan_noinf(float nofpclass(nan) %arg0,
}
define float @ret_frem_ieee_nonan_noinf_nozero__nonan_noinf_nozero(float nofpclass(nan inf zero) %arg0, float nofpclass(nan inf zero) %arg1) #0 {
-; CHECK-LABEL: define nofpclass(nan) float @ret_frem_ieee_nonan_noinf_nozero__nonan_noinf_nozero
+; CHECK-LABEL: define nofpclass(nan inf) float @ret_frem_ieee_nonan_noinf_nozero__nonan_noinf_nozero
; CHECK-SAME: (float nofpclass(nan inf zero) [[ARG0:%.*]], float nofpclass(nan inf zero) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -152,7 +152,7 @@ define float @ret_frem_ieee_nonan_noinf_nozero__nonan_noinf_nozero(float nofpcla
}
define float @ret_frem_daz_nonan_noinf_nozero__nonan_noinf_nozero(float nofpclass(nan inf zero) %arg0, float nofpclass(nan inf zero) %arg1) #1 {
-; CHECK-LABEL: define float @ret_frem_daz_nonan_noinf_nozero__nonan_noinf_nozero
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_daz_nonan_noinf_nozero__nonan_noinf_nozero
; CHECK-SAME: (float nofpclass(nan inf zero) [[ARG0:%.*]], float nofpclass(nan inf zero) [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -162,7 +162,7 @@ define float @ret_frem_daz_nonan_noinf_nozero__nonan_noinf_nozero(float nofpclas
}
define float @ret_frem_dapz_nonan_noinf_nozero__nonan_noinf_nozero(float nofpclass(nan inf zero) %arg0, float nofpclass(nan inf zero) %arg1) #2 {
-; CHECK-LABEL: define float @ret_frem_dapz_nonan_noinf_nozero__nonan_noinf_nozero
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_dapz_nonan_noinf_nozero__nonan_noinf_nozero
; CHECK-SAME: (float nofpclass(nan inf zero) [[ARG0:%.*]], float nofpclass(nan inf zero) [[ARG1:%.*]]) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -172,7 +172,7 @@ define float @ret_frem_dapz_nonan_noinf_nozero__nonan_noinf_nozero(float nofpcla
}
define float @ret_frem_dynamic_nonan_noinf_nozero__nonan_noinf_nozero(float nofpclass(nan inf zero) %arg0, float nofpclass(nan inf zero) %arg1) #3 {
-; CHECK-LABEL: define float @ret_frem_dynamic_nonan_noinf_nozero__nonan_noinf_nozero
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_dynamic_nonan_noinf_nozero__nonan_noinf_nozero
; CHECK-SAME: (float nofpclass(nan inf zero) [[ARG0:%.*]], float nofpclass(nan inf zero) [[ARG1:%.*]]) #[[ATTR3:[0-9]+]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -182,7 +182,7 @@ define float @ret_frem_dynamic_nonan_noinf_nozero__nonan_noinf_nozero(float nofp
}
define float @ret_frem_ieee_nonan_noinf_noinf__nonan_noinf_noinf(float nofpclass(nan inf) %arg0, float nofpclass(nan inf) %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_ieee_nonan_noinf_noinf__nonan_noinf_noinf
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_ieee_nonan_noinf_noinf__nonan_noinf_noinf
; CHECK-SAME: (float nofpclass(nan inf) [[ARG0:%.*]], float nofpclass(nan inf) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -192,7 +192,7 @@ define float @ret_frem_ieee_nonan_noinf_noinf__nonan_noinf_noinf(float nofpclass
}
define float @ret_frem_ieee_nonan_noinf_nozero__nonan_noinf_noinf(float nofpclass(nan inf zero) %arg0, float nofpclass(nan inf) %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_ieee_nonan_noinf_nozero__nonan_noinf_noinf
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_ieee_nonan_noinf_nozero__nonan_noinf_noinf
; CHECK-SAME: (float nofpclass(nan inf zero) [[ARG0:%.*]], float nofpclass(nan inf) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -202,7 +202,7 @@ define float @ret_frem_ieee_nonan_noinf_nozero__nonan_noinf_noinf(float nofpclas
}
define float @ret_frem_ieee_nonan_noinf_noinf__nonan_noinf_nozero(float nofpclass(nan inf) %arg0, float nofpclass(nan inf zero) %arg1) #0 {
-; CHECK-LABEL: define nofpclass(nan) float @ret_frem_ieee_nonan_noinf_noinf__nonan_noinf_nozero
+; CHECK-LABEL: define nofpclass(nan inf) float @ret_frem_ieee_nonan_noinf_noinf__nonan_noinf_nozero
; CHECK-SAME: (float nofpclass(nan inf) [[ARG0:%.*]], float nofpclass(nan inf zero) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -212,7 +212,7 @@ define float @ret_frem_ieee_nonan_noinf_noinf__nonan_noinf_nozero(float nofpclas
}
define float @ret_frem_ieee_nonan_noinf_noinf__nonan_noinf_noposzero(float nofpclass(nan inf) %arg0, float nofpclass(nan pzero) %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_ieee_nonan_noinf_noinf__nonan_noinf_noposzero
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_ieee_nonan_noinf_noinf__nonan_noinf_noposzero
; CHECK-SAME: (float nofpclass(nan inf) [[ARG0:%.*]], float nofpclass(nan pzero) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -222,7 +222,7 @@ define float @ret_frem_ieee_nonan_noinf_noinf__nonan_noinf_noposzero(float nofpc
}
define float @ret_frem_ieee_nonan_noinf_noposzero__nonan_noinf_noinf(float nofpclass(nan pzero) %arg0, float nofpclass(nan inf) %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_ieee_nonan_noinf_noposzero__nonan_noinf_noinf
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_ieee_nonan_noinf_noposzero__nonan_noinf_noinf
; CHECK-SAME: (float nofpclass(nan pzero) [[ARG0:%.*]], float nofpclass(nan inf) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -232,7 +232,7 @@ define float @ret_frem_ieee_nonan_noinf_noposzero__nonan_noinf_noinf(float nofpc
}
define float @ret_frem_ieee_nonan_noinf_nozero_nosub__nonan_noinf_nozero_nosub(float nofpclass(nan inf zero sub) %arg0, float nofpclass(nan inf zero sub) %arg1) #0 {
-; CHECK-LABEL: define nofpclass(nan) float @ret_frem_ieee_nonan_noinf_nozero_nosub__nonan_noinf_nozero_nosub
+; CHECK-LABEL: define nofpclass(nan inf) float @ret_frem_ieee_nonan_noinf_nozero_nosub__nonan_noinf_nozero_nosub
; CHECK-SAME: (float nofpclass(nan inf zero sub) [[ARG0:%.*]], float nofpclass(nan inf zero sub) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -243,7 +243,7 @@ define float @ret_frem_ieee_nonan_noinf_nozero_nosub__nonan_noinf_nozero_nosub(f
; Denormal mode doesn't matter because sources are nofpclass(sub)
define float @ret_frem_daz_nonan_noinf_nozero_nosub__nonan_noinf_nozero_nosub(float nofpclass(nan inf zero sub) %arg0, float nofpclass(nan inf zero sub) %arg1) #1 {
-; CHECK-LABEL: define nofpclass(nan) float @ret_frem_daz_nonan_noinf_nozero_nosub__nonan_noinf_nozero_nosub
+; CHECK-LABEL: define nofpclass(nan inf) float @ret_frem_daz_nonan_noinf_nozero_nosub__nonan_noinf_nozero_nosub
; CHECK-SAME: (float nofpclass(nan inf zero sub) [[ARG0:%.*]], float nofpclass(nan inf zero sub) [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -253,7 +253,7 @@ define float @ret_frem_daz_nonan_noinf_nozero_nosub__nonan_noinf_nozero_nosub(fl
}
define float @ret_frem_dapz_nonan_noinf_nozero_nosub__nonan_noinf_nozero_nosub(float nofpclass(nan inf zero sub) %arg0, float nofpclass(nan inf zero sub) %arg1) #2 {
-; CHECK-LABEL: define nofpclass(nan) float @ret_frem_dapz_nonan_noinf_nozero_nosub__nonan_noinf_nozero_nosub
+; CHECK-LABEL: define nofpclass(nan inf) float @ret_frem_dapz_nonan_noinf_nozero_nosub__nonan_noinf_nozero_nosub
; CHECK-SAME: (float nofpclass(nan inf zero sub) [[ARG0:%.*]], float nofpclass(nan inf zero sub) [[ARG1:%.*]]) #[[ATTR2]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -263,7 +263,7 @@ define float @ret_frem_dapz_nonan_noinf_nozero_nosub__nonan_noinf_nozero_nosub(f
}
define float @ret_frem_dynamic_nonan_noinf_nozero_nosub__nonan_noinf_nozero_nosub(float nofpclass(nan inf zero sub) %arg0, float nofpclass(nan inf zero sub) %arg1) #3 {
-; CHECK-LABEL: define nofpclass(nan) float @ret_frem_dynamic_nonan_noinf_nozero_nosub__nonan_noinf_nozero_nosub
+; CHECK-LABEL: define nofpclass(nan inf) float @ret_frem_dynamic_nonan_noinf_nozero_nosub__nonan_noinf_nozero_nosub
; CHECK-SAME: (float nofpclass(nan inf zero sub) [[ARG0:%.*]], float nofpclass(nan inf zero sub) [[ARG1:%.*]]) #[[ATTR3]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -274,7 +274,7 @@ define float @ret_frem_dynamic_nonan_noinf_nozero_nosub__nonan_noinf_nozero_nosu
; Missing no-subnormal on lhs
define float @ret_frem_daz_nonan_noinf_nozero__nonan_noinf_nozero_nosub(float nofpclass(nan inf zero) %arg0, float nofpclass(nan inf zero sub) %arg1) #1 {
-; CHECK-LABEL: define nofpclass(nan) float @ret_frem_daz_nonan_noinf_nozero__nonan_noinf_nozero_nosub
+; CHECK-LABEL: define nofpclass(nan inf) float @ret_frem_daz_nonan_noinf_nozero__nonan_noinf_nozero_nosub
; CHECK-SAME: (float nofpclass(nan inf zero) [[ARG0:%.*]], float nofpclass(nan inf zero sub) [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -285,7 +285,7 @@ define float @ret_frem_daz_nonan_noinf_nozero__nonan_noinf_nozero_nosub(float no
; Missing no-subnormal on lhs
define float @ret_frem_daz_nonan_noinf_nozero_nosub__nonan_noinf_nozero(float nofpclass(nan inf zero sub) %arg0, float nofpclass(nan inf zero) %arg1) #1 {
-; CHECK-LABEL: define float @ret_frem_daz_nonan_noinf_nozero_nosub__nonan_noinf_nozero
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_daz_nonan_noinf_nozero_nosub__nonan_noinf_nozero
; CHECK-SAME: (float nofpclass(nan inf zero sub) [[ARG0:%.*]], float nofpclass(nan inf zero) [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -296,7 +296,7 @@ define float @ret_frem_daz_nonan_noinf_nozero_nosub__nonan_noinf_nozero(float no
; should be able to infer noinf
define float @ret_frem_ieee_noinf__nozero(float nofpclass(inf) %arg0, float nofpclass(zero) %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_ieee_noinf__nozero
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_ieee_noinf__nozero
; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]], float nofpclass(zero) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -306,7 +306,7 @@ define float @ret_frem_ieee_noinf__nozero(float nofpclass(inf) %arg0, float nofp
}
define float @ret_frem_ieee_all__nozero(float %arg0, float nofpclass(zero) %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_ieee_all__nozero
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_ieee_all__nozero
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(zero) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -316,7 +316,7 @@ define float @ret_frem_ieee_all__nozero(float %arg0, float nofpclass(zero) %arg1
}
define float @ret_frem_ieee_noinf__all(float nofpclass(inf) %arg0, float %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_ieee_noinf__all
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_ieee_noinf__all
; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -326,7 +326,7 @@ define float @ret_frem_ieee_noinf__all(float nofpclass(inf) %arg0, float %arg1)
}
define float @ret_frem_ieee_nozero__noinf(float nofpclass(zero) %arg0, float nofpclass(inf) %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_ieee_nozero__noinf
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_ieee_nozero__noinf
; CHECK-SAME: (float nofpclass(zero) [[ARG0:%.*]], float nofpclass(inf) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -336,7 +336,7 @@ define float @ret_frem_ieee_nozero__noinf(float nofpclass(zero) %arg0, float nof
}
define float @ret_frem_daz_noinf__nozero(float nofpclass(inf) %arg0, float nofpclass(zero) %arg1) #1 {
-; CHECK-LABEL: define float @ret_frem_daz_noinf__nozero
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_daz_noinf__nozero
; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]], float nofpclass(zero) [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -346,7 +346,7 @@ define float @ret_frem_daz_noinf__nozero(float nofpclass(inf) %arg0, float nofpc
}
define float @ret_frem_daz_noinf__nozero_nosub(float nofpclass(inf) %arg0, float nofpclass(zero sub) %arg1) #1 {
-; CHECK-LABEL: define float @ret_frem_daz_noinf__nozero_nosub
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_daz_noinf__nozero_nosub
; CHECK-SAME: (float nofpclass(inf) [[ARG0:%.*]], float nofpclass(zero sub) [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -366,7 +366,7 @@ define float @ret_frem_same_operands(float noundef %arg) #0 {
}
define float @ret_frem_same_operands_maybe_undef(float %arg) #0 {
-; CHECK-LABEL: define float @ret_frem_same_operands_maybe_undef
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_same_operands_maybe_undef
; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG]], [[ARG]]
; CHECK-NEXT: ret float [[FREM]]
@@ -479,7 +479,7 @@ define float @ret_frem_same_operands_nonan_noinf_nozero_nosub__dynamic(float nou
}
define float @ret_frem_no_neg_lhs(float nofpclass(ninf nsub nnorm) %arg0, float %arg1) #0 {
-; CHECK-LABEL: define nofpclass(ninf nsub nnorm) float @ret_frem_no_neg_lhs
+; CHECK-LABEL: define nofpclass(inf nsub nnorm) float @ret_frem_no_neg_lhs
; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -489,7 +489,7 @@ define float @ret_frem_no_neg_lhs(float nofpclass(ninf nsub nnorm) %arg0, float
}
define float @ret_frem_no_neg_rhs(float %arg0, float nofpclass(ninf nsub nnorm) %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_no_neg_rhs
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_no_neg_rhs
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -499,7 +499,7 @@ define float @ret_frem_no_neg_rhs(float %arg0, float nofpclass(ninf nsub nnorm)
}
define float @ret_frem_no_neg_nzero_rhs(float %arg0, float nofpclass(ninf nsub nnorm nzero) %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_no_neg_nzero_rhs
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_no_neg_nzero_rhs
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -509,7 +509,7 @@ define float @ret_frem_no_neg_nzero_rhs(float %arg0, float nofpclass(ninf nsub n
}
define float @ret_frem_no_neg(float nofpclass(ninf nsub nnorm) %arg0, float nofpclass(ninf nsub nnorm) %arg1) #0 {
-; CHECK-LABEL: define nofpclass(ninf nsub nnorm) float @ret_frem_no_neg
+; CHECK-LABEL: define nofpclass(inf nsub nnorm) float @ret_frem_no_neg
; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nsub nnorm) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -519,7 +519,7 @@ define float @ret_frem_no_neg(float nofpclass(ninf nsub nnorm) %arg0, float nofp
}
define float @ret_frem_no_neg_nzero(float nofpclass(ninf nsub nnorm nzero) %arg0, float nofpclass(ninf nsub nnorm nzero) %arg1) #0 {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_frem_no_neg_nzero
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_frem_no_neg_nzero
; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -529,7 +529,7 @@ define float @ret_frem_no_neg_nzero(float nofpclass(ninf nsub nnorm nzero) %arg0
}
define float @ret_frem_no_neg_rhs_no_nzero(float nofpclass(ninf nsub nnorm) %arg0, float nofpclass(ninf nsub nnorm nzero) %arg1) #0 {
-; CHECK-LABEL: define nofpclass(ninf nsub nnorm) float @ret_frem_no_neg_rhs_no_nzero
+; CHECK-LABEL: define nofpclass(inf nsub nnorm) float @ret_frem_no_neg_rhs_no_nzero
; CHECK-SAME: (float nofpclass(ninf nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf nzero nsub nnorm) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -539,7 +539,7 @@ define float @ret_frem_no_neg_rhs_no_nzero(float nofpclass(ninf nsub nnorm) %arg
}
define float @ret_frem_no_neg_no_zero_rhs(float nofpclass(ninf nsub nnorm nzero) %arg0, float nofpclass(ninf nsub nnorm zero) %arg1) #0 {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_frem_no_neg_no_zero_rhs
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_frem_no_neg_no_zero_rhs
; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ARG0:%.*]], float nofpclass(ninf zero nsub nnorm) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -549,7 +549,7 @@ define float @ret_frem_no_neg_no_zero_rhs(float nofpclass(ninf nsub nnorm nzero)
}
define float @ret_frem_no_pos_lhs(float nofpclass(pinf psub pnorm pzero) %arg0, float %arg1) #0 {
-; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_frem_no_pos_lhs
+; CHECK-LABEL: define nofpclass(inf pzero psub pnorm) float @ret_frem_no_pos_lhs
; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -559,7 +559,7 @@ define float @ret_frem_no_pos_lhs(float nofpclass(pinf psub pnorm pzero) %arg0,
}
define float @ret_frem_no_pos_rhs(float %arg0, float nofpclass(pinf psub pnorm pzero) %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_no_pos_rhs
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_no_pos_rhs
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(pinf pzero psub pnorm) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -569,7 +569,7 @@ define float @ret_frem_no_pos_rhs(float %arg0, float nofpclass(pinf psub pnorm p
}
define float @ret_frem_no_pos_zero_lhs(float nofpclass(pinf psub pnorm) %arg0, float %arg1) #0 {
-; CHECK-LABEL: define nofpclass(pinf psub pnorm) float @ret_frem_no_pos_zero_lhs
+; CHECK-LABEL: define nofpclass(inf psub pnorm) float @ret_frem_no_pos_zero_lhs
; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -579,7 +579,7 @@ define float @ret_frem_no_pos_zero_lhs(float nofpclass(pinf psub pnorm) %arg0, f
}
define float @ret_frem_no_pos_zero_rhs(float %arg0, float nofpclass(pinf psub pnorm) %arg1) #0 {
-; CHECK-LABEL: define float @ret_frem_no_pos_zero_rhs
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_no_pos_zero_rhs
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -589,7 +589,7 @@ define float @ret_frem_no_pos_zero_rhs(float %arg0, float nofpclass(pinf psub pn
}
define float @ret_frem_no_pos(float nofpclass(pinf psub pnorm) %arg0, float nofpclass(pinf psub pnorm) %arg1) #0 {
-; CHECK-LABEL: define nofpclass(pinf psub pnorm) float @ret_frem_no_pos
+; CHECK-LABEL: define nofpclass(inf psub pnorm) float @ret_frem_no_pos
; CHECK-SAME: (float nofpclass(pinf psub pnorm) [[ARG0:%.*]], float nofpclass(pinf psub pnorm) [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -609,7 +609,7 @@ define float @ret_frem_f32_known_zero_or_nan_lhs(float nofpclass(inf norm sub) %
}
define float @ret_frem_f32_known_zero_or_nan_rhs(float %arg0, float nofpclass(inf norm sub) %arg1) {
-; CHECK-LABEL: define float @ret_frem_f32_known_zero_or_nan_rhs
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_f32_known_zero_or_nan_rhs
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(inf sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -629,7 +629,7 @@ define float @ret_frem_f32_known_zero_lhs(float nofpclass(nan inf norm sub) %arg
}
define float @ret_frem_f32_known_zero_rhs(float %arg0, float nofpclass(nan inf norm sub) %arg1) {
-; CHECK-LABEL: define float @ret_frem_f32_known_zero_rhs
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_f32_known_zero_rhs
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan inf sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -649,7 +649,7 @@ define float @ret_frem_f32_known_pzero_or_nan_lhs(float nofpclass(inf norm sub n
}
define float @ret_frem_f32_known_pzero_or_nan_rhs(float %arg0, float nofpclass(inf norm sub nzero) %arg1) {
-; CHECK-LABEL: define float @ret_frem_f32_known_pzero_or_nan_rhs
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_f32_known_pzero_or_nan_rhs
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(inf nzero sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -669,7 +669,7 @@ define float @ret_frem_f32_known_pzero_lhs(float nofpclass(nan inf norm sub nzer
}
define float @ret_frem_f32_known_pzero_rhs(float %arg0, float nofpclass(nan inf norm sub nzero) %arg1) {
-; CHECK-LABEL: define float @ret_frem_f32_known_pzero_rhs
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_f32_known_pzero_rhs
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan inf nzero sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -689,7 +689,7 @@ define float @ret_frem_f32_known_nzero_or_nan_lhs(float nofpclass(inf norm sub p
}
define float @ret_frem_f32_known_nzero_or_nan_rhs(float %arg0, float nofpclass(inf norm sub pzero) %arg1) {
-; CHECK-LABEL: define float @ret_frem_f32_known_nzero_or_nan_rhs
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_f32_known_nzero_or_nan_rhs
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(inf pzero sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -709,7 +709,7 @@ define float @ret_frem_f32_known_nzero_lhs(float nofpclass(nan inf norm sub pzer
}
define float @ret_frem_f32_known_nzero_rhs(float %arg0, float nofpclass(nan inf norm sub pzero) %arg1) {
-; CHECK-LABEL: define float @ret_frem_f32_known_nzero_rhs
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_f32_known_nzero_rhs
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan inf pzero sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -719,7 +719,7 @@ define float @ret_frem_f32_known_nzero_rhs(float %arg0, float nofpclass(nan inf
}
define float @ret_frem_f32_known_inf_or_nan_lhs(float nofpclass(zero norm sub) %arg0, float %arg1) {
-; CHECK-LABEL: define float @ret_frem_f32_known_inf_or_nan_lhs
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_f32_known_inf_or_nan_lhs
; CHECK-SAME: (float nofpclass(zero sub norm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -729,7 +729,7 @@ define float @ret_frem_f32_known_inf_or_nan_lhs(float nofpclass(zero norm sub) %
}
define float @ret_frem_f32_known_inf_or_nan_rhs(float %arg0, float nofpclass(zero norm sub) %arg1) {
-; CHECK-LABEL: define float @ret_frem_f32_known_inf_or_nan_rhs
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_f32_known_inf_or_nan_rhs
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(zero sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -739,7 +739,7 @@ define float @ret_frem_f32_known_inf_or_nan_rhs(float %arg0, float nofpclass(zer
}
define float @ret_frem_f32_known_inf_lhs(float nofpclass(nan zero norm sub) %arg0, float %arg1) {
-; CHECK-LABEL: define float @ret_frem_f32_known_inf_lhs
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_f32_known_inf_lhs
; CHECK-SAME: (float nofpclass(nan zero sub norm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -749,7 +749,7 @@ define float @ret_frem_f32_known_inf_lhs(float nofpclass(nan zero norm sub) %arg
}
define float @ret_frem_f32_known_inf_rhs(float %arg0, float nofpclass(nan zero norm sub) %arg1) {
-; CHECK-LABEL: define float @ret_frem_f32_known_inf_rhs
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_f32_known_inf_rhs
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(nan zero sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -759,7 +759,7 @@ define float @ret_frem_f32_known_inf_rhs(float %arg0, float nofpclass(nan zero n
}
define float @ret_frem_f32_known_pinf_or_nan_lhs(float nofpclass(ninf zero norm sub) %arg0, float %arg1) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_frem_f32_known_pinf_or_nan_lhs
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_frem_f32_known_pinf_or_nan_lhs
; CHECK-SAME: (float nofpclass(ninf zero sub norm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -769,7 +769,7 @@ define float @ret_frem_f32_known_pinf_or_nan_lhs(float nofpclass(ninf zero norm
}
define float @ret_frem_f32_known_pinf_or_nan_rhs(float %arg0, float nofpclass(ninf zero norm sub) %arg1) {
-; CHECK-LABEL: define float @ret_frem_f32_known_pinf_or_nan_rhs
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_f32_known_pinf_or_nan_rhs
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(ninf zero sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -779,7 +779,7 @@ define float @ret_frem_f32_known_pinf_or_nan_rhs(float %arg0, float nofpclass(ni
}
define float @ret_frem_f32_known_ninf_or_nan_lhs(float nofpclass(pinf zero norm sub) %arg0, float %arg1) {
-; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_frem_f32_known_ninf_or_nan_lhs
+; CHECK-LABEL: define nofpclass(inf pzero psub pnorm) float @ret_frem_f32_known_ninf_or_nan_lhs
; CHECK-SAME: (float nofpclass(pinf zero sub norm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -789,7 +789,7 @@ define float @ret_frem_f32_known_ninf_or_nan_lhs(float nofpclass(pinf zero norm
}
define float @ret_frem_f32_known_ninf_or_nan_rhs(float %arg0, float nofpclass(pinf zero norm sub) %arg1) {
-; CHECK-LABEL: define float @ret_frem_f32_known_ninf_or_nan_rhs
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_f32_known_ninf_or_nan_rhs
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(pinf zero sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -800,7 +800,7 @@ define float @ret_frem_f32_known_ninf_or_nan_rhs(float %arg0, float nofpclass(pi
; -> nan
define float @ret_known_inf_frem_known_zero(float nofpclass(nan norm sub zero) %arg0, float nofpclass(nan inf norm sub) %arg1) {
-; CHECK-LABEL: define float @ret_known_inf_frem_known_zero
+; CHECK-LABEL: define nofpclass(inf) float @ret_known_inf_frem_known_zero
; CHECK-SAME: (float nofpclass(nan zero sub norm) [[ARG0:%.*]], float nofpclass(nan inf sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -811,7 +811,7 @@ define float @ret_known_inf_frem_known_zero(float nofpclass(nan norm sub zero) %
; -> nan
define float @ret_known_inf_or_nan_frem_known_zero(float nofpclass(norm sub zero) %arg0, float nofpclass(nan inf norm sub) %arg1) {
-; CHECK-LABEL: define float @ret_known_inf_or_nan_frem_known_zero
+; CHECK-LABEL: define nofpclass(inf) float @ret_known_inf_or_nan_frem_known_zero
; CHECK-SAME: (float nofpclass(zero sub norm) [[ARG0:%.*]], float nofpclass(nan inf sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -822,7 +822,7 @@ define float @ret_known_inf_or_nan_frem_known_zero(float nofpclass(norm sub zero
; -> nan
define float @ret_known_inf_frem_known_zero_or_nan(float nofpclass(nan norm sub zero) %arg0, float nofpclass(inf norm sub) %arg1) {
-; CHECK-LABEL: define float @ret_known_inf_frem_known_zero_or_nan
+; CHECK-LABEL: define nofpclass(inf) float @ret_known_inf_frem_known_zero_or_nan
; CHECK-SAME: (float nofpclass(nan zero sub norm) [[ARG0:%.*]], float nofpclass(inf sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -833,7 +833,7 @@ define float @ret_known_inf_frem_known_zero_or_nan(float nofpclass(nan norm sub
; -> nan
define float @ret_known_ninf_frem_known_zero(float nofpclass(nan pinf norm sub zero) %arg0, float nofpclass(nan inf norm sub) %arg1) {
-; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_known_ninf_frem_known_zero
+; CHECK-LABEL: define nofpclass(inf pzero psub pnorm) float @ret_known_ninf_frem_known_zero
; CHECK-SAME: (float nofpclass(nan pinf zero sub norm) [[ARG0:%.*]], float nofpclass(nan inf sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -844,7 +844,7 @@ define float @ret_known_ninf_frem_known_zero(float nofpclass(nan pinf norm sub z
; -> nan
define float @ret_known_pinf_frem_known_zero(float nofpclass(nan ninf norm sub zero) %arg0, float nofpclass(nan inf norm sub) %arg1) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_known_pinf_frem_known_zero
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_known_pinf_frem_known_zero
; CHECK-SAME: (float nofpclass(nan ninf zero sub norm) [[ARG0:%.*]], float nofpclass(nan inf sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -887,7 +887,7 @@ define float @ret_known_zero_or_nan_frem_known_inf(float nofpclass(inf norm sub)
}
define float @ret_frem_lhs_known_positive_or_nan(float %lhs, float %rhs) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_frem_lhs_known_positive_or_nan
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_frem_lhs_known_positive_or_nan
; CHECK-SAME: (float [[LHS:%.*]], float [[RHS:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[LHS_FABS:%.*]] = call float @llvm.fabs.f32(float [[LHS]]) #[[ATTR6:[0-9]+]]
; CHECK-NEXT: [[MUL:%.*]] = frem float [[LHS_FABS]], [[RHS]]
@@ -899,7 +899,7 @@ define float @ret_frem_lhs_known_positive_or_nan(float %lhs, float %rhs) {
}
define float @ret_frem_rhs_known_positive_or_nan(float %lhs, float %rhs) {
-; CHECK-LABEL: define float @ret_frem_rhs_known_positive_or_nan
+; CHECK-LABEL: define nofpclass(inf) float @ret_frem_rhs_known_positive_or_nan
; CHECK-SAME: (float [[LHS:%.*]], float [[RHS:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[RHS_FABS:%.*]] = call float @llvm.fabs.f32(float [[RHS]]) #[[ATTR6]]
; CHECK-NEXT: [[MUL:%.*]] = frem float [[LHS]], [[RHS_FABS]]
@@ -911,7 +911,7 @@ define float @ret_frem_rhs_known_positive_or_nan(float %lhs, float %rhs) {
}
define float @ret_frem_both_signs_positive_or_nan(float %lhs, float %rhs) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_frem_both_signs_positive_or_nan
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_frem_both_signs_positive_or_nan
; CHECK-SAME: (float [[LHS:%.*]], float [[RHS:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[LHS_FABS:%.*]] = call float @llvm.fabs.f32(float [[LHS]]) #[[ATTR6]]
; CHECK-NEXT: [[RHS_FABS:%.*]] = call float @llvm.fabs.f32(float [[RHS]]) #[[ATTR6]]
@@ -925,7 +925,7 @@ define float @ret_frem_both_signs_positive_or_nan(float %lhs, float %rhs) {
}
define float @ret_frem_both_signs_negative_or_nan(float %lhs, float %rhs) {
-; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_frem_both_signs_negative_or_nan
+; CHECK-LABEL: define nofpclass(inf pzero psub pnorm) float @ret_frem_both_signs_negative_or_nan
; CHECK-SAME: (float [[LHS:%.*]], float [[RHS:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[LHS_FABS:%.*]] = call float @llvm.fabs.f32(float [[LHS]]) #[[ATTR6]]
; CHECK-NEXT: [[RHS_FABS:%.*]] = call float @llvm.fabs.f32(float [[RHS]]) #[[ATTR6]]
@@ -943,7 +943,7 @@ define float @ret_frem_both_signs_negative_or_nan(float %lhs, float %rhs) {
}
define float @ret_frem_lhs_negative_rhs_positive(float %lhs, float %rhs) {
-; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_frem_lhs_negative_rhs_positive
+; CHECK-LABEL: define nofpclass(inf pzero psub pnorm) float @ret_frem_lhs_negative_rhs_positive
; CHECK-SAME: (float [[LHS:%.*]], float [[RHS:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[LHS_FABS:%.*]] = call float @llvm.fabs.f32(float [[LHS]]) #[[ATTR6]]
; CHECK-NEXT: [[RHS_FABS:%.*]] = call float @llvm.fabs.f32(float [[RHS]]) #[[ATTR6]]
@@ -959,7 +959,7 @@ define float @ret_frem_lhs_negative_rhs_positive(float %lhs, float %rhs) {
}
define float @ret_frem_rhs_negative_lhs_positive(float %lhs, float %rhs) {
-; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @ret_frem_rhs_negative_lhs_positive
+; CHECK-LABEL: define nofpclass(inf nzero nsub nnorm) float @ret_frem_rhs_negative_lhs_positive
; CHECK-SAME: (float [[LHS:%.*]], float [[RHS:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[LHS_FABS:%.*]] = call float @llvm.fabs.f32(float [[LHS]]) #[[ATTR6]]
; CHECK-NEXT: [[RHS_FABS:%.*]] = call float @llvm.fabs.f32(float [[RHS]]) #[[ATTR6]]
@@ -975,7 +975,7 @@ define float @ret_frem_rhs_negative_lhs_positive(float %lhs, float %rhs) {
}
define float @ret_known_inf_frem_known_inf(float nofpclass(norm sub zero nan) %arg0, float nofpclass(norm sub zero nan) %arg1) {
-; CHECK-LABEL: define float @ret_known_inf_frem_known_inf
+; CHECK-LABEL: define nofpclass(inf) float @ret_known_inf_frem_known_inf
; CHECK-SAME: (float nofpclass(nan zero sub norm) [[ARG0:%.*]], float nofpclass(nan zero sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -985,7 +985,7 @@ define float @ret_known_inf_frem_known_inf(float nofpclass(norm sub zero nan) %a
}
define float @ret_known_inf_frem_known_inf_or_nan(float nofpclass(norm sub zero nan) %arg0, float nofpclass(norm sub zero) %arg1) {
-; CHECK-LABEL: define float @ret_known_inf_frem_known_inf_or_nan
+; CHECK-LABEL: define nofpclass(inf) float @ret_known_inf_frem_known_inf_or_nan
; CHECK-SAME: (float nofpclass(nan zero sub norm) [[ARG0:%.*]], float nofpclass(zero sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -995,7 +995,7 @@ define float @ret_known_inf_frem_known_inf_or_nan(float nofpclass(norm sub zero
}
define float @ret_known_inf_or_nan_frem_known_inf(float nofpclass(norm sub zero) %arg0, float nofpclass(norm sub zero nan) %arg1) {
-; CHECK-LABEL: define float @ret_known_inf_or_nan_frem_known_inf
+; CHECK-LABEL: define nofpclass(inf) float @ret_known_inf_or_nan_frem_known_inf
; CHECK-SAME: (float nofpclass(zero sub norm) [[ARG0:%.*]], float nofpclass(nan zero sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -1035,7 +1035,7 @@ define float @ret_known_zero_or_nan_frem_known_zero(float nofpclass(inf norm sub
}
define float @ret_known_inf_or_nan_frem_unknown(float nofpclass(norm sub zero) %arg0, float %arg1) {
-; CHECK-LABEL: define float @ret_known_inf_or_nan_frem_unknown
+; CHECK-LABEL: define nofpclass(inf) float @ret_known_inf_or_nan_frem_unknown
; CHECK-SAME: (float nofpclass(zero sub norm) [[ARG0:%.*]], float [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -1045,7 +1045,7 @@ define float @ret_known_inf_or_nan_frem_unknown(float nofpclass(norm sub zero) %
}
define float @ret_unknown_frem_known_inf_or_nan(float %arg0, float nofpclass(norm sub zero) %arg1) {
-; CHECK-LABEL: define float @ret_unknown_frem_known_inf_or_nan
+; CHECK-LABEL: define nofpclass(inf) float @ret_unknown_frem_known_inf_or_nan
; CHECK-SAME: (float [[ARG0:%.*]], float nofpclass(zero sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
@@ -1055,7 +1055,7 @@ define float @ret_unknown_frem_known_inf_or_nan(float %arg0, float nofpclass(nor
}
define float @ret_known_inf_or_nan_frem_known_inf_or_nan(float nofpclass(norm sub zero) %arg0, float nofpclass(norm sub zero) %arg1) {
-; CHECK-LABEL: define float @ret_known_inf_or_nan_frem_known_inf_or_nan
+; CHECK-LABEL: define nofpclass(inf) float @ret_known_inf_or_nan_frem_known_inf_or_nan
; CHECK-SAME: (float nofpclass(zero sub norm) [[ARG0:%.*]], float nofpclass(zero sub norm) [[ARG1:%.*]]) #[[ATTR4]] {
; CHECK-NEXT: [[FREM:%.*]] = frem float [[ARG0]], [[ARG1]]
; CHECK-NEXT: ret float [[FREM]]
diff --git a/llvm/test/Transforms/Attributor/nofpclass.ll b/llvm/test/Transforms/Attributor/nofpclass.ll
index 507d0f3ff98f3..efe0d7a0a0922 100644
--- a/llvm/test/Transforms/Attributor/nofpclass.ll
+++ b/llvm/test/Transforms/Attributor/nofpclass.ll
@@ -3244,7 +3244,7 @@ define float @fadd_double_no_nonsub_nzero__ieee_dynamic(float noundef nofpclass(
define float @fadd_double_known_positive_nonsub_ieee(float noundef nofpclass(ninf nnorm sub zero) %arg) {
; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
-; CHECK-LABEL: define noundef nofpclass(ninf zero nsub nnorm) float @fadd_double_known_positive_nonsub_ieee
+; CHECK-LABEL: define noundef nofpclass(ninf zero sub nnorm) float @fadd_double_known_positive_nonsub_ieee
; CHECK-SAME: (float noundef nofpclass(ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR3]] {
; CHECK-NEXT: [[ADD:%.*]] = fadd float [[ARG]], [[ARG]]
; CHECK-NEXT: ret float [[ADD]]
@@ -3254,7 +3254,7 @@ define float @fadd_double_known_positive_nonsub_ieee(float noundef nofpclass(nin
}
define float @fadd_double_known_positive_nonsub__ieee_daz(float noundef nofpclass(ninf nnorm sub zero) %arg) #2 {
-; CHECK-LABEL: define noundef nofpclass(ninf zero nsub nnorm) float @fadd_double_known_positive_nonsub__ieee_daz
+; CHECK-LABEL: define noundef nofpclass(ninf zero sub nnorm) float @fadd_double_known_positive_nonsub__ieee_daz
; CHECK-SAME: (float noundef nofpclass(ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR11]] {
; CHECK-NEXT: [[ADD:%.*]] = fadd float [[ARG]], [[ARG]]
; CHECK-NEXT: ret float [[ADD]]
@@ -3265,7 +3265,7 @@ define float @fadd_double_known_positive_nonsub__ieee_daz(float noundef nofpclas
define float @fadd_double_known_positive_nonsub__ftz_daz(float noundef nofpclass(ninf nnorm sub zero) %arg) #0 {
; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn denormal_fpenv(preservesign) memory(none)
-; CHECK-LABEL: define noundef nofpclass(ninf zero nsub nnorm) float @fadd_double_known_positive_nonsub__ftz_daz
+; CHECK-LABEL: define noundef nofpclass(ninf zero sub nnorm) float @fadd_double_known_positive_nonsub__ftz_daz
; CHECK-SAME: (float noundef nofpclass(ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR10]] {
; CHECK-NEXT: [[ADD:%.*]] = fadd float [[ARG]], [[ARG]]
; CHECK-NEXT: ret float [[ADD]]
@@ -3275,7 +3275,7 @@ define float @fadd_double_known_positive_nonsub__ftz_daz(float noundef nofpclass
}
define float @fadd_double_known_positive_nonsub__ieee_dynamic(float noundef nofpclass(ninf nnorm sub zero) %arg) #9 {
-; CHECK-LABEL: define noundef nofpclass(ninf zero nsub nnorm) float @fadd_double_known_positive_nonsub__ieee_dynamic
+; CHECK-LABEL: define noundef nofpclass(ninf zero sub nnorm) float @fadd_double_known_positive_nonsub__ieee_dynamic
; CHECK-SAME: (float noundef nofpclass(ninf zero sub nnorm) [[ARG:%.*]]) #[[ATTR17]] {
; CHECK-NEXT: [[ADD:%.*]] = fadd float [[ARG]], [[ARG]]
; CHECK-NEXT: ret float [[ADD]]
@@ -3286,7 +3286,7 @@ define float @fadd_double_known_positive_nonsub__ieee_dynamic(float noundef nofp
define float @fadd_double_known_negative_nonsub_ieee(float noundef nofpclass(pinf pnorm sub zero) %arg) {
; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
-; CHECK-LABEL: define noundef nofpclass(pinf zero psub pnorm) float @fadd_double_known_negative_nonsub_ieee
+; CHECK-LABEL: define noundef nofpclass(pinf zero sub pnorm) float @fadd_double_known_negative_nonsub_ieee
; CHECK-SAME: (float noundef nofpclass(pinf zero sub pnorm) [[ARG:%.*]]) #[[ATTR3]] {
; CHECK-NEXT: [[ADD:%.*]] = fadd float [[ARG]], [[ARG]]
; CHECK-NEXT: ret float [[ADD]]
@@ -3296,7 +3296,7 @@ define float @fadd_double_known_negative_nonsub_ieee(float noundef nofpclass(pin
}
define float @fadd_double_known_negative_nonsub__ieee_daz(float noundef nofpclass(pinf pnorm sub zero) %arg) #2 {
-; CHECK-LABEL: define noundef nofpclass(pinf zero psub pnorm) float @fadd_double_known_negative_nonsub__ieee_daz
+; CHECK-LABEL: define noundef nofpclass(pinf zero sub pnorm) float @fadd_double_known_negative_nonsub__ieee_daz
; CHECK-SAME: (float noundef nofpclass(pinf zero sub pnorm) [[ARG:%.*]]) #[[ATTR11]] {
; CHECK-NEXT: [[ADD:%.*]] = fadd float [[ARG]], [[ARG]]
; CHECK-NEXT: ret float [[ADD]]
@@ -3307,7 +3307,7 @@ define float @fadd_double_known_negative_nonsub__ieee_daz(float noundef nofpclas
define float @fadd_double_known_negative_nonsub__ftz_daz(float noundef nofpclass(pinf pnorm sub zero) %arg) #0 {
; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn denormal_fpenv(preservesign) memory(none)
-; CHECK-LABEL: define noundef nofpclass(pinf zero psub pnorm) float @fadd_double_known_negative_nonsub__ftz_daz
+; CHECK-LABEL: define noundef nofpclass(pinf zero sub pnorm) float @fadd_double_known_negative_nonsub__ftz_daz
; CHECK-SAME: (float noundef nofpclass(pinf zero sub pnorm) [[ARG:%.*]]) #[[ATTR10]] {
; CHECK-NEXT: [[ADD:%.*]] = fadd float [[ARG]], [[ARG]]
; CHECK-NEXT: ret float [[ADD]]
@@ -3317,7 +3317,7 @@ define float @fadd_double_known_negative_nonsub__ftz_daz(float noundef nofpclass
}
define float @fadd_double_known_negative_nonsub_dynamic(float noundef nofpclass(pinf pnorm sub zero) %arg) #9 {
-; CHECK-LABEL: define noundef nofpclass(pinf zero psub pnorm) float @fadd_double_known_negative_nonsub_dynamic
+; CHECK-LABEL: define noundef nofpclass(pinf zero sub pnorm) float @fadd_double_known_negative_nonsub_dynamic
; CHECK-SAME: (float noundef nofpclass(pinf zero sub pnorm) [[ARG:%.*]]) #[[ATTR17]] {
; CHECK-NEXT: [[ADD:%.*]] = fadd float [[ARG]], [[ARG]]
; CHECK-NEXT: ret float [[ADD]]
@@ -3746,7 +3746,7 @@ define float @fadd_double_no_zero_or_sub__output_only_is_ftpz(float noundef nofp
; Know there cannot be underflow, infer nofpclass(zero)
define half @known_positive_or_nan__fadd__known_positive_normal_or_inf(half nofpclass(ninf nnorm nsub nzero) %known.positive.or.nan, half nofpclass(ninf nnorm sub zero) %known.pnorm.or.pinf.or.nan) {
; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
-; CHECK-LABEL: define nofpclass(ninf zero nsub nnorm) half @known_positive_or_nan__fadd__known_positive_normal_or_inf
+; CHECK-LABEL: define nofpclass(ninf zero sub nnorm) half @known_positive_or_nan__fadd__known_positive_normal_or_inf
; CHECK-SAME: (half nofpclass(ninf nzero nsub nnorm) [[KNOWN_POSITIVE_OR_NAN:%.*]], half nofpclass(ninf zero sub nnorm) [[KNOWN_PNORM_OR_PINF_OR_NAN:%.*]]) #[[ATTR3]] {
; CHECK-NEXT: [[ADD:%.*]] = fadd half [[KNOWN_POSITIVE_OR_NAN]], [[KNOWN_PNORM_OR_PINF_OR_NAN]]
; CHECK-NEXT: ret half [[ADD]]
@@ -3758,7 +3758,7 @@ define half @known_positive_or_nan__fadd__known_positive_normal_or_inf(half nofp
; Know there cannot be underflow, infer nofpclass(zero)
define half @known_positive_normal_or_inf__fadd__known_positive_or_nan(half nofpclass(ninf nnorm sub zero) %known.pnorm.or.pinf.or.nan, half nofpclass(ninf nnorm nsub nzero) %known.positive.or.nan) {
; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
-; CHECK-LABEL: define nofpclass(ninf zero nsub nnorm) half @known_positive_normal_or_inf__fadd__known_positive_or_nan
+; CHECK-LABEL: define nofpclass(ninf zero sub nnorm) half @known_positive_normal_or_inf__fadd__known_positive_or_nan
; CHECK-SAME: (half nofpclass(ninf zero sub nnorm) [[KNOWN_PNORM_OR_PINF_OR_NAN:%.*]], half nofpclass(ninf nzero nsub nnorm) [[KNOWN_POSITIVE_OR_NAN:%.*]]) #[[ATTR3]] {
; CHECK-NEXT: [[ADD:%.*]] = fadd half [[KNOWN_PNORM_OR_PINF_OR_NAN]], [[KNOWN_POSITIVE_OR_NAN]]
; CHECK-NEXT: ret half [[ADD]]
@@ -3770,7 +3770,7 @@ define half @known_positive_normal_or_inf__fadd__known_positive_or_nan(half nofp
; Know there cannot be underflow, infer nofpclass(zero)
define half @known_negative_or_nan__fadd__known_negative_normal_or_inf(half nofpclass(pinf pnorm psub pzero) %known.negative.or.nan, half nofpclass(pinf pnorm sub zero) %known.nnorm.or.ninf.or.nan) {
; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
-; CHECK-LABEL: define nofpclass(pinf zero psub pnorm) half @known_negative_or_nan__fadd__known_negative_normal_or_inf
+; CHECK-LABEL: define nofpclass(pinf zero sub pnorm) half @known_negative_or_nan__fadd__known_negative_normal_or_inf
; CHECK-SAME: (half nofpclass(pinf pzero psub pnorm) [[KNOWN_NEGATIVE_OR_NAN:%.*]], half nofpclass(pinf zero sub pnorm) [[KNOWN_NNORM_OR_NINF_OR_NAN:%.*]]) #[[ATTR3]] {
; CHECK-NEXT: [[ADD:%.*]] = fadd half [[KNOWN_NEGATIVE_OR_NAN]], [[KNOWN_NNORM_OR_NINF_OR_NAN]]
; CHECK-NEXT: ret half [[ADD]]
@@ -3782,7 +3782,7 @@ define half @known_negative_or_nan__fadd__known_negative_normal_or_inf(half nofp
; Know there cannot be underflow, infer nofpclass(zero)
define half @known_negative_normal_or_inf__fadd__known_negative_or_nan(half nofpclass(pinf pnorm sub zero) %known.nnorm.or.ninf.or.nan, half nofpclass(pinf pnorm psub pzero) %known.negative.or.nan) {
; CHECK: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
-; CHECK-LABEL: define nofpclass(pinf zero psub pnorm) half @known_negative_normal_or_inf__fadd__known_negative_or_nan
+; CHECK-LABEL: define nofpclass(pinf zero sub pnorm) half @known_negative_normal_or_inf__fadd__known_negative_or_nan
; CHECK-SAME: (half nofpclass(pinf zero sub pnorm) [[KNOWN_NNORM_OR_NINF_OR_NAN:%.*]], half nofpclass(pinf pzero psub pnorm) [[KNOWN_NEGATIVE_OR_NAN:%.*]]) #[[ATTR3]] {
; CHECK-NEXT: [[ADD:%.*]] = fadd half [[KNOWN_NNORM_OR_NINF_OR_NAN]], [[KNOWN_NEGATIVE_OR_NAN]]
; CHECK-NEXT: ret half [[ADD]]
diff --git a/llvm/test/Transforms/ForcedFunctionAttrs/forced.ll b/llvm/test/Transforms/ForcedFunctionAttrs/forced.ll
index ae68f06654589..3c1497e407a21 100644
--- a/llvm/test/Transforms/ForcedFunctionAttrs/forced.ll
+++ b/llvm/test/Transforms/ForcedFunctionAttrs/forced.ll
@@ -8,6 +8,9 @@
; RUN: opt < %s -S -passes=verify,forceattrs,verify -force-remove-attribute noinline | FileCheck %s --check-prefix=CHECK-REMOVE-ALL
; RUN: opt < %s -S -passes=verify,forceattrs,verify -force-attribute alwaysinline | FileCheck %s --check-prefix=CHECK-ALWAYSINLINE-ALL
; RUN: opt < %s -S -passes=verify,forceattrs,verify -force-attribute noinline | FileCheck %s --check-prefix=CHECK-NOINLINE-ALL
+; RUN: opt < %s -S -passes=verify,forceattrs,verify -force-attribute optnone | FileCheck %s --check-prefix=CHECK-OPTNONE-ALL
+; RUN: opt < %s -S -passes=verify,forceattrs,verify -force-attribute minsize | FileCheck %s --check-prefix=CHECK-MINSIZE-ALL
+; RUN: opt < %s -S -passes=verify,forceattrs,verify -force-attribute optdebug | FileCheck %s --check-prefix=CHECK-OPTDEBUG-ALL
; CHECK-CONTROL: define void @foo() {
; CHECK-FOO: define void @foo() #0 {
@@ -36,43 +39,141 @@ define void @hoo() #1 {
ret void
}
+define void @zoo() #2 {
+ ret void
+}
+
+define void @bar() #3 {
+ ret void
+}
+
+define void @baz() #4 {
+ ret void
+}
+
+define void @qoo() #5 {
+ ret void
+}
+
attributes #0 = { noinline }
attributes #1 = { alwaysinline }
+attributes #2 = { optsize }
+attributes #3 = { minsize }
+attributes #4 = { optdebug }
+attributes #5 = { noinline optnone }
; CHECK-FOO: attributes #0 = { noinline }
; REMOVE-COLD: attributes #0 = { noinline }
; ADD-COLD-REMOVE-NOINLINE: attributes #0 = { cold }
; When passing an attribute without specifying a function, the attribute
-; should be added to all functions in the module.
+; should be added to all compatible functions in the module.
; CHECK-ADD-ALL: define void @foo() #0 {
; CHECK-ADD-ALL: define void @goo() #1 {
; CHECK-ADD-ALL: define void @hoo() #2 {
-; CHECK-ADD-ALL: attributes #0 = { optsize }
-; CHECK-ADD-ALL: attributes #1 = { noinline optsize }
-; CHECK-ADD-ALL: attributes #2 = { alwaysinline optsize }
+; CHECK-ADD-ALL: define void @zoo() #0 {
+; CHECK-ADD-ALL: define void @bar() #3 {
+; CHECK-ADD-ALL: define void @baz() #4 {
+; CHECK-ADD-ALL: define void @qoo() #5 {
+; CHECK-ADD-ALL-DAG: attributes #0 = { optsize }
+; CHECK-ADD-ALL-DAG: attributes #1 = { noinline optsize }
+; CHECK-ADD-ALL-DAG: attributes #2 = { alwaysinline optsize }
+; CHECK-ADD-ALL-DAG: attributes #3 = { minsize optsize }
+; CHECK-ADD-ALL-DAG: attributes #4 = { optdebug }
+; CHECK-ADD-ALL-DAG: attributes #5 = { noinline optnone }
; When passing an attribute to be removed without specifying a function,
; the attribute should be removed from all functions in the module that
-; have it.
+; have it, unless doing so would create invalid IR (e.g. `optnone` requires
+; `noinline`).
; CHECK-REMOVE-ALL: define void @foo() {
; CHECK-REMOVE-ALL: define void @goo() {
; CHECK-REMOVE-ALL: define void @hoo() #0 {
-; CHECK-REMOVE-ALL: attributes #0 = { alwaysinline }
+; CHECK-REMOVE-ALL: define void @zoo() #1 {
+; CHECK-REMOVE-ALL: define void @bar() #2 {
+; CHECK-REMOVE-ALL: define void @baz() #3 {
+; CHECK-REMOVE-ALL: define void @qoo() #4 {
+; CHECK-REMOVE-ALL-DAG: attributes #0 = { alwaysinline }
+; CHECK-REMOVE-ALL-DAG: attributes #1 = { optsize }
+; CHECK-REMOVE-ALL-DAG: attributes #2 = { minsize }
+; CHECK-REMOVE-ALL-DAG: attributes #3 = { optdebug }
+; CHECK-REMOVE-ALL-DAG: attributes #4 = { noinline optnone }
; When forcing alwaysinline on all functions, it should not be added to
-; functions that already have noinline (would produce invalid IR).
+; functions that already have noinline or optnone (would produce invalid IR).
; CHECK-ALWAYSINLINE-ALL: define void @foo() #0 {
; CHECK-ALWAYSINLINE-ALL: define void @goo() #1 {
; CHECK-ALWAYSINLINE-ALL: define void @hoo() #0 {
+; CHECK-ALWAYSINLINE-ALL: define void @zoo() #2 {
+; CHECK-ALWAYSINLINE-ALL: define void @bar() #3 {
+; CHECK-ALWAYSINLINE-ALL: define void @baz() #4 {
+; CHECK-ALWAYSINLINE-ALL: define void @qoo() #5 {
; CHECK-ALWAYSINLINE-ALL-DAG: attributes #0 = { alwaysinline }
; CHECK-ALWAYSINLINE-ALL-DAG: attributes #1 = { noinline }
+; CHECK-ALWAYSINLINE-ALL-DAG: attributes #2 = { alwaysinline optsize }
+; CHECK-ALWAYSINLINE-ALL-DAG: attributes #3 = { alwaysinline minsize }
+; CHECK-ALWAYSINLINE-ALL-DAG: attributes #4 = { alwaysinline optdebug }
+; CHECK-ALWAYSINLINE-ALL-DAG: attributes #5 = { noinline optnone }
; When forcing noinline on all functions, it should not be added to
; functions that already have alwaysinline (would produce invalid IR).
; CHECK-NOINLINE-ALL: define void @foo() #0 {
; CHECK-NOINLINE-ALL: define void @goo() #0 {
; CHECK-NOINLINE-ALL: define void @hoo() #1 {
+; CHECK-NOINLINE-ALL: define void @zoo() #2 {
+; CHECK-NOINLINE-ALL: define void @bar() #3 {
+; CHECK-NOINLINE-ALL: define void @baz() #4 {
+; CHECK-NOINLINE-ALL: define void @qoo() #5 {
; CHECK-NOINLINE-ALL-DAG: attributes #0 = { noinline }
; CHECK-NOINLINE-ALL-DAG: attributes #1 = { alwaysinline }
+; CHECK-NOINLINE-ALL-DAG: attributes #2 = { noinline optsize }
+; CHECK-NOINLINE-ALL-DAG: attributes #3 = { minsize noinline }
+; CHECK-NOINLINE-ALL-DAG: attributes #4 = { noinline optdebug }
+; CHECK-NOINLINE-ALL-DAG: attributes #5 = { noinline optnone }
+
+; When forcing optnone on all functions, it should not be added to functions
+; that already have alwaysinline, optsize, minsize, or optdebug.
+; CHECK-OPTNONE-ALL: define void @foo() #0 {
+; CHECK-OPTNONE-ALL: define void @goo() #0 {
+; CHECK-OPTNONE-ALL: define void @hoo() #1 {
+; CHECK-OPTNONE-ALL: define void @zoo() #2 {
+; CHECK-OPTNONE-ALL: define void @bar() #3 {
+; CHECK-OPTNONE-ALL: define void @baz() #4 {
+; CHECK-OPTNONE-ALL: define void @qoo() #0 {
+; CHECK-OPTNONE-ALL-DAG: attributes #0 = { noinline optnone }
+; CHECK-OPTNONE-ALL-DAG: attributes #1 = { alwaysinline }
+; CHECK-OPTNONE-ALL-DAG: attributes #2 = { optsize }
+; CHECK-OPTNONE-ALL-DAG: attributes #3 = { minsize }
+; CHECK-OPTNONE-ALL-DAG: attributes #4 = { optdebug }
+
+; When forcing minsize on all functions, it should not be added to functions
+; that already have optnone or optdebug.
+; CHECK-MINSIZE-ALL: define void @foo() #0 {
+; CHECK-MINSIZE-ALL: define void @goo() #1 {
+; CHECK-MINSIZE-ALL: define void @hoo() #2 {
+; CHECK-MINSIZE-ALL: define void @zoo() #3 {
+; CHECK-MINSIZE-ALL: define void @bar() #0 {
+; CHECK-MINSIZE-ALL: define void @baz() #4 {
+; CHECK-MINSIZE-ALL: define void @qoo() #5 {
+; CHECK-MINSIZE-ALL-DAG: attributes #0 = { minsize }
+; CHECK-MINSIZE-ALL-DAG: attributes #1 = { minsize noinline }
+; CHECK-MINSIZE-ALL-DAG: attributes #2 = { alwaysinline minsize }
+; CHECK-MINSIZE-ALL-DAG: attributes #3 = { minsize optsize }
+; CHECK-MINSIZE-ALL-DAG: attributes #4 = { optdebug }
+; CHECK-MINSIZE-ALL-DAG: attributes #5 = { noinline optnone }
+; When forcing optdebug on all functions, it should not be added to functions
+; that already have optnone, minsize, or optsize.
+; CHECK-OPTDEBUG-ALL: define void @foo() #0 {
+; CHECK-OPTDEBUG-ALL: define void @goo() #1 {
+; CHECK-OPTDEBUG-ALL: define void @hoo() #2 {
+; CHECK-OPTDEBUG-ALL: define void @zoo() #3 {
+; CHECK-OPTDEBUG-ALL: define void @bar() #4 {
+; CHECK-OPTDEBUG-ALL: define void @baz() #0 {
+; CHECK-OPTDEBUG-ALL: define void @qoo() #5 {
+; CHECK-OPTDEBUG-ALL-DAG: attributes #0 = { optdebug }
+; CHECK-OPTDEBUG-ALL-DAG: attributes #1 = { noinline optdebug }
+; CHECK-OPTDEBUG-ALL-DAG: attributes #2 = { alwaysinline optdebug }
+; CHECK-OPTDEBUG-ALL-DAG: attributes #3 = { optsize }
+; CHECK-OPTDEBUG-ALL-DAG: attributes #4 = { minsize }
+; CHECK-OPTDEBUG-ALL-DAG: attributes #5 = { noinline optnone }
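As an aside on the attribute combinations these checks exercise: the sketch below (plain C++ against LLVM's public Function/Attribute API, not code from the pass under test; canForceAttribute is a hypothetical helper) illustrates the kind of compatibility test that keeps forced or removed attributes from producing IR the verifier rejects.

// Minimal sketch, assuming LLVM's C++ Attribute API; canForceAttribute is
// illustrative only and is not part of this patch.
#include "llvm/IR/Function.h"
using namespace llvm;

static bool canForceAttribute(const Function &F, Attribute::AttrKind Kind) {
  switch (Kind) {
  case Attribute::AlwaysInline:
    // alwaysinline conflicts with noinline, and optnone requires noinline.
    return !F.hasFnAttribute(Attribute::NoInline) &&
           !F.hasFnAttribute(Attribute::OptimizeNone);
  case Attribute::NoInline:
    return !F.hasFnAttribute(Attribute::AlwaysInline);
  case Attribute::OptimizeNone:
    // optnone must be paired with noinline and, per the checks above, is not
    // combined with alwaysinline/optsize/minsize (optdebug omitted here).
    return !F.hasFnAttribute(Attribute::AlwaysInline) &&
           !F.hasFnAttribute(Attribute::OptimizeForSize) &&
           !F.hasFnAttribute(Attribute::MinSize);
  default:
    return true;
  }
}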
diff --git a/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll b/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll
index c99d979be472b..71bddbfb9742f 100644
--- a/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll
+++ b/llvm/test/Transforms/Inline/ret_attr_align_and_noundef.ll
@@ -421,6 +421,43 @@ define i8 @caller16_not_intersecting_ranges() {
ret i8 %r
}
+define i8 @callee_range_multi_ret(i1 %c) {
+; CHECK-LABEL: define i8 @callee_range_multi_ret(i1 %c) {
+; CHECK-NEXT: br i1 %c, label %bb1, label %bb2
+; CHECK: bb1:
+; CHECK-NEXT: [[R1:%.*]] = call range(i8 0, 20) i8 @val8()
+; CHECK-NEXT: ret i8 [[R1]]
+; CHECK: bb2:
+; CHECK-NEXT: [[R2:%.*]] = call range(i8 10, 30) i8 @val8()
+; CHECK-NEXT: ret i8 [[R2]]
+;
+ br i1 %c, label %bb1, label %bb2
+
+bb1:
+ %r1 = call range(i8 0, 20) i8 @val8()
+ ret i8 %r1
+
+bb2:
+ %r2 = call range(i8 10, 30) i8 @val8()
+ ret i8 %r2
+}
+
+define i8 @caller_range_multi_ret_okay_intersect(i1 %c) {
+; CHECK-LABEL: define i8 @caller_range_multi_ret_okay_intersect(i1 %c) {
+; CHECK-NEXT: br i1 %c, label %bb1.i, label %bb2.i
+; CHECK: bb1.i:
+; CHECK-NEXT: [[R1_I:%.*]] = call range(i8 5, 20) i8 @val8()
+; CHECK-NEXT: br label %callee_range_multi_ret.exit
+; CHECK: bb2.i:
+; CHECK-NEXT: [[R2_I:%.*]] = call range(i8 10, 25) i8 @val8()
+; CHECK-NEXT: br label %callee_range_multi_ret.exit
+; CHECK: callee_range_multi_ret.exit:
+; CHECK-NEXT: [[R_I:%.*]] = phi i8 [ [[R1_I]], %bb1.i ], [ [[R2_I]], %bb2.i ]
+; CHECK-NEXT: ret i8 [[R_I]]
+;
+ %r = call range(i8 5, 25) i8 @callee_range_multi_ret(i1 %c)
+ ret i8 %r
+}
define ptr @caller_bad_ret_prop(ptr %p1, ptr %p2, i64 %x, ptr %other) {
; CHECK-LABEL: define ptr @caller_bad_ret_prop
diff --git a/llvm/test/Transforms/Inline/ret_attr_nofpclass.ll b/llvm/test/Transforms/Inline/ret_attr_nofpclass.ll
index 16ad718d60349..5af3aaaa72c3d 100644
--- a/llvm/test/Transforms/Inline/ret_attr_nofpclass.ll
+++ b/llvm/test/Transforms/Inline/ret_attr_nofpclass.ll
@@ -55,3 +55,42 @@ define float @caller_not_intersecting_nofpclass() {
ret float %r
}
+define float @callee_nofpclass_inf_nan_multi_ret(i1 %c) {
+; CHECK-LABEL: define float @callee_nofpclass_inf_nan_multi_ret(i1 %c) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 %c, label %bb1, label %bb2
+; CHECK: bb1:
+; CHECK-NEXT: [[R1:%.*]] = call nofpclass(nan) float @use.val(float 1.000000e+00)
+; CHECK-NEXT: ret float [[R1]]
+; CHECK: bb2:
+; CHECK-NEXT: [[R2:%.*]] = call nofpclass(inf) float @use.val(float 2.000000e+00)
+; CHECK-NEXT: ret float [[R2]]
+;
+entry:
+ br i1 %c, label %bb1, label %bb2
+
+bb1:
+ %r1 = call nofpclass(nan) float @use.val(float 1.0)
+ ret float %r1
+
+bb2:
+ %r2 = call nofpclass(inf) float @use.val(float 2.0)
+ ret float %r2
+}
+
+define float @caller_okay_intersect_nofpclass_multi_ret(i1 %c) {
+; CHECK-LABEL: define float @caller_okay_intersect_nofpclass_multi_ret(
+; CHECK-NEXT: br i1 %c, label %bb1.i, label %bb2.i
+; CHECK: bb1.i:
+; CHECK-NEXT: [[R1_I:%.*]] = call nofpclass(nan zero) float @use.val(float 1.000000e+00)
+; CHECK-NEXT: br label %callee_nofpclass_inf_nan_multi_ret.exit
+; CHECK: bb2.i:
+; CHECK-NEXT: [[R2_I:%.*]] = call nofpclass(inf zero) float @use.val(float 2.000000e+00)
+; CHECK-NEXT: br label %callee_nofpclass_inf_nan_multi_ret.exit
+; CHECK: callee_nofpclass_inf_nan_multi_ret.exit:
+; CHECK-NEXT: [[R:%.*]] = phi float [ [[R1_I]], %bb1.i ], [ [[R2_I]], %bb2.i ]
+; CHECK-NEXT: ret float [[R]]
+;
+ %r = call nofpclass(zero) float @callee_nofpclass_inf_nan_multi_ret(i1 %c)
+ ret float %r
+}
diff --git a/llvm/test/Transforms/InstCombine/fadd.ll b/llvm/test/Transforms/InstCombine/fadd.ll
index 094137b7f3ddf..d412e7fa75255 100644
--- a/llvm/test/Transforms/InstCombine/fadd.ll
+++ b/llvm/test/Transforms/InstCombine/fadd.ll
@@ -1146,3 +1146,74 @@ define float @fadd_reduce_sqr_sum_varB2_invalid3(float %a, float %b) {
}
declare void @fake_func(float)
+
+; Mask 240 = fcPosSubnormal (128) | fcPosZero (64) | fcNegZero (32) | fcNegSubnormal (16)
+define i1 @fadd_pos_normal_not_zero_or_subnormal(float %a) {
+; CHECK-LABEL: @fadd_pos_normal_not_zero_or_subnormal(
+; CHECK-NEXT: ret i1 false
+;
+ %abs = call float @llvm.fabs.f32(float %a)
+ %add = fadd float %abs, 1.0
+ %class = call i1 @llvm.is.fpclass.f32(float %add, i32 240)
+ ret i1 %class
+}
+
+define <2 x i1> @fadd_vector_pos_normal_not_zero_or_subnormal(<2 x float> %a) {
+; CHECK-LABEL: @fadd_vector_pos_normal_not_zero_or_subnormal(
+; CHECK-NEXT: ret <2 x i1> zeroinitializer
+;
+ %abs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+ %add = fadd <2 x float> %abs, splat (float 1.0)
+ %class = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> %add, i32 240)
+ ret <2 x i1> %class
+}
+
+define i1 @fadd_neg_normal_not_zero_or_subnormal(float %a) {
+; CHECK-LABEL: @fadd_neg_normal_not_zero_or_subnormal(
+; CHECK-NEXT: ret i1 false
+;
+ %abs = call float @llvm.fabs.f32(float %a)
+ %neg_abs = fneg float %abs
+ %add = fadd float %neg_abs, -1.0
+ %class = call i1 @llvm.is.fpclass.f32(float %add, i32 240)
+ ret i1 %class
+}
+
+define <2 x i1> @fadd_vector_neg_normal_not_zero_or_subnormal(<2 x float> %a) {
+; CHECK-LABEL: @fadd_vector_neg_normal_not_zero_or_subnormal(
+; CHECK-NEXT: ret <2 x i1> zeroinitializer
+;
+ %abs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
+ %neg_abs = fneg <2 x float> %abs
+ %add = fadd <2 x float> %neg_abs, splat (float -1.0)
+ %class = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> %add, i32 240)
+ ret <2 x i1> %class
+}
+
+; Negative case: LHS is NOT known non-negative.
+; If %a is -1.0, the result is +0.0.
+define i1 @fadd_unknown_lhs_normal_rhs(float %a) {
+; CHECK-LABEL: @fadd_unknown_lhs_normal_rhs(
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[A:%.*]], 1.000000e+00
+; CHECK-NEXT: [[CLASS:%.*]] = call i1 @llvm.is.fpclass.f32(float [[ADD]], i32 208)
+; CHECK-NEXT: ret i1 [[CLASS]]
+;
+ %add = fadd float %a, 1.0
+ %class = call i1 @llvm.is.fpclass.f32(float %add, i32 208)
+ ret i1 %class
+}
+
+; Negative case: RHS is a negative constant.
+; If %a is 1.0, the result is +0.0.
+define i1 @fadd_pos_lhs_negative_rhs(float %a) {
+; CHECK-LABEL: @fadd_pos_lhs_negative_rhs(
+; CHECK-NEXT: [[ABS:%.*]] = call float @llvm.fabs.f32(float [[A:%.*]])
+; CHECK-NEXT: [[ADD:%.*]] = fadd float [[ABS]], -1.000000e+00
+; CHECK-NEXT: [[CLASS:%.*]] = call i1 @llvm.is.fpclass.f32(float [[ADD]], i32 208)
+; CHECK-NEXT: ret i1 [[CLASS]]
+;
+ %abs = call float @llvm.fabs.f32(float %a)
+ %add = fadd float %abs, -1.0
+ %class = call i1 @llvm.is.fpclass.f32(float %add, i32 208)
+ ret i1 %class
+}
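For reference on the is.fpclass masks used in the tests above, here is a minimal standalone C++ sketch of the mask arithmetic; the bit values follow LLVM's FPClassTest encoding but are written as plain integer literals, so nothing beyond a hosted compiler is assumed.

// Minimal sketch: composing the is.fpclass masks used above.
#include <cassert>

int main() {
  const unsigned fcNegSubnormal = 0x10, fcNegZero = 0x20;
  const unsigned fcPosZero = 0x40, fcPosSubnormal = 0x80;
  // 240: any zero or subnormal of either sign, as used by the positive tests.
  assert((fcNegSubnormal | fcNegZero | fcPosZero | fcPosSubnormal) == 240);
  // 208: positive zero/subnormal plus negative subnormal (the negative tests).
  const unsigned negativeCaseMask = fcNegSubnormal | fcPosZero | fcPosSubnormal;
  assert(negativeCaseMask == 208);
  return 0;
}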
diff --git a/llvm/test/Transforms/InstCombine/frem-inf.ll b/llvm/test/Transforms/InstCombine/frem-inf.ll
new file mode 100644
index 0000000000000..a2ef02cd8aeab
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/frem-inf.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+define i1 @test_frem_is_fpclass_inf(double %a, double %b) {
+; CHECK-LABEL: define i1 @test_frem_is_fpclass_inf(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret i1 false
+;
+entry:
+ %rem = frem double %a, %b
+ ; 516 checks for +inf (512) or -inf (4)
+ %is.inf = call i1 @llvm.is.fpclass.f64(double %rem, i32 516)
+ ret i1 %is.inf
+}
+
+define i1 @test_frem_fcmp_inf(double %a, double %b) {
+; CHECK-LABEL: define i1 @test_frem_fcmp_inf(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: ret i1 false
+;
+entry:
+ %rem = frem double %a, %b
+ %is.inf = fcmp oeq double %rem, 0x7FF0000000000000
+ ret i1 %is.inf
+}
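Similarly, for the constants in this new file: 516 is fcNegInf (0x4) | fcPosInf (0x200) in the same encoding, and 0x7FF0000000000000 is the IEEE-754 bit pattern of +infinity for double. A small standalone check (C++20, for std::bit_cast):

// Minimal sketch verifying the two constants used above.
#include <bit>
#include <cstdint>
#include <limits>

static_assert((0x4u | 0x200u) == 516u, "fcNegInf | fcPosInf");
static_assert(std::bit_cast<std::uint64_t>(
                  std::numeric_limits<double>::infinity()) ==
                  0x7FF0000000000000ULL,
              "+inf bit pattern for double");
int main() { return 0; }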
diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll
index 439b59946fac1..fadcbfeddac4b 100644
--- a/llvm/test/Transforms/InstCombine/sub.ll
+++ b/llvm/test/Transforms/InstCombine/sub.ll
@@ -2863,3 +2863,55 @@ entry:
%and = and i32 %sub, 127
ret i32 %and
}
+
+define i32 @sub_const_or_disjoint(i32 %x) {
+; CHECK-LABEL: @sub_const_or_disjoint(
+; CHECK-NEXT: [[R:%.*]] = sub i32 90, [[X:%.*]]
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %a = or disjoint i32 %x, 10
+ %r = sub i32 100, %a
+ ret i32 %r
+}
+
+define i32 @sub_nsw_const_or_disjoint(i32 %x) {
+; CHECK-LABEL: @sub_nsw_const_or_disjoint(
+; CHECK-NEXT: [[R:%.*]] = sub nsw i32 90, [[X:%.*]]
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %a = or disjoint i32 %x, 10
+ %r = sub nsw i32 100, %a
+ ret i32 %r
+}
+
+define i32 @sub_nuw_const_or_disjoint(i32 %x) {
+; CHECK-LABEL: @sub_nuw_const_or_disjoint(
+; CHECK-NEXT: [[R:%.*]] = sub nuw i32 100, [[X:%.*]]
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %a = or disjoint i32 %x, 0
+ %r = sub nuw i32 100, %a
+ ret i32 %r
+}
+
+define <2 x i32> @sub_const_or_disjoint_vec(<2 x i32> %x) {
+; CHECK-LABEL: @sub_const_or_disjoint_vec(
+; CHECK-NEXT: [[R:%.*]] = sub <2 x i32> splat (i32 90), [[X:%.*]]
+; CHECK-NEXT: ret <2 x i32> [[R]]
+;
+ %a = or disjoint <2 x i32> %x, splat (i32 10)
+ %r = sub <2 x i32> splat (i32 100), %a
+ ret <2 x i32> %r
+}
+
+; negative test: without 'disjoint', the 'or' is not equivalent to an add, so the fold must not fire.
+define i32 @sub_const_or_no_disjoint(i32 %x) {
+; CHECK-LABEL: @sub_const_or_no_disjoint(
+; CHECK-NEXT: [[A:%.*]] = or i32 [[X:%.*]], 10
+; CHECK-NEXT: [[R:%.*]] = sub i32 100, [[A]]
+; CHECK-NEXT: ret i32 [[R]]
+;
+ %a = or i32 %x, 10
+ %r = sub i32 100, %a
+ ret i32 %r
+}
diff --git a/llvm/test/Transforms/JumpThreading/thread-prob-8.ll b/llvm/test/Transforms/JumpThreading/thread-prob-8.ll
index b63c789515966..d23121ff8a928 100644
--- a/llvm/test/Transforms/JumpThreading/thread-prob-8.ll
+++ b/llvm/test/Transforms/JumpThreading/thread-prob-8.ll
@@ -8,7 +8,6 @@
; prob[L0->2] + prob[L0->3]
; CHECK: Computing probabilities for entry
-; CHECK: eraseBlock L0
; CHECK-NOT: set edge L0 -> 0 successor probability to 0x12492492 / 0x80000000 = 14.29%
; CHECK-NOT: set edge L0 -> 1 successor probability to 0x24924925 / 0x80000000 = 28.57%
; CHECK-NOT: set edge L0 -> 2 successor probability to 0x24924925 / 0x80000000 = 28.57%
diff --git a/llvm/test/Transforms/LoopSimplifyCFG/mssa_term.ll b/llvm/test/Transforms/LoopSimplifyCFG/mssa_term.ll
new file mode 100644
index 0000000000000..a2936727ea6ad
--- /dev/null
+++ b/llvm/test/Transforms/LoopSimplifyCFG/mssa_term.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -S -passes="loop-mssa(loop-simplifycfg,simple-loop-unswitch)" < %s | FileCheck %s
+
+; Check that IR is valid when MemorySSA is updated during MergeBlockIntoPredecessor.
+
+define i32 @f1(i1 %cond) personality ptr null {
+; CHECK-LABEL: define i32 @f1(
+; CHECK-SAME: i1 [[COND:%.*]]) personality ptr null {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 [[COND]], label %[[ENTRY_SPLIT:.*]], label %[[COMMON_RET_LOOPEXIT:.*]]
+; CHECK: [[ENTRY_SPLIT]]:
+; CHECK-NEXT: br label %[[FOR_COND:.*]]
+; CHECK: [[FOR_COND]]:
+; CHECK-NEXT: [[CALL26:%.*]] = invoke i32 @f2(ptr null, ptr null, ptr null)
+; CHECK-NEXT: to label %[[FOR_COND]] unwind label %[[LPAD24:.*]]
+; CHECK: [[COMMON_RET_LOOPEXIT]]:
+; CHECK-NEXT: br label %[[COMMON_RET:.*]]
+; CHECK: [[COMMON_RET]]:
+; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ 0, %[[LPAD24]] ], [ 0, %[[COMMON_RET_LOOPEXIT]] ]
+; CHECK-NEXT: ret i32 [[COMMON_RET_OP]]
+; CHECK: [[LPAD24]]:
+; CHECK-NEXT: [[LPAD:%.*]] = landingpad { ptr, i32 }
+; CHECK-NEXT: cleanup
+; CHECK-NEXT: br label %[[COMMON_RET]]
+;
+entry:
+ br label %for.cond
+
+for.cond: ; preds = %if.end19, %entry
+ br i1 %cond, label %if.end19, label %common.ret
+
+common.ret: ; preds = %lpad24, %for.cond
+ %common.ret.op = phi i32 [ 0, %lpad24 ], [ 0, %for.cond ]
+ ret i32 %common.ret.op
+
+if.end19: ; preds = %for.cond
+ %call26 = invoke i32 @f2(ptr null, ptr null, ptr null)
+ to label %for.cond unwind label %lpad24
+
+lpad24: ; preds = %if.end19
+ %lpad = landingpad { ptr, i32 }
+ cleanup
+ br label %common.ret
+}
+
+declare i32 @f2(ptr, ptr, ptr)
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment-fold-tail.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment-fold-tail.ll
index 566a3941aedcb..51760db75c033 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment-fold-tail.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment-fold-tail.ll
@@ -85,17 +85,16 @@ define i32 @non_speculatable_find_last_reduction(ptr noalias %a, ptr noalias %b,
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT2]], %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = phi <vscale x 4 x i1> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP6]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP8:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i1> zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP9]], <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP10:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = freeze <vscale x 4 x i1> [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = freeze <vscale x 4 x i1> [[TMP8]]
; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[TMP11]])
-; CHECK-NEXT: [[TMP13]] = select i1 [[TMP12]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[TMP5]]
+; CHECK-NEXT: [[TMP13]] = select i1 [[TMP12]], <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i1> [[TMP2]]
; CHECK-NEXT: [[TMP14]] = select i1 [[TMP12]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD3]], <vscale x 4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]])
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
index 4eced1640bd14..ae1f7829449da 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll
@@ -46,7 +46,6 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[IND_END:%.*]] = trunc i32 [[N_VEC]] to i8
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
@@ -84,7 +83,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC5]]
; CHECK-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i8 [ [[TMP16]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i8 [ [[TMP16]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX18:%.*]] = phi i8 [ [[RDX_SELECT15]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll
index 44636222a8648..efa64c661ebc0 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-factors.ll
@@ -446,6 +446,8 @@ define void @trip_count_based_on_ptrtoint(i64 %x) "target-cpu"="apple-m1" {
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[N_VEC]], 4
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[TMP12]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -465,8 +467,6 @@ define void @trip_count_based_on_ptrtoint(i64 %x) "target-cpu"="apple-m1" {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[N_VEC]], 4
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[TMP12]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF11]]
; CHECK: vec.epilog.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
index eea496303a206..680dfb83271cf 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll
@@ -39,7 +39,6 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
; CHECK: middle.block:
; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[IND_END4:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 10000
; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -65,7 +64,7 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) {
; CHECK-NEXT: br i1 false, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR_START_1]], [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi ptr [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP0]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PTR_START_1]], [[ITER_CHECK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -212,7 +211,6 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[IND_END5:%.*]] = add i64 [[START]], [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]]
; CHECK: vec.epilog.ph:
@@ -239,7 +237,7 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) {
; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N15]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], [[ITER_CHECK:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
@@ -457,3 +455,103 @@ loop:
exit:
ret void
}
+
+; Test that epilogue vectorization correctly handles loops where identical
+; induction variables are CSE'd into a single resume phi, avoiding a count
+; mismatch between ResumeValues and the epilogue's scalar preheader phis.
+define i32 @cse_multiple_ivs_with_scalar_resume(ptr %src, i64 %N) {
+; CHECK-LABEL: @cse_multiple_ivs_with_scalar_resume(
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[SRC:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[TMP1]], i64 2
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, ptr [[TMP1]], align 2
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x i16>, ptr [[TMP2]], align 2
+; CHECK-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[WIDE_LOAD]] to <2 x i32>
+; CHECK-NEXT: [[TMP4:%.*]] = sext <2 x i16> [[WIDE_LOAD3]] to <2 x i32>
+; CHECK-NEXT: [[TMP5]] = or <2 x i32> [[TMP3]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP6]] = or <2 x i32> [[TMP4]], [[VEC_PHI2]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK: middle.block:
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i32> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[BIN_RDX]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK: vec.epilog.iter.check:
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]]
+; CHECK: vec.epilog.ph:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i64 [[TMP0]], 2
+; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF4]]
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK: vec.epilog.vector.body:
+; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <2 x i32> [ [[TMP9]], [[VEC_EPILOG_PH]] ], [ [[TMP12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[INDEX6]]
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x i16>, ptr [[TMP10]], align 2
+; CHECK-NEXT: [[TMP11:%.*]] = sext <2 x i16> [[WIDE_LOAD8]] to <2 x i32>
+; CHECK-NEXT: [[TMP12]] = or <2 x i32> [[TMP11]], [[VEC_PHI7]]
+; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 2
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC5]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}}
+; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[TMP12]])
+; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]]
+; CHECK-NEXT: br i1 [[CMP_N10]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK: vec.epilog.scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i32 [ [[TMP14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP8]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV1_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IV2:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV2_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX11]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[RDX_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i16, ptr [[SRC]], i64 [[IV1]]
+; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[GEP]], align 2
+; CHECK-NEXT: [[EXT:%.*]] = sext i16 [[VAL]] to i32
+; CHECK-NEXT: [[RDX_NEXT]] = or i32 [[EXT]], [[RDX]]
+; CHECK-NEXT: [[IV1_NEXT]] = add i64 [[IV1]], 1
+; CHECK-NEXT: [[IV2_NEXT]] = add i64 [[IV2]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i64 [[IV1]], [[N]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], {{!llvm.loop ![0-9]+}}
+; CHECK: exit:
+; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[LOOP]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ [[TMP14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i32 [[RDX_NEXT_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv1 = phi i64 [ 0, %entry ], [ %iv1.next, %loop ]
+ %iv2 = phi i64 [ 0, %entry ], [ %iv2.next, %loop ]
+ %rdx = phi i32 [ 0, %entry ], [ %rdx.next, %loop ]
+ %gep = getelementptr i16, ptr %src, i64 %iv1
+ %val = load i16, ptr %gep, align 2
+ %ext = sext i16 %val to i32
+ %rdx.next = or i32 %ext, %rdx
+ %iv1.next = add i64 %iv1, 1
+ %iv2.next = add i64 %iv2, 1
+ %cond = icmp eq i64 %iv1, %N
+ br i1 %cond, label %exit, label %loop
+
+exit:
+ ret i32 %rdx.next
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll
index feb0175e75542..dfa64041cc05b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/f128-fmuladd-reduction.ll
@@ -13,6 +13,10 @@ define double @fp128_fmuladd_reduction(ptr %start0, ptr %start1, ptr %end0, ptr
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 16
+; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 8
+; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP7]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -54,10 +58,6 @@ define double @fp128_fmuladd_reduction(ptr %start0, ptr %start1, ptr %end0, ptr
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 16
-; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[START0]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 8
-; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[START1]], i64 [[TMP7]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
index 0a62ac9804524..38047541db5d9 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll
@@ -59,6 +59,8 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) {
; COST1: [[VECTOR_PH]]:
; COST1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[START]], 32
; COST1-NEXT: [[N_VEC:%.*]] = sub i64 [[START]], [[N_MOD_VF]]
+; COST1-NEXT: [[IND_END:%.*]] = sub i64 [[START]], [[N_VEC]]
+; COST1-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC]]
; COST1-NEXT: br label %[[VECTOR_BODY:.*]]
; COST1: [[VECTOR_BODY]]:
; COST1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -73,8 +75,6 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) {
; COST1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[START]], [[N_VEC]]
; COST1-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; COST1: [[VEC_EPILOG_ITER_CHECK]]:
-; COST1-NEXT: [[IND_END:%.*]] = sub i64 [[START]], [[N_VEC]]
-; COST1-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC]]
; COST1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; COST1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF4:![0-9]+]]
; COST1: [[VEC_EPILOG_PH]]:
@@ -112,6 +112,8 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) {
; COST10: [[VECTOR_PH]]:
; COST10-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[START]], 16
; COST10-NEXT: [[N_VEC:%.*]] = sub i64 [[START]], [[N_MOD_VF]]
+; COST10-NEXT: [[IND_END:%.*]] = sub i64 [[START]], [[N_VEC]]
+; COST10-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC]]
; COST10-NEXT: br label %[[VECTOR_BODY:.*]]
; COST10: [[VECTOR_BODY]]:
; COST10-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -124,8 +126,6 @@ define void @test_iv_cost(ptr %ptr.start, i8 %a, i64 %b) {
; COST10-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[START]], [[N_VEC]]
; COST10-NEXT: br i1 [[CMP_N]], [[EXIT_LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; COST10: [[VEC_EPILOG_ITER_CHECK]]:
-; COST10-NEXT: [[IND_END:%.*]] = sub i64 [[START]], [[N_VEC]]
-; COST10-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[PTR_START]], i64 [[N_VEC]]
; COST10-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; COST10-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF4:![0-9]+]]
; COST10: [[VEC_EPILOG_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
index aba18314acf41..2e640e2d22658 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll
@@ -582,6 +582,8 @@ define void @sext_sub_nsw_for_address(ptr %base, i64 %n, ptr %src) #0 {
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 8
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
+; CHECK-NEXT: [[IND_END32:%.*]] = sub i64 [[N]], [[N_VEC]]
; CHECK-NEXT: [[DOTCAST:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-NEXT: [[TMP26:%.*]] = mul i32 [[DOTCAST]], 2
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
@@ -663,10 +665,6 @@ define void @sext_sub_nsw_for_address(ptr %base, i64 %n, ptr %src) #0 {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-NEXT: [[IND_END32:%.*]] = sub i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: [[DOTCAST33:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[IND_END34:%.*]] = mul i32 [[DOTCAST33]], 2
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF23:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
@@ -711,7 +709,7 @@ define void at sext_sub_nsw_for_address(ptr %base, i64 %n, ptr %src) #0 {
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL29:%.*]] = phi i64 [ [[TMP84]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_RESUME_VAL30:%.*]] = phi i64 [ [[TMP85]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END32]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[N]], %[[VECTOR_MEMCHECK]] ], [ [[N]], %[[VECTOR_SCEVCHECK]] ], [ [[N]], %[[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL31:%.*]] = phi i32 [ [[TMP86]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END34]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL31:%.*]] = phi i32 [ [[TMP86]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP26]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL29]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -771,6 +769,8 @@ define i64 @live_out_extract_from_ptr_iv_increment(i64 %count, ptr %start, ptr n
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP87:%.*]] = mul i64 [[N_VEC]], 3
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP87]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -882,8 +882,6 @@ define i64 @live_out_extract_from_ptr_iv_increment(i64 %count, ptr %start, ptr n
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[TMP87:%.*]] = mul i64 [[N_VEC]], 3
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP87]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF8]]
; CHECK: [[VEC_EPILOG_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-runtime-checks.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-runtime-checks.ll
index 67826e5cb9f96..5e6834c093d8b 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-runtime-checks.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-runtime-checks.ll
@@ -30,6 +30,8 @@ define void @interleave_groups_separated_by_offset(ptr %A, i64 %offset) {
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[A]], i64 1984
+; CHECK-NEXT: [[IND_END17:%.*]] = getelementptr i8, ptr [[A_OFFSET]], i64 1984
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -45,9 +47,7 @@ define void @interleave_groups_separated_by_offset(ptr %A, i64 %offset) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[A]], i64 1984
-; CHECK-NEXT: [[IND_END17:%.*]] = getelementptr i8, ptr [[A_OFFSET]], i64 1984
-; CHECK-NEXT: br i1 true, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
+; CHECK-NEXT: br i1 true, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 992, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 1984
@@ -60,7 +60,7 @@ define void @interleave_groups_separated_by_offset(ptr %A, i64 %offset) {
; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[NEXT_GEP13]], align 1
; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX12]], 8
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT14]], 992
-; CHECK-NEXT: br i1 [[TMP6]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
index 3eb42845bec4a..e957c1decc838 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll
@@ -27,15 +27,21 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST1]]
+; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 2
+; CHECK-NEXT: [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 2
+; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[TMP7]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[OFFSET_IDX2]]
+; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[OFFSET_IDX2]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 8
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
@@ -51,12 +57,6 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[IND_END8:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST1]]
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-NEXT: [[IND_END10:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP7]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: vec.epilog.ph:
@@ -153,13 +153,17 @@ define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 32
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[DOTCAST6:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[IND_END7:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST6]]
+; CHECK-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[N_VEC]]
+; CHECK-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[N_VEC]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 2
; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP1]], align 2
@@ -175,10 +179,6 @@ define void @umin(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[DOTCAST6:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[IND_END7:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST6]]
-; CHECK-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
-; CHECK-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]]
; CHECK: vec.epilog.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
index c340cfc9ad6cc..99db3ee324c27 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll
@@ -70,6 +70,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
; CHECK-VS1-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 4
; CHECK-VS1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP17]]
; CHECK-VS1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-VS1-NEXT: [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
; CHECK-VS1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[CONV]], i64 0
; CHECK-VS1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
; CHECK-VS1-NEXT: br label %[[VECTOR_BODY:.*]]
@@ -87,7 +88,6 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
; CHECK-VS1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; CHECK-VS1-NEXT: br i1 [[CMP_N]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK-VS1: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-VS1-NEXT: [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
; CHECK-VS1-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-VS1-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK-VS1: [[VEC_EPILOG_PH]]:
@@ -163,6 +163,7 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
; CHECK-VS2-NEXT: [[TMP17:%.*]] = shl nuw i64 [[TMP16]], 3
; CHECK-VS2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], [[TMP17]]
; CHECK-VS2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-VS2-NEXT: [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
; CHECK-VS2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i8> poison, i8 [[CONV]], i64 0
; CHECK-VS2-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
; CHECK-VS2-NEXT: br label %[[VECTOR_BODY:.*]]
@@ -180,7 +181,6 @@ define void @low_vf_ic_is_better(ptr nocapture noundef %p, i32 %tc, i16 noundef
; CHECK-VS2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; CHECK-VS2-NEXT: br i1 [[CMP_N]], label %[[WHILE_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK-VS2: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-VS2-NEXT: [[IND_END4:%.*]] = add i64 [[TMP0]], [[N_VEC]]
; CHECK-VS2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-VS2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK-VS2: [[VEC_EPILOG_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/neon-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/neon-inloop-reductions.ll
index 22696d0b297d9..0066bb76ea834 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/neon-inloop-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/neon-inloop-reductions.ll
@@ -11,6 +11,7 @@ define i32 @mul_used_outside_vpexpression(ptr %src.0, ptr %src.1) {
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[SRC_0]], i64 96
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[SRC_1]], i64 1
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
@@ -35,7 +36,6 @@ define i32 @mul_used_outside_vpexpression(ptr %src.0, ptr %src.1) {
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 false, label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[SRC_0]], i64 96
; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 96, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
index 5c2c67337625a..b1b00bdf84d83 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll
@@ -70,6 +70,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
; CHECK-NEXT: [[IV_NEXT:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[A]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -91,7 +92,6 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IDX_NEG]], [[IV_NEXT]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF5:![0-9]+]]
; CHECK: vec.epilog.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-epilogue-vec.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-epilogue-vec.ll
index a876b4553bcf7..766ce9ffce2e1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-epilogue-vec.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub-epilogue-vec.ll
@@ -143,7 +143,7 @@ define i32 @sub_reduction(i32 %startval, ptr %src1, ptr %src2) #0 {
; CHECK-PARTIAL-RED-EPI-NEXT: br i1 false, label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK-PARTIAL-RED-EPI: [[VEC_EPILOG_SCALAR_PH]]:
; CHECK-PARTIAL-RED-EPI-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-PARTIAL-RED-EPI-NEXT: [[BC_MERGE_RDX8:%.*]] = phi i32 [ [[TMP18]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[STARTVAL]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[STARTVAL]], %[[ITER_CHECK]] ]
+; CHECK-PARTIAL-RED-EPI-NEXT: [[BC_MERGE_RDX8:%.*]] = phi i32 [ [[TMP18]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP10]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[STARTVAL]], %[[ITER_CHECK]] ]
; CHECK-PARTIAL-RED-EPI-NEXT: br label %[[LOOP:.*]]
; CHECK-PARTIAL-RED-EPI: [[LOOP]]:
; CHECK-PARTIAL-RED-EPI-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
index 83e4ca74e56fe..afd0fe4c9b39a 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll
@@ -349,10 +349,11 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP4]], 1
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 10000, [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 10000, [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[N_VEC]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP4]]
; CHECK-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP6]], align 1
@@ -363,7 +364,6 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF7]]
; CHECK: vec.epilog.ph:
@@ -399,10 +399,11 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
; CHECK-VF8-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP4]], 1
; CHECK-VF8-NEXT: [[N_MOD_VF:%.*]] = urem i64 10000, [[TMP3]]
; CHECK-VF8-NEXT: [[N_VEC:%.*]] = sub i64 10000, [[N_MOD_VF]]
+; CHECK-VF8-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[N_VEC]]
; CHECK-VF8-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-VF8: vector.body:
; CHECK-VF8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF8-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START:%.*]], i64 [[INDEX]]
+; CHECK-VF8-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
; CHECK-VF8-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP4]]
; CHECK-VF8-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[NEXT_GEP]], align 1
; CHECK-VF8-NEXT: store <vscale x 16 x i8> zeroinitializer, ptr [[TMP6]], align 1
@@ -413,7 +414,6 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 {
; CHECK-VF8-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, [[N_VEC]]
; CHECK-VF8-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK-VF8: vec.epilog.iter.check:
-; CHECK-VF8-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
; CHECK-VF8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-VF8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]]
; CHECK-VF8: vec.epilog.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
index 2896618cd853a..061132e2abdbb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-cost.ll
@@ -387,10 +387,9 @@ define void @single_fmul_used_by_each_member(ptr noalias %A, ptr noalias %B, ptr
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr double, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: [[L:%.*]] = load double, ptr [[GEP_A]], align 8
; CHECK-NEXT: [[DIV:%.*]] = fmul double [[L]], 5.000000e+00
@@ -543,6 +542,10 @@ define void @test_interleave_group_epilogue_with_preheader_phi(ptr %src, ptr %ds
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[N_VEC]], 16
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST_PRE]], i64 [[TMP21]]
+; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[N_VEC]], 16
+; CHECK-NEXT: [[IND_END16:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP22]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -565,10 +568,6 @@ define void @test_interleave_group_epilogue_with_preheader_phi(ptr %src, ptr %ds
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[N_VEC]], 16
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST_PRE]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[N_VEC]], 16
-; CHECK-NEXT: [[IND_END16:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP22]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF7]]
; CHECK: [[VEC_EPILOG_PH]]:
@@ -585,11 +584,10 @@ define void @test_interleave_group_epilogue_with_preheader_phi(ptr %src, ptr %ds
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[SRC]], %[[VECTOR_SCEVCHECK]] ], [ [[SRC]], %[[ITER_CHECK]] ], [ [[IND_END16]], %[[VEC_EPILOG_ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[DST_PHI:%.*]] = phi ptr [ [[DST_NEXT:%.*]], %[[LOOP]] ], [ [[DST_PRE]], %[[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT: [[SRC_PHI:%.*]] = phi ptr [ [[SRC_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[SRC_PHI:%.*]] = phi ptr [ [[SRC_NEXT:%.*]], %[[LOOP]] ], [ [[SRC]], %[[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: store double 1.000000e+00, ptr [[DST_PHI]], align 8
; CHECK-NEXT: [[DST_IM:%.*]] = getelementptr i8, ptr [[DST_PHI]], i64 8
; CHECK-NEXT: store double 1.000000e+00, ptr [[DST_IM]], align 8
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
index fff318850bef6..3becb967ba109 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
@@ -254,6 +254,12 @@ define void @test_masked_interleave_group(i32 %N, ptr %mask, ptr %src, ptr %dst)
; CHECK-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP21]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 16
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -281,12 +287,6 @@ define void @test_masked_interleave_group(i32 %N, ptr %mask, ptr %src, ptr %dst)
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 16
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[UMAX]]
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF17:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
@@ -469,37 +469,27 @@ define void @shared_wide_load_with_cast_not_narrowed(ptr noalias %src, ptr noali
; CHECK-LABEL: define void @shared_wide_load_with_cast_not_narrowed(
; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[SCALES:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP4]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP8]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [16 x i8], ptr [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x double>, ptr [[TMP0]], align 8
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> [[WIDE_VEC]])
-; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[STRIDED_VEC]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [4 x i8], ptr [[SCALES]], i64 [[INDEX]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP1]], align 4
-; CHECK-NEXT: [[TMP12:%.*]] = fpext <vscale x 4 x float> [[WIDE_LOAD]] to <vscale x 4 x double>
-; CHECK-NEXT: [[TMP13:%.*]] = fmul <vscale x 4 x double> [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP10:%.*]] = fmul <vscale x 4 x double> [[TMP6]], [[TMP12]]
+; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP2]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = fpext <2 x float> [[BROADCAST_SPLAT]] to <2 x double>
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[WIDE_LOAD]], [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [16 x i8], ptr [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double> [[TMP13]], <vscale x 4 x double> [[TMP10]])
-; CHECK-NEXT: store <vscale x 8 x double> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TMP5]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK-NEXT: br [[EXIT:label %.*]]
; CHECK: [[SCALAR_PH]]:
;
entry:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll
index bba7d058d6637..0ef7bd035d8af 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-with-wide-ops-and-casts.ll
@@ -16,13 +16,13 @@ define void @test_2xi64_matching_zext_interleave_group(ptr noalias %dst, ptr %sr
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i64 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
-; VF2-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP3]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
@@ -88,13 +88,13 @@ define void @test_2xi64_matching_sext_interleave_group(ptr noalias %dst, ptr %sr
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i64 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
; VF2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
-; VF2-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP3]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: store <2 x i64> [[TMP2]], ptr [[TMP3]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
@@ -234,14 +234,14 @@ define void @test_2xi64_matching_cast_add_interleave_group(ptr noalias %dst, ptr
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP1]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i64 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP2:%.*]] = zext <2 x i32> [[WIDE_LOAD]] to <2 x i64>
; VF2-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP2]], splat (i64 2)
; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
-; VF2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP4]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
@@ -390,14 +390,14 @@ define void @test_2xi64_add_cast_interleave_group(ptr noalias %dst, ptr %src) {
; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; VF2-NEXT: [[TMP0:%.*]] = shl nsw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[INDEX]]
-; VF2-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP1]], align 8
+; VF2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP1]], align 8
+; VF2-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i64 0
+; VF2-NEXT: [[WIDE_LOAD:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
; VF2-NEXT: [[TMP2:%.*]] = add <2 x i32> [[WIDE_LOAD]], splat (i32 2)
; VF2-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64>
; VF2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[DST]], i64 [[TMP0]]
-; VF2-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; VF2-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; VF2-NEXT: store <4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
-; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; VF2-NEXT: store <2 x i64> [[TMP3]], ptr [[TMP4]], align 8
+; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; VF2-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
; VF2-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; VF2: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
index 114ed5b0a4e5e..1e417a5588833 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll
@@ -149,6 +149,7 @@ define i32 @reverse_store_with_partial_reduction(ptr noalias %dst, ptr noalias %
; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP5]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -194,7 +195,6 @@ define i32 @reverse_store_with_partial_reduction(ptr noalias %dst, ptr noalias %
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF8:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
index 5380658a84653..5fc88479ac42a 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/exit-branch-cost.ll
@@ -23,6 +23,7 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 24
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -142,7 +143,6 @@ define i1 @select_exit_cond(ptr %start, ptr %end, i64 %N) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
index 5c494af1289ef..cb384eaeb64fb 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
@@ -253,6 +253,7 @@ define void @f2(ptr noalias %A, ptr noalias %B, i32 %n) {
; VF-TWO-CHECK: [[VECTOR_PH]]:
; VF-TWO-CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32
; VF-TWO-CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; VF-TWO-CHECK-NEXT: [[IND_END18:%.*]] = trunc i64 [[N_VEC]] to i32
; VF-TWO-CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; VF-TWO-CHECK: [[VECTOR_BODY]]:
; VF-TWO-CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -316,7 +317,6 @@ define void @f2(ptr noalias %A, ptr noalias %B, i32 %n) {
; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; VF-TWO-CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; VF-TWO-CHECK-NEXT: [[IND_END18:%.*]] = trunc i64 [[N_VEC]] to i32
; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
; VF-TWO-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]]
; VF-TWO-CHECK: [[VEC_EPILOG_PH]]:
@@ -367,6 +367,7 @@ define void @f2(ptr noalias %A, ptr noalias %B, i32 %n) {
; VF-FOUR-CHECK: [[VECTOR_PH]]:
; VF-FOUR-CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 32
; VF-FOUR-CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; VF-FOUR-CHECK-NEXT: [[IND_END18:%.*]] = trunc i64 [[N_VEC]] to i32
; VF-FOUR-CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; VF-FOUR-CHECK: [[VECTOR_BODY]]:
; VF-FOUR-CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -430,7 +431,6 @@ define void @f2(ptr noalias %A, ptr noalias %B, i32 %n) {
; VF-FOUR-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; VF-FOUR-CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; VF-FOUR-CHECK-NEXT: [[IND_END18:%.*]] = trunc i64 [[N_VEC]] to i32
; VF-FOUR-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; VF-FOUR-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]]
; VF-FOUR-CHECK: [[VEC_EPILOG_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/conditional-scalar-assignment-vplan.ll b/llvm/test/Transforms/LoopVectorize/VPlan/conditional-scalar-assignment-vplan.ll
index a270993075b4a..cf47da0cae2ea 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/conditional-scalar-assignment-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/conditional-scalar-assignment-vplan.ll
@@ -256,8 +256,8 @@ define i32 @simple_csa_int_load(ptr noalias %a, ptr noalias %b, i32 %default_val
; CHECK-TF-NEXT: <x1> vector loop: {
; CHECK-TF-NEXT: vector.body:
; CHECK-TF-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
-; CHECK-TF-NEXT: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<%default_val>, vp<[[VP15:%[0-9]+]]>
-; CHECK-TF-NEXT: WIDEN-PHI vp<[[VP5:%[0-9]+]]> = phi [ ir<false>, vector.ph ], [ vp<[[VP14:%[0-9]+]]>, vector.body ]
+; CHECK-TF-NEXT: WIDEN-REDUCTION-PHI ir<%data.phi> = phi ir<%default_val>, vp<[[VP14:%[0-9]+]]>
+; CHECK-TF-NEXT: WIDEN-PHI vp<[[VP5:%[0-9]+]]> = phi [ ir<false>, vector.ph ], [ vp<[[VP13:%[0-9]+]]>, vector.body ]
; CHECK-TF-NEXT: vp<[[VP6:%[0-9]+]]> = SCALAR-STEPS vp<[[VP4]]>, ir<1>, vp<[[VP0]]>
; CHECK-TF-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = WIDEN-CANONICAL-INDUCTION vp<[[VP4]]>
; CHECK-TF-NEXT: EMIT vp<[[VP8:%[0-9]+]]> = icmp ule vp<[[VP7]]>, vp<[[VP3]]>
@@ -269,10 +269,9 @@ define i32 @simple_csa_int_load(ptr noalias %a, ptr noalias %b, i32 %default_val
; CHECK-TF-NEXT: CLONE ir<%b.addr> = getelementptr ir<%b>, vp<[[VP6]]>
; CHECK-TF-NEXT: vp<[[VP11:%[0-9]+]]> = vector-pointer ir<%b.addr>
; CHECK-TF-NEXT: WIDEN ir<%ld.b> = load vp<[[VP11]]>, vp<[[VP10]]>
-; CHECK-TF-NEXT: EMIT vp<[[VP12:%[0-9]+]]> = logical-and vp<[[VP8]]>, vp<[[VP10]]>
-; CHECK-TF-NEXT: EMIT vp<[[VP13:%[0-9]+]]> = any-of vp<[[VP12]]>
-; CHECK-TF-NEXT: EMIT vp<[[VP14]]> = select vp<[[VP13]]>, vp<[[VP12]]>, vp<[[VP5]]>
-; CHECK-TF-NEXT: EMIT vp<[[VP15]]> = select vp<[[VP13]]>, ir<%ld.b>, ir<%data.phi>
+; CHECK-TF-NEXT: EMIT vp<[[VP12:%[0-9]+]]> = any-of vp<[[VP10]]>
+; CHECK-TF-NEXT: EMIT vp<[[VP13]]> = select vp<[[VP12]]>, vp<[[VP10]]>, vp<[[VP5]]>
+; CHECK-TF-NEXT: EMIT vp<[[VP14]]> = select vp<[[VP12]]>, ir<%ld.b>, ir<%data.phi>
; CHECK-TF-NEXT: EMIT vp<%index.next> = add vp<[[VP4]]>, vp<[[VP1]]>
; CHECK-TF-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2]]>
; CHECK-TF-NEXT: No successors
@@ -280,11 +279,11 @@ define i32 @simple_csa_int_load(ptr noalias %a, ptr noalias %b, i32 %default_val
; CHECK-TF-NEXT: Successor(s): middle.block
; CHECK-TF-EMPTY:
; CHECK-TF-NEXT: middle.block:
-; CHECK-TF-NEXT: EMIT vp<[[VP17:%[0-9]+]]> = extract-last-active ir<%default_val>, vp<[[VP15]]>, vp<[[VP14]]>
+; CHECK-TF-NEXT: EMIT vp<[[VP16:%[0-9]+]]> = extract-last-active ir<%default_val>, vp<[[VP14]]>, vp<[[VP13]]>
; CHECK-TF-NEXT: Successor(s): ir-bb<exit>
; CHECK-TF-EMPTY:
; CHECK-TF-NEXT: ir-bb<exit>:
-; CHECK-TF-NEXT: IR %select.data.lcssa = phi i32 [ %select.data, %latch ] (extra operand: vp<[[VP17]]> from middle.block)
+; CHECK-TF-NEXT: IR %select.data.lcssa = phi i32 [ %select.data, %latch ] (extra operand: vp<[[VP16]]> from middle.block)
; CHECK-TF-NEXT: No successors
; CHECK-TF-EMPTY:
; CHECK-TF-NEXT: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll b/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
index f28619b37f29c..10a8b1afa5801 100644
--- a/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
+++ b/llvm/test/Transforms/LoopVectorize/VPlan/predicator.ll
@@ -380,3 +380,257 @@ bb5:
exit:
ret void
}
+
+; bb0
+; / \
+; bb1 bb2
+; | \ / |
+; | bb4 |
+; \ | /
+; bb5
+;
+; The blend masks for %phi in bb4 should be:
+; bb1 := bb0->bb1 := c0
+; bb2 := bb0->bb2 := !c0
+define void @diamond_phi2(ptr %a, i1 %c1, i1 %c2) {
+; CHECK-LABEL: VPlan for loop in 'diamond_phi2'
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION nuw nsw ir<0>, ir<1>, vp<[[VP0:%[0-9]+]]>
+; CHECK-NEXT: EMIT ir<%c0> = icmp sle ir<%iv>, ir<0>
+; CHECK-NEXT: Successor(s): bb2
+; CHECK-EMPTY:
+; CHECK-NEXT: bb2:
+; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = not ir<%c0>
+; CHECK-NEXT: EMIT ir<%add2> = add ir<%iv>, ir<2>, vp<[[VP4]]>
+; CHECK-NEXT: Successor(s): bb1
+; CHECK-EMPTY:
+; CHECK-NEXT: bb1:
+; CHECK-NEXT: EMIT ir<%add1> = add ir<%iv>, ir<1>, ir<%c0>
+; CHECK-NEXT: Successor(s): bb4
+; CHECK-EMPTY:
+; CHECK-NEXT: bb4:
+; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = logical-and vp<[[VP4]]>, ir<%c2>
+; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = logical-and ir<%c0>, ir<%c1>
+; CHECK-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = or vp<[[VP5]]>, vp<[[VP6]]>
+; CHECK-NEXT: BLEND ir<%phi> = ir<%add2>/vp<[[VP5]]> ir<%add1>/vp<[[VP6]]>
+; CHECK-NEXT: EMIT ir<%gep> = getelementptr ir<%a>, ir<%iv>
+; CHECK-NEXT: EMIT store ir<%phi>, ir<%gep>, vp<[[VP7]]>
+; CHECK-NEXT: Successor(s): bb5
+; CHECK-EMPTY:
+; CHECK-NEXT: bb5:
+; CHECK-NEXT: EMIT ir<%iv.next> = add nuw nsw ir<%iv>, ir<1>
+; CHECK-NEXT: EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<128>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1:%[0-9]+]]>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2:%[0-9]+]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+;
+entry:
+ br label %bb0
+
+bb0:
+ %iv = phi i64 [0, %entry], [%iv.next, %bb5]
+ %c0 = icmp sle i64 %iv, 0
+ br i1 %c0, label %bb1, label %bb2
+
+bb1:
+ %add1 = add i64 %iv, 1
+ br i1 %c1, label %bb4, label %bb5
+
+bb2:
+ %add2 = add i64 %iv, 2
+ br i1 %c2, label %bb4, label %bb5
+
+bb4:
+ %phi = phi i64 [%add1, %bb1], [%add2, %bb2]
+ %gep = getelementptr i64, ptr %a, i64 %iv
+ store i64 %phi, ptr %gep
+ br label %bb5
+
+bb5:
+ %iv.next = add nsw nuw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 128
+ br i1 %ec, label %exit, label %bb0
+
+exit:
+ ret void
+}
+
+; bb0
+; / \
+; bb1 bb2
+; / \ |
+; bb3 bb4 |
+; / \ / \ /
+; \ bb5 bb6
+; \ \ / /
+; \ bb7 /
+; \ | /
+; bb8
+;
+; The blend masks for %phi in bb7 should be:
+; bb5 := bb1->bb3 v bb4->bb5 := (c0 && c1) || (c0 && !c1 && c3)
+; bb6 := bb4->bb6 v bb0->bb2 := (c0 && !c1 && !c3) || !c1
+define void @blend_masks(ptr noalias %p, i1 %c0, i1 %c1, i1 %c2, i1 %c3, i1 %c4) {
+; CHECK-LABEL: VPlan for loop in 'blend_masks'
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VP0:%[0-9]+]]>
+; CHECK-NEXT: Successor(s): bb2
+; CHECK-EMPTY:
+; CHECK-NEXT: bb2:
+; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = not ir<%c0>
+; CHECK-NEXT: Successor(s): bb1
+; CHECK-EMPTY:
+; CHECK-NEXT: bb1:
+; CHECK-NEXT: Successor(s): bb4
+; CHECK-EMPTY:
+; CHECK-NEXT: bb4:
+; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = not ir<%c1>
+; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = logical-and ir<%c0>, vp<[[VP5]]>
+; CHECK-NEXT: Successor(s): bb6
+; CHECK-EMPTY:
+; CHECK-NEXT: bb6:
+; CHECK-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = not ir<%c3>
+; CHECK-NEXT: EMIT vp<[[VP8:%[0-9]+]]> = logical-and vp<[[VP6]]>, vp<[[VP7]]>
+; CHECK-NEXT: EMIT vp<[[VP9:%[0-9]+]]> = or vp<[[VP8]]>, vp<[[VP4]]>
+; CHECK-NEXT: Successor(s): bb3
+; CHECK-EMPTY:
+; CHECK-NEXT: bb3:
+; CHECK-NEXT: EMIT vp<[[VP10:%[0-9]+]]> = logical-and ir<%c0>, ir<%c1>
+; CHECK-NEXT: Successor(s): bb5
+; CHECK-EMPTY:
+; CHECK-NEXT: bb5:
+; CHECK-NEXT: EMIT vp<[[VP11:%[0-9]+]]> = logical-and vp<[[VP6]]>, ir<%c3>
+; CHECK-NEXT: EMIT vp<[[VP12:%[0-9]+]]> = not ir<%c2>
+; CHECK-NEXT: EMIT vp<[[VP13:%[0-9]+]]> = logical-and vp<[[VP10]]>, vp<[[VP12]]>
+; CHECK-NEXT: EMIT vp<[[VP14:%[0-9]+]]> = or vp<[[VP11]]>, vp<[[VP13]]>
+; CHECK-NEXT: Successor(s): bb7
+; CHECK-EMPTY:
+; CHECK-NEXT: bb7:
+; CHECK-NEXT: EMIT vp<[[VP15:%[0-9]+]]> = logical-and vp<[[VP9]]>, ir<%c4>
+; CHECK-NEXT: EMIT vp<[[VP16:%[0-9]+]]> = or vp<[[VP15]]>, vp<[[VP14]]>
+; CHECK-NEXT: BLEND ir<%phi> = ir<1>/vp<[[VP15]]> ir<0>/vp<[[VP14]]>
+; CHECK-NEXT: EMIT ir<%gep> = getelementptr ir<%p>, ir<%iv>
+; CHECK-NEXT: EMIT store ir<%phi>, ir<%gep>, vp<[[VP16]]>
+; CHECK-NEXT: Successor(s): bb8
+; CHECK-EMPTY:
+; CHECK-NEXT: bb8:
+; CHECK-NEXT: EMIT ir<%iv.next> = add ir<%iv>, ir<1>
+; CHECK-NEXT: EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<128>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1:%[0-9]+]]>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2:%[0-9]+]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+;
+entry:
+ br label %bb0
+
+bb0:
+ %iv = phi i32 [0, %entry], [%iv.next, %bb8]
+ br i1 %c0, label %bb1, label %bb2
+
+bb1:
+ br i1 %c1, label %bb3, label %bb4
+
+bb2:
+ br label %bb6
+
+bb3:
+ br i1 %c2, label %bb8, label %bb5
+
+bb4:
+ br i1 %c3, label %bb5, label %bb6
+
+bb5:
+ br label %bb7
+
+bb6:
+ br i1 %c4, label %bb7, label %bb8
+
+bb7:
+ %phi = phi i32 [0, %bb5], [1, %bb6]
+ %gep = getelementptr i32, ptr %p, i32 %iv
+ store i32 %phi, ptr %gep
+ br label %bb8
+
+bb8:
+ %iv.next = add i32 %iv, 1
+ %ec = icmp eq i32 %iv.next, 128
+ br i1 %ec, label %exit, label %bb0
+
+exit:
+ ret void
+}
+
+; bb0
+; / |
+; bb1 |
+; | \ |
+; | bb2
+; | /
+; bb3
+;
+; The blend masks for %phi in bb3 should be:
+; bb0 := bb1->bb2 v bb0->bb2 := c0 && !c1 || !c0
+; bb1 := bb1->bb3 := c0 && c1
+define void @blend_masks_triangle_phi(ptr noalias %p, i1 %c0, i1 %c1) {
+; CHECK-LABEL: VPlan for loop in 'blend_masks_triangle_phi'
+; CHECK-NEXT: <x1> vector loop: {
+; CHECK-NEXT: vector.body:
+; CHECK-NEXT: EMIT vp<[[VP3:%[0-9]+]]> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
+; CHECK-NEXT: ir<%iv> = WIDEN-INDUCTION ir<0>, ir<1>, vp<[[VP0:%[0-9]+]]>
+; CHECK-NEXT: Successor(s): bb1
+; CHECK-EMPTY:
+; CHECK-NEXT: bb1:
+; CHECK-NEXT: Successor(s): bb2
+; CHECK-EMPTY:
+; CHECK-NEXT: bb2:
+; CHECK-NEXT: EMIT vp<[[VP4:%[0-9]+]]> = not ir<%c1>
+; CHECK-NEXT: EMIT vp<[[VP5:%[0-9]+]]> = logical-and ir<%c0>, vp<[[VP4]]>
+; CHECK-NEXT: EMIT vp<[[VP6:%[0-9]+]]> = not ir<%c0>
+; CHECK-NEXT: EMIT vp<[[VP7:%[0-9]+]]> = or vp<[[VP5]]>, vp<[[VP6]]>
+; CHECK-NEXT: Successor(s): bb3
+; CHECK-EMPTY:
+; CHECK-NEXT: bb3:
+; CHECK-NEXT: EMIT vp<[[VP8:%[0-9]+]]> = logical-and ir<%c0>, ir<%c1>
+; CHECK-NEXT: BLEND ir<%phi> = ir<1>/vp<[[VP7]]> ir<0>/vp<[[VP8]]>
+; CHECK-NEXT: EMIT ir<%gep> = getelementptr ir<%p>, ir<%iv>
+; CHECK-NEXT: EMIT store ir<%phi>, ir<%gep>
+; CHECK-NEXT: EMIT ir<%iv.next> = add ir<%iv>, ir<1>
+; CHECK-NEXT: EMIT ir<%ec> = icmp eq ir<%iv.next>, ir<128>
+; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<[[VP3]]>, vp<[[VP1:%[0-9]+]]>
+; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<[[VP2:%[0-9]+]]>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+; CHECK-NEXT: Successor(s): middle.block
+;
+entry:
+ br label %bb0
+
+bb0:
+ %iv = phi i32 [0, %entry], [%iv.next, %bb3]
+ br i1 %c0, label %bb1, label %bb2
+
+bb1:
+ br i1 %c1, label %bb3, label %bb2
+
+bb2:
+ br label %bb3
+
+bb3:
+ %phi = phi i32 [0, %bb1], [1, %bb2]
+ %gep = getelementptr i32, ptr %p, i32 %iv
+ store i32 %phi, ptr %gep
+ %iv.next = add i32 %iv, 1
+ %ec = icmp eq i32 %iv.next, 128
+ br i1 %ec, label %exit, label %bb0
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index 94ebf01509ec2..da69c7a06a01d 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -35,7 +35,6 @@ define void @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwi
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[IND_END5:%.*]] = add i64 3, [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: vec.epilog.ph:
@@ -63,7 +62,7 @@ define void @conversion_cost1(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwi
; CHECK-NEXT: [[CMP_N12:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N12]], label [[DOT_CRIT_EDGE_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 3, [[ITER_CHECK]] ]
; CHECK-NEXT: br label [[DOTLR_PH:%.*]]
; CHECK: .lr.ph:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
index 58b21f9bc816d..99efdf03f2912 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -84,6 +84,7 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END9:%.*]] = mul i64 [[N_VEC]], 32
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -222,7 +223,6 @@ define float @PR27826(ptr nocapture readonly %a, ptr nocapture readonly %b, i32
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], [[LOOPEXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[IND_END9:%.*]] = mul i64 [[N_VEC]], 32
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
index 1b336eb495c91..36680495a59be 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll
@@ -53,7 +53,6 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[IND_END6:%.*]] = add i64 [[IV_START]], [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: vec.epilog.ph:
@@ -87,7 +86,7 @@ define void @test_pr59459(i64 %iv.start, ptr %arr) {
; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC4]]
; CHECK-NEXT: br i1 [[CMP_N16]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i64 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], [[VECTOR_SCEVCHECK]] ], [ [[IV_START]], [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i64 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IV_START]], [[VECTOR_SCEVCHECK]] ], [ [[IV_START]], [[ITER_CHECK:%.*]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL7]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -175,8 +174,6 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l,
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[L]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[DOTCAST9:%.*]] = trunc i64 [[N_VEC]] to i16
-; CHECK-NEXT: [[IND_END10:%.*]] = mul i16 [[DOTCAST9]], [[TMP0]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]]
; CHECK: vec.epilog.ph:
@@ -213,7 +210,7 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l,
; CHECK-NEXT: br i1 [[CMP_N25]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL11:%.*]] = phi i16 [ [[IND_END8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END10]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL11:%.*]] = phi i16 [ [[IND_END8]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
index f964fc6f67854..a2dc1edf2345c 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll
@@ -56,9 +56,6 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 {
; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; AUTO_VEC-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; AUTO_VEC: [[VEC_EPILOG_ITER_CHECK]]:
-; AUTO_VEC-NEXT: [[DOTCAST12:%.*]] = sitofp i64 [[N_VEC]] to float
-; AUTO_VEC-NEXT: [[TMP11:%.*]] = fmul fast float 5.000000e-01, [[DOTCAST12]]
-; AUTO_VEC-NEXT: [[IND_END1:%.*]] = fadd fast float 1.000000e+00, [[TMP11]]
; AUTO_VEC-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; AUTO_VEC-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; AUTO_VEC: [[VEC_EPILOG_PH]]:
@@ -87,7 +84,7 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 {
; AUTO_VEC-NEXT: br i1 [[CMP_N9]], label %[[FOR_END_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; AUTO_VEC: [[VEC_EPILOG_SCALAR_PH]]:
; AUTO_VEC-NEXT: [[BC_RESUME_VAL12:%.*]] = phi i64 [ [[N_VEC3]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; AUTO_VEC-NEXT: [[BC_RESUME_VAL13:%.*]] = phi float [ [[TMP10]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, %[[ITER_CHECK]] ]
+; AUTO_VEC-NEXT: [[BC_RESUME_VAL13:%.*]] = phi float [ [[TMP10]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, %[[ITER_CHECK]] ]
; AUTO_VEC-NEXT: br label %[[LOOP:.*]]
; AUTO_VEC: [[LOOP]]:
; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[LOOP]] ], [ [[BC_RESUME_VAL12]], %[[VEC_EPILOG_SCALAR_PH]] ]
@@ -349,9 +346,6 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) {
; AUTO_VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; AUTO_VEC-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; AUTO_VEC: [[VEC_EPILOG_ITER_CHECK]]:
-; AUTO_VEC-NEXT: [[DOTCAST16:%.*]] = sitofp i64 [[N_VEC]] to float
-; AUTO_VEC-NEXT: [[TMP12:%.*]] = fmul reassoc float 4.200000e+01, [[DOTCAST16]]
-; AUTO_VEC-NEXT: [[IND_END1:%.*]] = fadd reassoc float 1.000000e+00, [[TMP12]]
; AUTO_VEC-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; AUTO_VEC-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]]
; AUTO_VEC: [[VEC_EPILOG_PH]]:
@@ -382,7 +376,7 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) {
; AUTO_VEC-NEXT: br i1 [[CMP_N18]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; AUTO_VEC: [[VEC_EPILOG_SCALAR_PH]]:
; AUTO_VEC-NEXT: [[BC_RESUME_VAL16:%.*]] = phi i64 [ [[N_VEC6]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; AUTO_VEC-NEXT: [[BC_RESUME_VAL17:%.*]] = phi float [ [[TMP18]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END1]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, %[[ITER_CHECK]] ]
+; AUTO_VEC-NEXT: [[BC_RESUME_VAL17:%.*]] = phi float [ [[TMP18]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 1.000000e+00, %[[ITER_CHECK]] ]
; AUTO_VEC-NEXT: br label %[[LOOP:.*]]
; AUTO_VEC: [[LOOP]]:
; AUTO_VEC-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL16]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index aff665dad85a7..aa0de6ad15d70 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -679,6 +679,8 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
; AVX512: vector.ph:
; AVX512-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
; AVX512-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; AVX512-NEXT: [[TMP23:%.*]] = mul i64 [[N_VEC]], 4
+; AVX512-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP23]]
; AVX512-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 64
; AVX512-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP13]]
; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -702,10 +704,6 @@ define void @test_gather_not_profitable_pr48429(i32 %d, ptr readonly noalias %pt
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX512: vec.epilog.iter.check:
-; AVX512-NEXT: [[TMP23:%.*]] = mul i64 [[N_VEC]], 4
-; AVX512-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[TMP23]]
-; AVX512-NEXT: [[TMP38:%.*]] = mul i64 [[N_VEC]], 64
-; AVX512-NEXT: [[IND_END15:%.*]] = getelementptr i8, ptr [[DEST]], i64 [[TMP38]]
; AVX512-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; AVX512-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF17:![0-9]+]]
; AVX512: vec.epilog.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
index ad6dfb054b726..bd1fc6eba4a2e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
@@ -23,15 +23,21 @@ define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 64
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[IND_END10:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST1]]
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[N_VEC]], 2
+; CHECK-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 2
+; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[TMP13]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[OFFSET:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[OFFSET_IDX2]]
+; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[OFFSET_IDX2]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 16
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 32
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 48
@@ -57,12 +63,6 @@ define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[DOTCAST1:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[IND_END10:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST1]]
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-NEXT: [[IND_END12:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP13]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: vec.epilog.ph:
@@ -153,13 +153,17 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 128
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[DOTCAST8:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[IND_END9:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST8]]
+; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[N_VEC]]
+; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[N_VEC]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i8> poison, i8 [[OFFSET:%.*]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i8> [[BROADCAST_SPLATINSERT]], <32 x i8> poison, <32 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96
@@ -185,10 +189,6 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[DOTCAST8:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[IND_END9:%.*]] = sub i32 [[BLOCKSIZE]], [[DOTCAST8]]
-; CHECK-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[N_VEC]]
-; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF7:![0-9]+]]
; CHECK: vec.epilog.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index 9ef4e205a970d..e318efdac5b32 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -67,9 +67,6 @@ define void @_Z3fn1v() #0 {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT99:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[TMP64:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-NEXT: [[IND_END9:%.*]] = add i64 8, [[TMP64]]
-; CHECK-NEXT: [[IND_END12:%.*]] = mul i64 [[N_VEC]], 2
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
; CHECK: [[VEC_EPILOG_PH]]:
@@ -110,8 +107,8 @@ define void @_Z3fn1v() #0 {
; CHECK-NEXT: [[CMP_N23:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC7]]
; CHECK-NEXT: br i1 [[CMP_N23]], label %[[FOR_COND_CLEANUP_LOOPEXIT99]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i64 [ [[IND_END8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END9]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 8, %[[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL18:%.*]] = phi i64 [ [[IND_END11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END12]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i64 [ [[IND_END8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 8, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL18:%.*]] = phi i64 [ [[IND_END11]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
; CHECK: [[ITER_CHECK22]]:
; CHECK-NEXT: [[TMP26:%.*]] = add nsw i64 [[TMP3]], -9
@@ -156,9 +153,6 @@ define void @_Z3fn1v() #0 {
; CHECK-NEXT: [[CMP_N40:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC32]]
; CHECK-NEXT: br i1 [[CMP_N40]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK43:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK43]]:
-; CHECK-NEXT: [[TMP42:%.*]] = mul i64 [[N_VEC32]], 2
-; CHECK-NEXT: [[IND_END55:%.*]] = add i64 8, [[TMP42]]
-; CHECK-NEXT: [[IND_END58:%.*]] = mul i64 [[N_VEC32]], 2
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK50:%.*]] = icmp ult i64 [[N_MOD_VF31]], 8
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK50]], label %[[VEC_EPILOG_SCALAR_PH42]], label %[[VEC_EPILOG_PH45]], !prof [[PROF3]]
; CHECK: [[VEC_EPILOG_PH45]]:
@@ -204,8 +198,8 @@ define void @_Z3fn1v() #0 {
; CHECK-NEXT: [[CMP_N65:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]]
; CHECK-NEXT: br i1 [[CMP_N65]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[VEC_EPILOG_SCALAR_PH42]]
; CHECK: [[VEC_EPILOG_SCALAR_PH42]]:
-; CHECK-NEXT: [[BC_RESUME_VAL65:%.*]] = phi i64 [ [[IND_END54]], %[[VEC_EPILOG_MIDDLE_BLOCK63]] ], [ [[IND_END55]], %[[VEC_EPILOG_ITER_CHECK43]] ], [ 8, %[[ITER_CHECK22]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL66:%.*]] = phi i64 [ [[IND_END57]], %[[VEC_EPILOG_MIDDLE_BLOCK63]] ], [ [[IND_END58]], %[[VEC_EPILOG_ITER_CHECK43]] ], [ 0, %[[ITER_CHECK22]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL65:%.*]] = phi i64 [ [[IND_END54]], %[[VEC_EPILOG_MIDDLE_BLOCK63]] ], [ [[IND_END41]], %[[VEC_EPILOG_ITER_CHECK43]] ], [ 8, %[[ITER_CHECK22]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL66:%.*]] = phi i64 [ [[IND_END57]], %[[VEC_EPILOG_MIDDLE_BLOCK63]] ], [ [[IND_END43]], %[[VEC_EPILOG_ITER_CHECK43]] ], [ 0, %[[ITER_CHECK22]] ]
; CHECK-NEXT: br label %[[FOR_BODY_US:.*]]
; CHECK: [[FOR_BODY_US]]:
; CHECK-NEXT: [[INDVARS_IV78:%.*]] = phi i64 [ [[INDVARS_IV_NEXT79:%.*]], %[[FOR_COND_CLEANUP4_US_LCSSA_US_US:.*]] ], [ [[BC_RESUME_VAL65]], %[[VEC_EPILOG_SCALAR_PH42]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory-epilogue-vec.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory-epilogue-vec.ll
index 64c174fb3eb2b..014392e0bf3f8 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory-epilogue-vec.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory-epilogue-vec.ll
@@ -53,10 +53,9 @@ define void @test_4xi64_epilogue_vec(ptr noalias %data, i64 noundef %n) {
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ITER_CHECK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds [[S4]], ptr [[DATA]], i64 [[IV]], i32 0
; CHECK-NEXT: store i64 1, ptr [[P0]], align 8
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds [[S4]], ptr [[DATA]], i64 [[IV]], i32 1
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
index a8d68692428dc..52526906bdeb7 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -83,10 +83,9 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br label %[[EXIT]]
; CHECK: [[SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[N_VEC1]], %[[VEC_EPILOG_ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
-; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
index 35ef01379c756..5d7b01943a0c8 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll
@@ -241,7 +241,6 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[IND_END:%.*]] = trunc i32 [[N_VEC]] to i8
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]]
; CHECK: [[VEC_EPILOG_PH]]:
@@ -279,7 +278,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N14]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i8 [ [[TMP13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i8 [ [[TMP13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX14:%.*]] = phi i8 [ [[RDX_SELECT13]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
@@ -396,7 +395,7 @@ define i64 @test_vectorize_select_smin_first_idx(ptr %src, i64 %n) {
; CHECK-NEXT: br i1 [[CMP_N21]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL22:%.*]] = phi i64 [ [[N_VEC5]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX23:%.*]] = phi i64 [ [[TMP21]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX23:%.*]] = phi i64 [ [[TMP21]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP10]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX24:%.*]] = phi i64 [ [[TMP16]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP5]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
index 0607ec38e7e46..e5377cbfc35d3 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-any-of-reductions.ll
@@ -225,7 +225,6 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[IND_END6:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]]
; CHECK: vec.epilog.ph:
@@ -263,7 +262,7 @@ define i1 @any_of_reduction_i1_epilog(i64 %N, i32 %a) {
; CHECK: vec.epilog.scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-NEXT: [[BC_MERGE_RDX17:%.*]] = phi i1 [ [[RDX_SELECT16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], [[VEC_EPILOG_ITER_CHECK]] ], [ false, [[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END6]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL7:%.*]] = phi i32 [ [[IND_END5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
@@ -318,6 +317,8 @@ define i1 @any_of_reduction_i1_epilog2(ptr %start, ptr %end, i64 %x) {
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[N_VEC]], 16
+; CHECK-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP24]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[X]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
@@ -356,8 +357,6 @@ define i1 @any_of_reduction_i1_epilog2(ptr %start, ptr %end, i64 %x) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[TMP24:%.*]] = mul i64 [[N_VEC]], 16
-; CHECK-NEXT: [[IND_END9:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP24]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]]
; CHECK: vec.epilog.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
index 76f30decf81e7..4a0dbf893ca79 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
@@ -699,7 +699,6 @@ define i1 @reduction_with_const_or(ptr %A, i8 %n) {
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[IND_END:%.*]] = trunc i32 [[N_VEC]] to i8
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i32 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]]
; CHECK: [[VEC_EPILOG_PH]]:
@@ -727,8 +726,8 @@ define i1 @reduction_with_const_or(ptr %A, i8 %n) {
; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[CMP_N8]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ true, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ false, %[[VEC_EPILOG_ITER_CHECK]] ], [ false, %[[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi i8 [ [[TMP8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i1 [ true, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ true, %[[VEC_EPILOG_ITER_CHECK]] ], [ false, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL9:%.*]] = phi i8 [ [[TMP8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP2]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[LOOP:.*]]
; CHECK: [[LOOP]]:
; CHECK-NEXT: [[OR_RED:%.*]] = phi i1 [ [[BC_MERGE_RDX]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[OR_NEXT:%.*]], %[[LOOP]] ]
@@ -787,7 +786,6 @@ define i16 @test_no_op_or_reduction_single_vector_iteration(i64 %N) {
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]]
; CHECK: [[VEC_EPILOG_PH]]:
-; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ [[TMP0]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[CLAMPED]], 4
; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[CLAMPED]], [[N_MOD_VF2]]
@@ -830,3 +828,96 @@ loop:
exit:
ret i16 %red.next
}
+
+; Test case for https://github.com/llvm/llvm-project/issues/179407.
+define i32 @test_foldable_reduction(i64 %N) {
+; CHECK-LABEL: define i32 @test_foldable_reduction(
+; CHECK-SAME: i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ITER_CHECK:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[VEC_PHI]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[VEC_PHI]])
+; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> splat (i1 true))
+; CHECK-NEXT: [[TMP4:%.*]] = freeze i1 [[TMP3]]
+; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP4]], i32 0, i32 0
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
+; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]]
+; CHECK: [[VEC_EPILOG_PH]]:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: [[N_MOD_VF3:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC4:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF3]]
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP5]], %[[VEC_EPILOG_PH]] ], [ [[VEC_PHI6]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX5]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC4]]
+; CHECK-NEXT: br i1 [[TMP6]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
+; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[VEC_PHI6]])
+; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> splat (i1 true))
+; CHECK-NEXT: [[TMP9:%.*]] = freeze i1 [[TMP8]]
+; CHECK-NEXT: [[RDX_SELECT8:%.*]] = select i1 [[TMP9]], i32 0, i32 0
+; CHECK-NEXT: [[CMP_N9:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC4]]
+; CHECK-NEXT: br i1 [[CMP_N9]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC4]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ [[TMP7]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP2]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i32 [ [[RDX_SELECT8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RED_1:%.*]] = phi i32 [ [[BC_MERGE_RDX10]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_1_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[RED_2:%.*]] = phi i32 [ [[BC_MERGE_RDX11]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_2_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i32 0, 0
+; CHECK-NEXT: [[RED_2_NEXT]] = select i1 [[ICMP]], i32 0, i32 [[RED_2]]
+; CHECK-NEXT: [[RED_1_NEXT]] = or i32 [[RED_1]], 0
+; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT: [[EC:%.*]] = icmp ne i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP32:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[R_1:%.*]] = phi i32 [ [[RED_2_NEXT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_SELECT8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[R_2:%.*]] = phi i32 [ [[RED_1_NEXT]], %[[LOOP]] ], [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[TMP7]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[R:%.*]] = add i32 [[R_1]], [[R_2]]
+; CHECK-NEXT: ret i32 [[R]]
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %red.1 = phi i32 [ 0, %entry ], [ %red.1.next, %loop ]
+ %red.2 = phi i32 [ 0, %entry ], [ %red.2.next, %loop ]
+ %icmp = icmp eq i32 0, 0
+ %red.2.next = select i1 %icmp, i32 0, i32 %red.2
+  %red.1.next = or i32 %red.1, 0
+ %iv.next = add i64 %iv, 1
+ %ec = icmp ne i64 %iv, %N
+ br i1 %ec, label %loop, label %exit
+
+exit:
+ %r.1 = phi i32 [ %red.2.next, %loop ]
+ %r.2 = phi i32 [ %red.1.next, %loop ]
+ %r = add i32 %r.1, %r.2
+ ret i32 %r
+}
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
index 143da205f18ba..b20a3b0063834 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll
@@ -165,6 +165,7 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n)
; CHECK: [[VECTOR_PH]]:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END4:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -186,7 +187,6 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n)
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_END_LOOPEXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[IND_END4:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]]
; CHECK: [[VEC_EPILOG_PH]]:
@@ -469,7 +469,6 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
; CHECK: [[MIDDLE_BLOCK]]:
; CHECK-NEXT: br i1 true, label %[[OUTER_LATCH]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-NEXT: [[IND_END5:%.*]] = mul i8 84, [[INDUCTION_IV]]
; CHECK-NEXT: br i1 true, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]]
; CHECK: [[VEC_EPILOG_PH]]:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 84, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -499,7 +498,7 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
; CHECK-NEXT: br i1 true, label %[[OUTER_LATCH]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
; CHECK-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ 85, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 85, %[[VEC_EPILOG_ITER_CHECK]] ], [ 1, %[[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i8 [ [[IND_END4]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i8 [ [[IND_END4]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-NEXT: br label %[[INNER:.*]]
; CHECK: [[INNER]]:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
@@ -555,7 +554,6 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
; CHECK-PROFITABLE-BY-DEFAULT: [[MIDDLE_BLOCK]]:
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 true, label %[[OUTER_LATCH]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
; CHECK-PROFITABLE-BY-DEFAULT: [[VEC_EPILOG_ITER_CHECK]]:
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IND_END5:%.*]] = mul i8 84, [[INDUCTION_IV]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 true, label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]]
; CHECK-PROFITABLE-BY-DEFAULT: [[VEC_EPILOG_PH]]:
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 84, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
@@ -585,7 +583,7 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 true, label %[[OUTER_LATCH]], label %[[VEC_EPILOG_SCALAR_PH]]
; CHECK-PROFITABLE-BY-DEFAULT: [[VEC_EPILOG_SCALAR_PH]]:
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL13:%.*]] = phi i64 [ 85, %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 85, %[[VEC_EPILOG_ITER_CHECK]] ], [ 1, %[[ITER_CHECK]] ]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i8 [ [[IND_END4]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i8 [ [[IND_END4]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label %[[INNER:.*]]
; CHECK-PROFITABLE-BY-DEFAULT: [[INNER]]:
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL13]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[INNER]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
index 00eeb69dcb0f7..e6e8035d85c89 100644
--- a/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr128062-interleaved-accesses-narrow-group.ll
@@ -16,37 +16,14 @@ define void @pr128062(ptr %dst.start, i8 %a, i16 %b) {
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST_START]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
-; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
-; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: [[TMP0:%.*]] = zext <4 x i8> [[STRIDED_VEC]] to <4 x i16>
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw <4 x i16> [[TMP0]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP2:%.*]] = udiv <4 x i16> [[TMP1]], splat (i16 255)
-; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw <4 x i16> [[TMP2]] to <4 x i8>
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[STRIDED_VEC3]] to <4 x i16>
-; CHECK-NEXT: [[TMP6:%.*]] = mul nuw <4 x i16> [[TMP5]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP7:%.*]] = udiv <4 x i16> [[TMP6]], splat (i16 255)
-; CHECK-NEXT: [[TMP8:%.*]] = trunc nuw <4 x i16> [[TMP7]] to <4 x i8>
-; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[STRIDED_VEC4]] to <4 x i16>
-; CHECK-NEXT: [[TMP11:%.*]] = mul nuw <4 x i16> [[TMP10]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP12:%.*]] = udiv <4 x i16> [[TMP11]], splat (i16 255)
-; CHECK-NEXT: [[TMP13:%.*]] = trunc nuw <4 x i16> [[TMP12]] to <4 x i8>
-; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP13]]
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = load <4 x i8>, ptr [[NEXT_GEP]], align 1
; CHECK-NEXT: [[TMP15:%.*]] = zext <4 x i8> [[STRIDED_VEC5]] to <4 x i16>
; CHECK-NEXT: [[TMP16:%.*]] = mul nuw <4 x i16> [[TMP15]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP17:%.*]] = udiv <4 x i16> [[TMP16]], splat (i16 255)
; CHECK-NEXT: [[TMP18:%.*]] = trunc nuw <4 x i16> [[TMP17]] to <4 x i8>
; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i8> [[BROADCAST_SPLAT2]], [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> [[TMP19]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i8> [[TMP20]], <8 x i8> [[TMP21]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i8> [[TMP22]], <16 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
-; CHECK-NEXT: store <16 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: store <4 x i8> [[TMP19]], ptr [[NEXT_GEP]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
; CHECK-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/predicator.ll b/llvm/test/Transforms/LoopVectorize/predicator.ll
new file mode 100644
index 0000000000000..26013ab3784d8
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/predicator.ll
@@ -0,0 +1,236 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt < %s -S -p loop-vectorize -force-vector-width=4 -force-target-supports-masked-memory-ops | FileCheck %s
+
+; bb0
+; / \
+; bb1 bb2
+; | \ / |
+; | bb4 |
+; \ | /
+; bb5
+;
+; The blend masks for %phi in bb4 should be:
+; bb1 := bb0->bb1 := c0
+; bb2 := bb0->bb2 := !c0
+define void @diamond_phi2(ptr %a, i1 %c1, i1 %c2) {
+; CHECK-LABEL: define void @diamond_phi2(
+; CHECK-SAME: ptr [[A:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C2]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i1> poison, i1 [[C1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT1]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[TMP20:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp sle <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[TMP0]], splat (i1 true)
+; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 2)
+; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 1)
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP0]], <4 x i1> [[BROADCAST_SPLAT2]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[TMP3]], <4 x i64> [[TMP2]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i64, ptr [[A]], i64 [[TMP20]]
+; CHECK-NEXT: call void @llvm.masked.store.v4i64.p0(<4 x i64> [[PREDPHI]], ptr align 4 [[TMP21]], <4 x i1> [[TMP6]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP20]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <4 x i64> [[VEC_IND]], splat (i64 4)
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT: br i1 [[TMP23]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %bb0
+
+bb0:
+ %iv = phi i64 [0, %entry], [%iv.next, %bb5]
+ %c0 = icmp sle i64 %iv, 0
+ br i1 %c0, label %bb1, label %bb2
+
+bb1:
+ %add1 = add i64 %iv, 1
+ br i1 %c1, label %bb4, label %bb5
+
+bb2:
+ %add2 = add i64 %iv, 2
+ br i1 %c2, label %bb4, label %bb5
+
+bb4:
+ %phi = phi i64 [%add1, %bb1], [%add2, %bb2]
+ %gep = getelementptr i64, ptr %a, i64 %iv
+ store i64 %phi, ptr %gep
+ br label %bb5
+
+bb5:
+ %iv.next = add nsw nuw i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, 128
+ br i1 %ec, label %exit, label %bb0
+
+exit:
+ ret void
+}
+
+; bb0
+; / \
+; bb1 bb2
+; / \ |
+; bb3 bb4 |
+; / \ / \ /
+; \ bb5 bb6
+; \ \ / /
+; \ bb7 /
+; \ | /
+; bb8
+;
+; The blend masks for %phi in bb7 should be:
+; bb5 := bb1->bb3 v bb4->bb5 := (c0 && c1) || (c0 && !c1 && c3)
+; bb6 := bb4->bb6 v bb0->bb2 := (c0 && !c1 && !c3) || !c1
+define void @blend_masks(ptr noalias %p, i1 %c0, i1 %c1, i1 %c2, i1 %c3, i1 %c4) {
+; CHECK-LABEL: define void @blend_masks(
+; CHECK-SAME: ptr noalias [[P:%.*]], i1 [[C0:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]], i1 [[C4:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C2]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i1> poison, i1 [[C4]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT1]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i1> poison, i1 [[C3]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT3]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i1> poison, i1 [[C1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT5]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT7]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT8]], splat (i1 true)
+; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT6]], splat (i1 true)
+; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT8]], <4 x i1> [[TMP1]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT4]], splat (i1 true)
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP3]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i1> [[TMP4]], [[TMP0]]
+; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BROADCAST_SPLAT8]], <4 x i1> [[BROADCAST_SPLAT6]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[BROADCAST_SPLAT4]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true)
+; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP6]], <4 x i1> [[TMP8]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> [[TMP7]], [[TMP9]]
+; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP5]], <4 x i1> [[BROADCAST_SPLAT2]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 1)
+; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12:.*]]
+; CHECK: [[PRED_STORE_CONTINUE12]]:
+; CHECK-NEXT: [[TMP26:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12]] ]
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[P]], i32 [[TMP26]]
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[PREDPHI]], ptr align 4 [[TMP27]], <4 x i1> [[TMP12]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[TMP26]], 4
+; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
+; CHECK-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[PRED_STORE_CONTINUE12]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %bb0
+
+bb0:
+ %iv = phi i32 [0, %entry], [%iv.next, %bb8]
+ br i1 %c0, label %bb1, label %bb2
+
+bb1:
+ br i1 %c1, label %bb3, label %bb4
+
+bb2:
+ br label %bb6
+
+bb3:
+ br i1 %c2, label %bb8, label %bb5
+
+bb4:
+ br i1 %c3, label %bb5, label %bb6
+
+bb5:
+ br label %bb7
+
+bb6:
+ br i1 %c4, label %bb7, label %bb8
+
+bb7:
+ %phi = phi i32 [0, %bb5], [1, %bb6]
+ %gep = getelementptr i32, ptr %p, i32 %iv
+ store i32 %phi, ptr %gep
+ br label %bb8
+
+bb8:
+ %iv.next = add i32 %iv, 1
+ %ec = icmp eq i32 %iv.next, 128
+ br i1 %ec, label %exit, label %bb0
+
+exit:
+ ret void
+}
+
+; bb0
+; / |
+; bb1 |
+; | \ |
+; | bb2
+; | /
+; bb3
+;
+; The blend masks for %phi in bb3 should be:
+; bb0 := bb1->bb2 v bb0->bb2 := c0 && !c1 || !c0
+; bb1 := bb1->bb3 := c0 && c1
+define void @blend_masks_triangle_phi(ptr noalias %p, i1 %c0, i1 %c1) {
+; CHECK-LABEL: define void @blend_masks_triangle_phi(
+; CHECK-SAME: ptr noalias [[P:%.*]], i1 [[C0:%.*]], i1 [[C1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C1]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i1> poison, i1 [[C0]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT1]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP0:%.*]] = select <4 x i1> [[BROADCAST_SPLAT2]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> zeroinitializer, <4 x i32> splat (i32 1)
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[P]], i32 [[INDEX]]
+; CHECK-NEXT: store <4 x i32> [[PREDPHI]], ptr [[TMP1]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
+; CHECK-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: br label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %bb0
+
+bb0:
+ %iv = phi i32 [0, %entry], [%iv.next, %bb3]
+ br i1 %c0, label %bb1, label %bb2
+
+bb1:
+ br i1 %c1, label %bb3, label %bb2
+
+bb2:
+ br label %bb3
+
+bb3:
+ %phi = phi i32 [0, %bb1], [1, %bb2]
+ %gep = getelementptr i32, ptr %p, i32 %iv
+ store i32 %phi, ptr %gep
+ %iv.next = add i32 %iv, 1
+ %ec = icmp eq i32 %iv.next, 128
+ br i1 %ec, label %exit, label %bb0
+
+exit:
+ ret void
+}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/avg.ll b/llvm/test/Transforms/PhaseOrdering/X86/avg.ll
new file mode 100644
index 0000000000000..7a4ea404037d1
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/avg.ll
@@ -0,0 +1,1355 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE4
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+
+; PR128424
+
+%"struct.std::array8" = type { [8 x i16] }
+%"struct.std::array16" = type { [16 x i8] }
+
+define { i64, i64 } @avgr_16_u8(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) {
+; SSE2-LABEL: @avgr_16_u8(
+; SSE2-NEXT: entry:
+; SSE2-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i16
+; SSE2-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[A_COERCE0]], i64 0
+; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[A_COERCE1:%.*]], i64 1
+; SSE2-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 16)
+; SSE2-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 24)
+; SSE2-NEXT: [[TMP5:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 32)
+; SSE2-NEXT: [[TMP6:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 40)
+; SSE2-NEXT: [[A_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
+; SSE2-NEXT: [[A_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 56
+; SSE2-NEXT: [[TMP7:%.*]] = trunc i64 [[A_COERCE1]] to i16
+; SSE2-NEXT: [[A_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
+; SSE2-NEXT: [[TMP8:%.*]] = trunc i64 [[B_COERCE0:%.*]] to i16
+; SSE2-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE0]], i64 0
+; SSE2-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[B_COERCE1:%.*]], i64 1
+; SSE2-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 16)
+; SSE2-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 24)
+; SSE2-NEXT: [[TMP13:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 32)
+; SSE2-NEXT: [[TMP14:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 40)
+; SSE2-NEXT: [[B_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
+; SSE2-NEXT: [[TMP15:%.*]] = trunc i64 [[B_COERCE1]] to i16
+; SSE2-NEXT: [[B_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
+; SSE2-NEXT: [[TMP16:%.*]] = and <2 x i64> [[TMP2]], splat (i64 255)
+; SSE2-NEXT: [[TMP17:%.*]] = and <2 x i64> [[TMP10]], splat (i64 255)
+; SSE2-NEXT: [[CONV1_14:%.*]] = and i64 [[A_SROA_16_8_EXTRACT_SHIFT]], 255
+; SSE2-NEXT: [[TMP18:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+; SSE2-NEXT: [[TMP19:%.*]] = insertelement <2 x i16> [[TMP18]], i16 [[TMP7]], i64 1
+; SSE2-NEXT: [[TMP20:%.*]] = lshr <2 x i16> [[TMP19]], splat (i16 8)
+; SSE2-NEXT: [[TMP21:%.*]] = insertelement <2 x i16> poison, i16 [[TMP8]], i64 0
+; SSE2-NEXT: [[TMP22:%.*]] = insertelement <2 x i16> [[TMP21]], i16 [[TMP15]], i64 1
+; SSE2-NEXT: [[TMP23:%.*]] = lshr <2 x i16> [[TMP22]], splat (i16 8)
+; SSE2-NEXT: [[A_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 56
+; SSE2-NEXT: [[CONV1_6:%.*]] = and i64 [[A_SROA_7_0_EXTRACT_SHIFT]], 255
+; SSE2-NEXT: [[B_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 56
+; SSE2-NEXT: [[CONV4_6:%.*]] = and i64 [[B_SROA_7_0_EXTRACT_SHIFT]], 255
+; SSE2-NEXT: [[TMP24:%.*]] = add nuw nsw <2 x i64> [[TMP16]], splat (i64 1)
+; SSE2-NEXT: [[TMP25:%.*]] = add nuw nsw <2 x i64> [[TMP24]], [[TMP17]]
+; SSE2-NEXT: [[TMP26:%.*]] = lshr <2 x i64> [[TMP25]], splat (i64 1)
+; SSE2-NEXT: [[TMP27:%.*]] = add nuw nsw <2 x i16> [[TMP20]], splat (i16 1)
+; SSE2-NEXT: [[TMP28:%.*]] = add nuw nsw <2 x i16> [[TMP27]], [[TMP23]]
+; SSE2-NEXT: [[TMP29:%.*]] = and <2 x i64> [[TMP3]], splat (i64 255)
+; SSE2-NEXT: [[TMP30:%.*]] = and <2 x i64> [[TMP11]], splat (i64 255)
+; SSE2-NEXT: [[TMP31:%.*]] = add nuw nsw <2 x i64> [[TMP29]], splat (i64 1)
+; SSE2-NEXT: [[TMP32:%.*]] = add nuw nsw <2 x i64> [[TMP31]], [[TMP30]]
+; SSE2-NEXT: [[TMP33:%.*]] = and <2 x i64> [[TMP4]], splat (i64 255)
+; SSE2-NEXT: [[TMP34:%.*]] = and <2 x i64> [[TMP12]], splat (i64 255)
+; SSE2-NEXT: [[TMP35:%.*]] = add nuw nsw <2 x i64> [[TMP33]], splat (i64 1)
+; SSE2-NEXT: [[TMP36:%.*]] = add nuw nsw <2 x i64> [[TMP35]], [[TMP34]]
+; SSE2-NEXT: [[TMP37:%.*]] = and <2 x i64> [[TMP5]], splat (i64 255)
+; SSE2-NEXT: [[TMP38:%.*]] = and <2 x i64> [[TMP13]], splat (i64 255)
+; SSE2-NEXT: [[TMP39:%.*]] = add nuw nsw <2 x i64> [[TMP37]], splat (i64 1)
+; SSE2-NEXT: [[TMP40:%.*]] = add nuw nsw <2 x i64> [[TMP39]], [[TMP38]]
+; SSE2-NEXT: [[TMP41:%.*]] = and <2 x i64> [[TMP6]], splat (i64 255)
+; SSE2-NEXT: [[TMP42:%.*]] = and <2 x i64> [[TMP14]], splat (i64 255)
+; SSE2-NEXT: [[TMP43:%.*]] = add nuw nsw <2 x i64> [[TMP41]], splat (i64 1)
+; SSE2-NEXT: [[TMP44:%.*]] = add nuw nsw <2 x i64> [[TMP43]], [[TMP42]]
+; SSE2-NEXT: [[CONV4_14:%.*]] = and i64 [[B_SROA_16_8_EXTRACT_SHIFT]], 255
+; SSE2-NEXT: [[ADD_7:%.*]] = add nuw nsw i64 [[A_SROA_8_0_EXTRACT_SHIFT]], 1
+; SSE2-NEXT: [[ADD_14:%.*]] = add nuw nsw i64 [[CONV1_14]], 1
+; SSE2-NEXT: [[B_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 56
+; SSE2-NEXT: [[ADD5_14:%.*]] = add nuw nsw i64 [[ADD_14]], [[CONV4_14]]
+; SSE2-NEXT: [[ADD5_7:%.*]] = add nuw nsw i64 [[ADD_7]], [[B_SROA_8_0_EXTRACT_SHIFT]]
+; SSE2-NEXT: [[ADD_15:%.*]] = add nuw nsw i64 [[A_SROA_17_8_EXTRACT_SHIFT]], 1
+; SSE2-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[CONV1_6]], 1
+; SSE2-NEXT: [[ADD5_15:%.*]] = add nuw nsw i64 [[ADD_15]], [[B_SROA_17_8_EXTRACT_SHIFT]]
+; SSE2-NEXT: [[ADD5_6:%.*]] = add nuw nsw i64 [[ADD_6]], [[CONV4_6]]
+; SSE2-NEXT: [[TMP45:%.*]] = shl nuw i64 [[ADD5_15]], 55
+; SSE2-NEXT: [[TMP46:%.*]] = shl nuw nsw i64 [[ADD5_6]], 47
+; SSE2-NEXT: [[RETVAL_SROA_17_8_INSERT_EXT:%.*]] = and i64 [[TMP45]], -72057594037927936
+; SSE2-NEXT: [[RETVAL_SROA_7_0_INSERT_SHIFT:%.*]] = and i64 [[TMP46]], 71776119061217280
+; SSE2-NEXT: [[TMP47:%.*]] = shl nuw nsw i64 [[ADD5_14]], 47
+; SSE2-NEXT: [[TMP48:%.*]] = shl nuw i64 [[ADD5_7]], 55
+; SSE2-NEXT: [[RETVAL_SROA_16_8_INSERT_SHIFT:%.*]] = and i64 [[TMP47]], 71776119061217280
+; SSE2-NEXT: [[RETVAL_SROA_8_0_INSERT_EXT:%.*]] = and i64 [[TMP48]], -72057594037927936
+; SSE2-NEXT: [[RETVAL_SROA_16_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_17_8_INSERT_EXT]], [[RETVAL_SROA_16_8_INSERT_SHIFT]]
+; SSE2-NEXT: [[RETVAL_SROA_7_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_8_0_INSERT_EXT]], [[RETVAL_SROA_7_0_INSERT_SHIFT]]
+; SSE2-NEXT: [[TMP49:%.*]] = shl nuw nsw <2 x i64> [[TMP44]], splat (i64 39)
+; SSE2-NEXT: [[TMP50:%.*]] = and <2 x i64> [[TMP49]], splat (i64 280375465082880)
+; SSE2-NEXT: [[TMP51:%.*]] = insertelement <2 x i64> poison, i64 [[RETVAL_SROA_7_0_INSERT_INSERT]], i64 0
+; SSE2-NEXT: [[TMP52:%.*]] = insertelement <2 x i64> [[TMP51]], i64 [[RETVAL_SROA_16_8_INSERT_INSERT]], i64 1
+; SSE2-NEXT: [[TMP53:%.*]] = or disjoint <2 x i64> [[TMP52]], [[TMP50]]
+; SSE2-NEXT: [[TMP54:%.*]] = shl nuw nsw <2 x i64> [[TMP40]], splat (i64 31)
+; SSE2-NEXT: [[TMP55:%.*]] = and <2 x i64> [[TMP54]], splat (i64 1095216660480)
+; SSE2-NEXT: [[TMP56:%.*]] = or disjoint <2 x i64> [[TMP53]], [[TMP55]]
+; SSE2-NEXT: [[TMP57:%.*]] = shl nuw nsw <2 x i64> [[TMP36]], splat (i64 23)
+; SSE2-NEXT: [[TMP58:%.*]] = and <2 x i64> [[TMP57]], splat (i64 4278190080)
+; SSE2-NEXT: [[TMP59:%.*]] = or disjoint <2 x i64> [[TMP56]], [[TMP58]]
+; SSE2-NEXT: [[TMP60:%.*]] = shl nuw nsw <2 x i64> [[TMP32]], splat (i64 15)
+; SSE2-NEXT: [[TMP61:%.*]] = and <2 x i64> [[TMP60]], splat (i64 16711680)
+; SSE2-NEXT: [[TMP62:%.*]] = shl nuw <2 x i16> [[TMP28]], splat (i16 7)
+; SSE2-NEXT: [[TMP63:%.*]] = or disjoint <2 x i64> [[TMP59]], [[TMP61]]
+; SSE2-NEXT: [[TMP64:%.*]] = and <2 x i16> [[TMP62]], splat (i16 -256)
+; SSE2-NEXT: [[TMP65:%.*]] = zext <2 x i16> [[TMP64]] to <2 x i64>
+; SSE2-NEXT: [[TMP66:%.*]] = or <2 x i64> [[TMP63]], [[TMP65]]
+; SSE2-NEXT: [[TMP67:%.*]] = or <2 x i64> [[TMP66]], [[TMP26]]
+; SSE2-NEXT: [[TMP68:%.*]] = extractelement <2 x i64> [[TMP67]], i64 0
+; SSE2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP68]], 0
+; SSE2-NEXT: [[TMP69:%.*]] = extractelement <2 x i64> [[TMP67]], i64 1
+; SSE2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP69]], 1
+; SSE2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
+;
+; SSE4-LABEL: @avgr_16_u8(
+; SSE4-NEXT: entry:
+; SSE4-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i16
+; SSE4-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[A_COERCE0]], i64 0
+; SSE4-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[A_COERCE1:%.*]], i64 1
+; SSE4-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 16)
+; SSE4-NEXT: [[TMP4:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 24)
+; SSE4-NEXT: [[TMP5:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 32)
+; SSE4-NEXT: [[TMP6:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 40)
+; SSE4-NEXT: [[A_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
+; SSE4-NEXT: [[A_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 56
+; SSE4-NEXT: [[TMP7:%.*]] = trunc i64 [[A_COERCE1]] to i16
+; SSE4-NEXT: [[A_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
+; SSE4-NEXT: [[TMP8:%.*]] = trunc i64 [[B_COERCE0:%.*]] to i16
+; SSE4-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE0]], i64 0
+; SSE4-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[B_COERCE1:%.*]], i64 1
+; SSE4-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 16)
+; SSE4-NEXT: [[TMP12:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 24)
+; SSE4-NEXT: [[TMP13:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 32)
+; SSE4-NEXT: [[TMP14:%.*]] = lshr <2 x i64> [[TMP10]], splat (i64 40)
+; SSE4-NEXT: [[B_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
+; SSE4-NEXT: [[TMP15:%.*]] = trunc i64 [[B_COERCE1]] to i16
+; SSE4-NEXT: [[B_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
+; SSE4-NEXT: [[TMP16:%.*]] = and <2 x i64> [[TMP2]], splat (i64 255)
+; SSE4-NEXT: [[TMP17:%.*]] = and <2 x i64> [[TMP10]], splat (i64 255)
+; SSE4-NEXT: [[CONV1_14:%.*]] = and i64 [[A_SROA_16_8_EXTRACT_SHIFT]], 255
+; SSE4-NEXT: [[TMP18:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+; SSE4-NEXT: [[TMP19:%.*]] = insertelement <2 x i16> [[TMP18]], i16 [[TMP7]], i64 1
+; SSE4-NEXT: [[TMP20:%.*]] = lshr <2 x i16> [[TMP19]], splat (i16 8)
+; SSE4-NEXT: [[TMP21:%.*]] = insertelement <2 x i16> poison, i16 [[TMP8]], i64 0
+; SSE4-NEXT: [[TMP22:%.*]] = insertelement <2 x i16> [[TMP21]], i16 [[TMP15]], i64 1
+; SSE4-NEXT: [[TMP23:%.*]] = lshr <2 x i16> [[TMP22]], splat (i16 8)
+; SSE4-NEXT: [[A_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 56
+; SSE4-NEXT: [[CONV1_6:%.*]] = and i64 [[A_SROA_7_0_EXTRACT_SHIFT]], 255
+; SSE4-NEXT: [[B_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 56
+; SSE4-NEXT: [[CONV4_6:%.*]] = and i64 [[B_SROA_7_0_EXTRACT_SHIFT]], 255
+; SSE4-NEXT: [[TMP24:%.*]] = add nuw nsw <2 x i64> [[TMP16]], splat (i64 1)
+; SSE4-NEXT: [[TMP25:%.*]] = add nuw nsw <2 x i64> [[TMP24]], [[TMP17]]
+; SSE4-NEXT: [[TMP26:%.*]] = lshr <2 x i64> [[TMP25]], splat (i64 1)
+; SSE4-NEXT: [[TMP27:%.*]] = add nuw nsw <2 x i16> [[TMP20]], splat (i16 1)
+; SSE4-NEXT: [[TMP28:%.*]] = add nuw nsw <2 x i16> [[TMP27]], [[TMP23]]
+; SSE4-NEXT: [[TMP29:%.*]] = and <2 x i64> [[TMP3]], splat (i64 255)
+; SSE4-NEXT: [[TMP30:%.*]] = and <2 x i64> [[TMP11]], splat (i64 255)
+; SSE4-NEXT: [[TMP31:%.*]] = add nuw nsw <2 x i64> [[TMP29]], splat (i64 1)
+; SSE4-NEXT: [[TMP32:%.*]] = add nuw nsw <2 x i64> [[TMP31]], [[TMP30]]
+; SSE4-NEXT: [[TMP33:%.*]] = and <2 x i64> [[TMP4]], splat (i64 255)
+; SSE4-NEXT: [[TMP34:%.*]] = and <2 x i64> [[TMP12]], splat (i64 255)
+; SSE4-NEXT: [[TMP35:%.*]] = add nuw nsw <2 x i64> [[TMP33]], splat (i64 1)
+; SSE4-NEXT: [[TMP36:%.*]] = add nuw nsw <2 x i64> [[TMP35]], [[TMP34]]
+; SSE4-NEXT: [[TMP37:%.*]] = and <2 x i64> [[TMP5]], splat (i64 255)
+; SSE4-NEXT: [[TMP38:%.*]] = and <2 x i64> [[TMP13]], splat (i64 255)
+; SSE4-NEXT: [[TMP39:%.*]] = add nuw nsw <2 x i64> [[TMP37]], splat (i64 1)
+; SSE4-NEXT: [[TMP40:%.*]] = add nuw nsw <2 x i64> [[TMP39]], [[TMP38]]
+; SSE4-NEXT: [[TMP41:%.*]] = and <2 x i64> [[TMP6]], splat (i64 255)
+; SSE4-NEXT: [[TMP42:%.*]] = and <2 x i64> [[TMP14]], splat (i64 255)
+; SSE4-NEXT: [[TMP43:%.*]] = add nuw nsw <2 x i64> [[TMP41]], splat (i64 1)
+; SSE4-NEXT: [[TMP44:%.*]] = add nuw nsw <2 x i64> [[TMP43]], [[TMP42]]
+; SSE4-NEXT: [[CONV4_14:%.*]] = and i64 [[B_SROA_16_8_EXTRACT_SHIFT]], 255
+; SSE4-NEXT: [[ADD_7:%.*]] = add nuw nsw i64 [[A_SROA_8_0_EXTRACT_SHIFT]], 1
+; SSE4-NEXT: [[ADD_14:%.*]] = add nuw nsw i64 [[CONV1_14]], 1
+; SSE4-NEXT: [[B_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 56
+; SSE4-NEXT: [[ADD5_14:%.*]] = add nuw nsw i64 [[ADD_14]], [[CONV4_14]]
+; SSE4-NEXT: [[ADD5_7:%.*]] = add nuw nsw i64 [[ADD_7]], [[B_SROA_8_0_EXTRACT_SHIFT]]
+; SSE4-NEXT: [[ADD_15:%.*]] = add nuw nsw i64 [[A_SROA_17_8_EXTRACT_SHIFT]], 1
+; SSE4-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[CONV1_6]], 1
+; SSE4-NEXT: [[ADD5_15:%.*]] = add nuw nsw i64 [[ADD_15]], [[B_SROA_17_8_EXTRACT_SHIFT]]
+; SSE4-NEXT: [[ADD5_6:%.*]] = add nuw nsw i64 [[ADD_6]], [[CONV4_6]]
+; SSE4-NEXT: [[TMP45:%.*]] = shl nuw i64 [[ADD5_15]], 55
+; SSE4-NEXT: [[TMP46:%.*]] = shl nuw nsw i64 [[ADD5_6]], 47
+; SSE4-NEXT: [[RETVAL_SROA_17_8_INSERT_EXT:%.*]] = and i64 [[TMP45]], -72057594037927936
+; SSE4-NEXT: [[RETVAL_SROA_7_0_INSERT_SHIFT:%.*]] = and i64 [[TMP46]], 71776119061217280
+; SSE4-NEXT: [[TMP47:%.*]] = shl nuw nsw i64 [[ADD5_14]], 47
+; SSE4-NEXT: [[TMP48:%.*]] = shl nuw i64 [[ADD5_7]], 55
+; SSE4-NEXT: [[RETVAL_SROA_16_8_INSERT_SHIFT:%.*]] = and i64 [[TMP47]], 71776119061217280
+; SSE4-NEXT: [[RETVAL_SROA_8_0_INSERT_EXT:%.*]] = and i64 [[TMP48]], -72057594037927936
+; SSE4-NEXT: [[RETVAL_SROA_16_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_17_8_INSERT_EXT]], [[RETVAL_SROA_16_8_INSERT_SHIFT]]
+; SSE4-NEXT: [[RETVAL_SROA_7_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_8_0_INSERT_EXT]], [[RETVAL_SROA_7_0_INSERT_SHIFT]]
+; SSE4-NEXT: [[TMP49:%.*]] = shl nuw nsw <2 x i64> [[TMP44]], splat (i64 39)
+; SSE4-NEXT: [[TMP50:%.*]] = and <2 x i64> [[TMP49]], splat (i64 280375465082880)
+; SSE4-NEXT: [[TMP51:%.*]] = insertelement <2 x i64> poison, i64 [[RETVAL_SROA_7_0_INSERT_INSERT]], i64 0
+; SSE4-NEXT: [[TMP52:%.*]] = insertelement <2 x i64> [[TMP51]], i64 [[RETVAL_SROA_16_8_INSERT_INSERT]], i64 1
+; SSE4-NEXT: [[TMP53:%.*]] = or disjoint <2 x i64> [[TMP52]], [[TMP50]]
+; SSE4-NEXT: [[TMP54:%.*]] = shl nuw nsw <2 x i64> [[TMP40]], splat (i64 31)
+; SSE4-NEXT: [[TMP55:%.*]] = and <2 x i64> [[TMP54]], splat (i64 1095216660480)
+; SSE4-NEXT: [[TMP56:%.*]] = or disjoint <2 x i64> [[TMP53]], [[TMP55]]
+; SSE4-NEXT: [[TMP57:%.*]] = shl nuw nsw <2 x i64> [[TMP36]], splat (i64 23)
+; SSE4-NEXT: [[TMP58:%.*]] = and <2 x i64> [[TMP57]], splat (i64 4278190080)
+; SSE4-NEXT: [[TMP59:%.*]] = or disjoint <2 x i64> [[TMP56]], [[TMP58]]
+; SSE4-NEXT: [[TMP60:%.*]] = shl nuw nsw <2 x i64> [[TMP32]], splat (i64 15)
+; SSE4-NEXT: [[TMP61:%.*]] = and <2 x i64> [[TMP60]], splat (i64 16711680)
+; SSE4-NEXT: [[TMP62:%.*]] = shl nuw <2 x i16> [[TMP28]], splat (i16 7)
+; SSE4-NEXT: [[TMP63:%.*]] = or disjoint <2 x i64> [[TMP59]], [[TMP61]]
+; SSE4-NEXT: [[TMP64:%.*]] = and <2 x i16> [[TMP62]], splat (i16 -256)
+; SSE4-NEXT: [[TMP65:%.*]] = zext <2 x i16> [[TMP64]] to <2 x i64>
+; SSE4-NEXT: [[TMP66:%.*]] = or <2 x i64> [[TMP63]], [[TMP65]]
+; SSE4-NEXT: [[TMP67:%.*]] = or <2 x i64> [[TMP66]], [[TMP26]]
+; SSE4-NEXT: [[TMP68:%.*]] = extractelement <2 x i64> [[TMP67]], i64 0
+; SSE4-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP68]], 0
+; SSE4-NEXT: [[TMP69:%.*]] = extractelement <2 x i64> [[TMP67]], i64 1
+; SSE4-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP69]], 1
+; SSE4-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
+;
+; AVX-LABEL: @avgr_16_u8(
+; AVX-NEXT: entry:
+; AVX-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i16
+; AVX-NEXT: [[TMP1:%.*]] = lshr i16 [[TMP0]], 8
+; AVX-NEXT: [[TMP2:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i16
+; AVX-NEXT: [[TMP3:%.*]] = lshr i16 [[TMP2]], 8
+; AVX-NEXT: [[TMP4:%.*]] = trunc i64 [[B_COERCE0:%.*]] to i16
+; AVX-NEXT: [[TMP5:%.*]] = lshr i16 [[TMP4]], 8
+; AVX-NEXT: [[TMP6:%.*]] = trunc i64 [[B_COERCE1:%.*]] to i16
+; AVX-NEXT: [[TMP7:%.*]] = lshr i16 [[TMP6]], 8
+; AVX-NEXT: [[CONV1:%.*]] = and i64 [[A_COERCE0]], 255
+; AVX-NEXT: [[CONV4:%.*]] = and i64 [[B_COERCE0]], 255
+; AVX-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[CONV1]], 1
+; AVX-NEXT: [[ADD5:%.*]] = add nuw nsw i64 [[ADD]], [[CONV4]]
+; AVX-NEXT: [[ADD_1:%.*]] = add nuw nsw i16 [[TMP1]], 1
+; AVX-NEXT: [[ADD5_1:%.*]] = add nuw nsw i16 [[ADD_1]], [[TMP5]]
+; AVX-NEXT: [[CONV1_8:%.*]] = and i64 [[A_COERCE1]], 255
+; AVX-NEXT: [[CONV4_8:%.*]] = and i64 [[B_COERCE1]], 255
+; AVX-NEXT: [[ADD_8:%.*]] = add nuw nsw i64 [[CONV1_8]], 1
+; AVX-NEXT: [[ADD5_8:%.*]] = add nuw nsw i64 [[ADD_8]], [[CONV4_8]]
+; AVX-NEXT: [[ADD_9:%.*]] = add nuw nsw i16 [[TMP3]], 1
+; AVX-NEXT: [[ADD5_9:%.*]] = add nuw nsw i16 [[ADD_9]], [[TMP7]]
+; AVX-NEXT: [[TMP8:%.*]] = shl nuw i16 [[ADD5_1]], 7
+; AVX-NEXT: [[TMP9:%.*]] = and i16 [[TMP8]], -256
+; AVX-NEXT: [[TMP10:%.*]] = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 -1, i64 -1>, i64 [[A_COERCE0]], i64 0
+; AVX-NEXT: [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
+; AVX-NEXT: [[TMP12:%.*]] = lshr <8 x i64> [[TMP11]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 0, i64 0>
+; AVX-NEXT: [[TMP13:%.*]] = and <8 x i64> [[TMP12]], <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 0, i64 0>
+; AVX-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT_MASKED:%.*]] = zext i16 [[TMP9]] to i64
+; AVX-NEXT: [[TMP14:%.*]] = insertelement <8 x i64> poison, i64 [[B_COERCE0]], i64 0
+; AVX-NEXT: [[TMP15:%.*]] = insertelement <8 x i64> [[TMP14]], i64 [[ADD5]], i64 6
+; AVX-NEXT: [[TMP16:%.*]] = insertelement <8 x i64> [[TMP15]], i64 [[RETVAL_SROA_2_0_INSERT_SHIFT_MASKED]], i64 7
+; AVX-NEXT: [[TMP17:%.*]] = shufflevector <8 x i64> [[TMP16]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
+; AVX-NEXT: [[TMP18:%.*]] = lshr <8 x i64> [[TMP17]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 1, i64 0>
+; AVX-NEXT: [[TMP19:%.*]] = and <8 x i64> [[TMP18]], <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 -1, i64 -1>
+; AVX-NEXT: [[TMP20:%.*]] = add nuw nsw <8 x i64> [[TMP13]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 0, i64 0>
+; AVX-NEXT: [[TMP21:%.*]] = add nuw nsw <8 x i64> [[TMP19]], [[TMP20]]
+; AVX-NEXT: [[TMP22:%.*]] = shl nuw <8 x i64> [[TMP21]], <i64 55, i64 47, i64 39, i64 31, i64 23, i64 15, i64 0, i64 0>
+; AVX-NEXT: [[TMP23:%.*]] = and <8 x i64> [[TMP22]], <i64 -72057594037927936, i64 71776119061217280, i64 280375465082880, i64 1095216660480, i64 4278190080, i64 16711680, i64 -1, i64 -1>
+; AVX-NEXT: [[TMP24:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP23]])
+; AVX-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0
+; AVX-NEXT: [[TMP25:%.*]] = shl nuw i16 [[ADD5_9]], 7
+; AVX-NEXT: [[TMP26:%.*]] = and i16 [[TMP25]], -256
+; AVX-NEXT: [[TMP27:%.*]] = insertelement <8 x i64> <i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 poison, i64 -1, i64 -1>, i64 [[A_COERCE1]], i64 0
+; AVX-NEXT: [[TMP28:%.*]] = shufflevector <8 x i64> [[TMP27]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
+; AVX-NEXT: [[TMP29:%.*]] = lshr <8 x i64> [[TMP28]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 0, i64 0>
+; AVX-NEXT: [[TMP30:%.*]] = and <8 x i64> [[TMP29]], <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 0, i64 0>
+; AVX-NEXT: [[RETVAL_SROA_11_8_INSERT_SHIFT_MASKED:%.*]] = zext i16 [[TMP26]] to i64
+; AVX-NEXT: [[TMP31:%.*]] = insertelement <8 x i64> poison, i64 [[B_COERCE1]], i64 0
+; AVX-NEXT: [[TMP32:%.*]] = insertelement <8 x i64> [[TMP31]], i64 [[ADD5_8]], i64 6
+; AVX-NEXT: [[TMP33:%.*]] = insertelement <8 x i64> [[TMP32]], i64 [[RETVAL_SROA_11_8_INSERT_SHIFT_MASKED]], i64 7
+; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x i64> [[TMP33]], <8 x i64> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 6, i32 7>
+; AVX-NEXT: [[TMP35:%.*]] = lshr <8 x i64> [[TMP34]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 1, i64 0>
+; AVX-NEXT: [[TMP36:%.*]] = and <8 x i64> [[TMP35]], <i64 -1, i64 255, i64 255, i64 255, i64 255, i64 255, i64 -1, i64 -1>
+; AVX-NEXT: [[TMP37:%.*]] = add nuw nsw <8 x i64> [[TMP30]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 0, i64 0>
+; AVX-NEXT: [[TMP38:%.*]] = add nuw nsw <8 x i64> [[TMP36]], [[TMP37]]
+; AVX-NEXT: [[TMP39:%.*]] = shl nuw <8 x i64> [[TMP38]], <i64 55, i64 47, i64 39, i64 31, i64 23, i64 15, i64 0, i64 0>
+; AVX-NEXT: [[TMP40:%.*]] = and <8 x i64> [[TMP39]], <i64 -72057594037927936, i64 71776119061217280, i64 280375465082880, i64 1095216660480, i64 4278190080, i64 16711680, i64 -1, i64 -1>
+; AVX-NEXT: [[TMP41:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP40]])
+; AVX-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP41]], 1
+; AVX-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
+;
+entry:
+ %retval = alloca %"struct.std::array16", align 1
+ %a = alloca %"struct.std::array16", align 1
+ %b = alloca %"struct.std::array16", align 1
+ store i64 %a.coerce0, ptr %a, align 1
+ %0 = getelementptr inbounds nuw i8, ptr %a, i64 8
+ store i64 %a.coerce1, ptr %0, align 1
+ store i64 %b.coerce0, ptr %b, align 1
+ %1 = getelementptr inbounds nuw i8, ptr %b, i64 8
+ store i64 %b.coerce1, ptr %1, align 1
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp samesign ult i64 %i.0, 16
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond
+ %.fca.0.load = load i64, ptr %retval, align 1
+ %.fca.0.insert = insertvalue { i64, i64 } poison, i64 %.fca.0.load, 0
+ %.fca.1.gep = getelementptr inbounds nuw i8, ptr %retval, i64 8
+ %.fca.1.load = load i64, ptr %.fca.1.gep, align 1
+ %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %.fca.1.load, 1
+ ret { i64, i64 } %.fca.1.insert
+
+for.body: ; preds = %for.cond
+ %arrayidx.i = getelementptr inbounds nuw i8, ptr %a, i64 %i.0
+ %2 = load i8, ptr %arrayidx.i, align 1
+ %conv1 = zext i8 %2 to i16
+ %arrayidx.i12 = getelementptr inbounds nuw i8, ptr %b, i64 %i.0
+ %3 = load i8, ptr %arrayidx.i12, align 1
+ %conv4 = zext i8 %3 to i16
+ %add = add nuw nsw i16 %conv1, %conv4
+ %add5 = add nuw nsw i16 %add, 1
+ %shr = lshr i16 %add5, 1
+ %conv6 = trunc i16 %shr to i8
+ %arrayidx.i13 = getelementptr inbounds nuw i8, ptr %retval, i64 %i.0
+ store i8 %conv6, ptr %arrayidx.i13, align 1
+ %inc = add nuw nsw i64 %i.0, 1
+ br label %for.cond
+}
+
+define { i64, i64 } @avgr_16_u8_alt(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) {
+; SSE2-LABEL: @avgr_16_u8_alt(
+; SSE2-NEXT: entry:
+; SSE2-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i8
+; SSE2-NEXT: [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 8
+; SSE2-NEXT: [[A_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_2_0_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 16
+; SSE2-NEXT: [[A_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_3_0_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 24
+; SSE2-NEXT: [[A_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_4_0_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[A_SROA_5_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32
+; SSE2-NEXT: [[A_SROA_5_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_5_0_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[A_SROA_6_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 40
+; SSE2-NEXT: [[A_SROA_6_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_6_0_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[A_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
+; SSE2-NEXT: [[A_SROA_7_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_7_0_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[A_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 56
+; SSE2-NEXT: [[A_SROA_8_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_8_0_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[A_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i8
+; SSE2-NEXT: [[A_SROA_11_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 8
+; SSE2-NEXT: [[A_SROA_11_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_11_8_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[A_SROA_12_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 16
+; SSE2-NEXT: [[A_SROA_12_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_12_8_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[A_SROA_13_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 24
+; SSE2-NEXT: [[A_SROA_13_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_13_8_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[A_SROA_14_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32
+; SSE2-NEXT: [[A_SROA_14_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_14_8_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[A_SROA_15_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 40
+; SSE2-NEXT: [[A_SROA_15_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_15_8_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[A_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
+; SSE2-NEXT: [[A_SROA_16_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_16_8_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[A_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 56
+; SSE2-NEXT: [[A_SROA_17_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_17_8_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE0:%.*]] to i8
+; SSE2-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 8
+; SSE2-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_2_0_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 16
+; SSE2-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_3_0_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 24
+; SSE2-NEXT: [[B_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_4_0_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[B_SROA_5_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32
+; SSE2-NEXT: [[B_SROA_5_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_5_0_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[B_SROA_6_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 40
+; SSE2-NEXT: [[B_SROA_6_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_6_0_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[B_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
+; SSE2-NEXT: [[B_SROA_7_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_7_0_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[B_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 56
+; SSE2-NEXT: [[B_SROA_8_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_8_0_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[B_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE1:%.*]] to i8
+; SSE2-NEXT: [[B_SROA_11_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 8
+; SSE2-NEXT: [[B_SROA_11_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_11_8_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[B_SROA_12_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 16
+; SSE2-NEXT: [[B_SROA_12_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_12_8_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[B_SROA_13_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 24
+; SSE2-NEXT: [[B_SROA_13_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_13_8_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[B_SROA_14_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32
+; SSE2-NEXT: [[B_SROA_14_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_14_8_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[B_SROA_15_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 40
+; SSE2-NEXT: [[B_SROA_15_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_15_8_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[B_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
+; SSE2-NEXT: [[B_SROA_16_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_16_8_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[B_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 56
+; SSE2-NEXT: [[B_SROA_17_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_17_8_EXTRACT_SHIFT]] to i8
+; SSE2-NEXT: [[SHR:%.*]] = lshr i8 [[A_SROA_0_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5:%.*]] = lshr i8 [[B_SROA_0_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW:%.*]] = add nuw i8 [[SHR5]], [[SHR]]
+; SSE2-NEXT: [[OR21:%.*]] = or i8 [[B_SROA_0_0_EXTRACT_TRUNC]], [[A_SROA_0_0_EXTRACT_TRUNC]]
+; SSE2-NEXT: [[TMP0:%.*]] = and i8 [[OR21]], 1
+; SSE2-NEXT: [[ADD12:%.*]] = add nuw i8 [[NARROW]], [[TMP0]]
+; SSE2-NEXT: [[SHR_1:%.*]] = lshr i8 [[A_SROA_2_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_1:%.*]] = lshr i8 [[B_SROA_2_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW_1:%.*]] = add nuw i8 [[SHR5_1]], [[SHR_1]]
+; SSE2-NEXT: [[OR21_1:%.*]] = or i8 [[B_SROA_2_0_EXTRACT_TRUNC]], [[A_SROA_2_0_EXTRACT_TRUNC]]
+; SSE2-NEXT: [[TMP1:%.*]] = and i8 [[OR21_1]], 1
+; SSE2-NEXT: [[ADD12_1:%.*]] = add nuw i8 [[NARROW_1]], [[TMP1]]
+; SSE2-NEXT: [[SHR_2:%.*]] = lshr i8 [[A_SROA_3_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_2:%.*]] = lshr i8 [[B_SROA_3_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW_2:%.*]] = add nuw i8 [[SHR5_2]], [[SHR_2]]
+; SSE2-NEXT: [[OR21_2:%.*]] = or i8 [[B_SROA_3_0_EXTRACT_TRUNC]], [[A_SROA_3_0_EXTRACT_TRUNC]]
+; SSE2-NEXT: [[TMP2:%.*]] = and i8 [[OR21_2]], 1
+; SSE2-NEXT: [[ADD12_2:%.*]] = add nuw i8 [[NARROW_2]], [[TMP2]]
+; SSE2-NEXT: [[SHR_3:%.*]] = lshr i8 [[A_SROA_4_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_3:%.*]] = lshr i8 [[B_SROA_4_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW_3:%.*]] = add nuw i8 [[SHR5_3]], [[SHR_3]]
+; SSE2-NEXT: [[OR21_3:%.*]] = or i8 [[B_SROA_4_0_EXTRACT_TRUNC]], [[A_SROA_4_0_EXTRACT_TRUNC]]
+; SSE2-NEXT: [[TMP3:%.*]] = and i8 [[OR21_3]], 1
+; SSE2-NEXT: [[ADD12_3:%.*]] = add nuw i8 [[NARROW_3]], [[TMP3]]
+; SSE2-NEXT: [[SHR_4:%.*]] = lshr i8 [[A_SROA_5_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_4:%.*]] = lshr i8 [[B_SROA_5_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW_4:%.*]] = add nuw i8 [[SHR5_4]], [[SHR_4]]
+; SSE2-NEXT: [[OR21_4:%.*]] = or i8 [[B_SROA_5_0_EXTRACT_TRUNC]], [[A_SROA_5_0_EXTRACT_TRUNC]]
+; SSE2-NEXT: [[TMP4:%.*]] = and i8 [[OR21_4]], 1
+; SSE2-NEXT: [[ADD12_4:%.*]] = add nuw i8 [[NARROW_4]], [[TMP4]]
+; SSE2-NEXT: [[SHR_5:%.*]] = lshr i8 [[A_SROA_6_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_5:%.*]] = lshr i8 [[B_SROA_6_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW_5:%.*]] = add nuw i8 [[SHR5_5]], [[SHR_5]]
+; SSE2-NEXT: [[OR21_5:%.*]] = or i8 [[B_SROA_6_0_EXTRACT_TRUNC]], [[A_SROA_6_0_EXTRACT_TRUNC]]
+; SSE2-NEXT: [[TMP5:%.*]] = and i8 [[OR21_5]], 1
+; SSE2-NEXT: [[ADD12_5:%.*]] = add nuw i8 [[NARROW_5]], [[TMP5]]
+; SSE2-NEXT: [[SHR_6:%.*]] = lshr i8 [[A_SROA_7_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_6:%.*]] = lshr i8 [[B_SROA_7_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW_6:%.*]] = add nuw i8 [[SHR5_6]], [[SHR_6]]
+; SSE2-NEXT: [[OR21_6:%.*]] = or i8 [[B_SROA_7_0_EXTRACT_TRUNC]], [[A_SROA_7_0_EXTRACT_TRUNC]]
+; SSE2-NEXT: [[TMP6:%.*]] = and i8 [[OR21_6]], 1
+; SSE2-NEXT: [[ADD12_6:%.*]] = add nuw i8 [[NARROW_6]], [[TMP6]]
+; SSE2-NEXT: [[SHR_7:%.*]] = lshr i8 [[A_SROA_8_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_7:%.*]] = lshr i8 [[B_SROA_8_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW_7:%.*]] = add nuw i8 [[SHR5_7]], [[SHR_7]]
+; SSE2-NEXT: [[OR21_7:%.*]] = or i8 [[B_SROA_8_0_EXTRACT_TRUNC]], [[A_SROA_8_0_EXTRACT_TRUNC]]
+; SSE2-NEXT: [[TMP7:%.*]] = and i8 [[OR21_7]], 1
+; SSE2-NEXT: [[ADD12_7:%.*]] = add nuw i8 [[NARROW_7]], [[TMP7]]
+; SSE2-NEXT: [[SHR_8:%.*]] = lshr i8 [[A_SROA_9_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_8:%.*]] = lshr i8 [[B_SROA_9_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW_8:%.*]] = add nuw i8 [[SHR5_8]], [[SHR_8]]
+; SSE2-NEXT: [[OR21_8:%.*]] = or i8 [[B_SROA_9_8_EXTRACT_TRUNC]], [[A_SROA_9_8_EXTRACT_TRUNC]]
+; SSE2-NEXT: [[TMP8:%.*]] = and i8 [[OR21_8]], 1
+; SSE2-NEXT: [[ADD12_8:%.*]] = add nuw i8 [[NARROW_8]], [[TMP8]]
+; SSE2-NEXT: [[SHR_9:%.*]] = lshr i8 [[A_SROA_11_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_9:%.*]] = lshr i8 [[B_SROA_11_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW_9:%.*]] = add nuw i8 [[SHR5_9]], [[SHR_9]]
+; SSE2-NEXT: [[OR21_9:%.*]] = or i8 [[B_SROA_11_8_EXTRACT_TRUNC]], [[A_SROA_11_8_EXTRACT_TRUNC]]
+; SSE2-NEXT: [[TMP9:%.*]] = and i8 [[OR21_9]], 1
+; SSE2-NEXT: [[ADD12_9:%.*]] = add nuw i8 [[NARROW_9]], [[TMP9]]
+; SSE2-NEXT: [[SHR_10:%.*]] = lshr i8 [[A_SROA_12_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_10:%.*]] = lshr i8 [[B_SROA_12_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW_10:%.*]] = add nuw i8 [[SHR5_10]], [[SHR_10]]
+; SSE2-NEXT: [[OR21_10:%.*]] = or i8 [[B_SROA_12_8_EXTRACT_TRUNC]], [[A_SROA_12_8_EXTRACT_TRUNC]]
+; SSE2-NEXT: [[TMP10:%.*]] = and i8 [[OR21_10]], 1
+; SSE2-NEXT: [[ADD12_10:%.*]] = add nuw i8 [[NARROW_10]], [[TMP10]]
+; SSE2-NEXT: [[SHR_11:%.*]] = lshr i8 [[A_SROA_13_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_11:%.*]] = lshr i8 [[B_SROA_13_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW_11:%.*]] = add nuw i8 [[SHR5_11]], [[SHR_11]]
+; SSE2-NEXT: [[OR21_11:%.*]] = or i8 [[B_SROA_13_8_EXTRACT_TRUNC]], [[A_SROA_13_8_EXTRACT_TRUNC]]
+; SSE2-NEXT: [[TMP11:%.*]] = and i8 [[OR21_11]], 1
+; SSE2-NEXT: [[ADD12_11:%.*]] = add nuw i8 [[NARROW_11]], [[TMP11]]
+; SSE2-NEXT: [[SHR_12:%.*]] = lshr i8 [[A_SROA_14_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_12:%.*]] = lshr i8 [[B_SROA_14_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW_12:%.*]] = add nuw i8 [[SHR5_12]], [[SHR_12]]
+; SSE2-NEXT: [[OR21_12:%.*]] = or i8 [[B_SROA_14_8_EXTRACT_TRUNC]], [[A_SROA_14_8_EXTRACT_TRUNC]]
+; SSE2-NEXT: [[TMP12:%.*]] = and i8 [[OR21_12]], 1
+; SSE2-NEXT: [[ADD12_12:%.*]] = add nuw i8 [[NARROW_12]], [[TMP12]]
+; SSE2-NEXT: [[SHR_13:%.*]] = lshr i8 [[A_SROA_15_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_13:%.*]] = lshr i8 [[B_SROA_15_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW_13:%.*]] = add nuw i8 [[SHR5_13]], [[SHR_13]]
+; SSE2-NEXT: [[OR21_13:%.*]] = or i8 [[B_SROA_15_8_EXTRACT_TRUNC]], [[A_SROA_15_8_EXTRACT_TRUNC]]
+; SSE2-NEXT: [[TMP13:%.*]] = and i8 [[OR21_13]], 1
+; SSE2-NEXT: [[ADD12_13:%.*]] = add nuw i8 [[NARROW_13]], [[TMP13]]
+; SSE2-NEXT: [[SHR_14:%.*]] = lshr i8 [[A_SROA_16_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_14:%.*]] = lshr i8 [[B_SROA_16_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW_14:%.*]] = add nuw i8 [[SHR5_14]], [[SHR_14]]
+; SSE2-NEXT: [[OR21_14:%.*]] = or i8 [[B_SROA_16_8_EXTRACT_TRUNC]], [[A_SROA_16_8_EXTRACT_TRUNC]]
+; SSE2-NEXT: [[TMP14:%.*]] = and i8 [[OR21_14]], 1
+; SSE2-NEXT: [[ADD12_14:%.*]] = add nuw i8 [[NARROW_14]], [[TMP14]]
+; SSE2-NEXT: [[SHR_15:%.*]] = lshr i8 [[A_SROA_17_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_15:%.*]] = lshr i8 [[B_SROA_17_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW_15:%.*]] = add nuw i8 [[SHR5_15]], [[SHR_15]]
+; SSE2-NEXT: [[OR21_15:%.*]] = or i8 [[B_SROA_17_8_EXTRACT_TRUNC]], [[A_SROA_17_8_EXTRACT_TRUNC]]
+; SSE2-NEXT: [[TMP15:%.*]] = and i8 [[OR21_15]], 1
+; SSE2-NEXT: [[ADD12_15:%.*]] = add nuw i8 [[NARROW_15]], [[TMP15]]
+; SSE2-NEXT: [[RETVAL_SROA_8_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_7]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_8_0_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_8_0_INSERT_EXT]], 56
+; SSE2-NEXT: [[RETVAL_SROA_7_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_6]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_7_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_0_INSERT_EXT]], 48
+; SSE2-NEXT: [[RETVAL_SROA_7_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_8_0_INSERT_SHIFT]], [[RETVAL_SROA_7_0_INSERT_SHIFT]]
+; SSE2-NEXT: [[RETVAL_SROA_6_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_5]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_6_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_6_0_INSERT_EXT]], 40
+; SSE2-NEXT: [[RETVAL_SROA_6_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_7_0_INSERT_INSERT]], [[RETVAL_SROA_6_0_INSERT_SHIFT]]
+; SSE2-NEXT: [[RETVAL_SROA_5_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_4]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_5_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_5_0_INSERT_EXT]], 32
+; SSE2-NEXT: [[RETVAL_SROA_5_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_6_0_INSERT_INSERT]], [[RETVAL_SROA_5_0_INSERT_SHIFT]]
+; SSE2-NEXT: [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_3]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_4_0_INSERT_EXT]], 24
+; SSE2-NEXT: [[RETVAL_SROA_4_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_5_0_INSERT_INSERT]], [[RETVAL_SROA_4_0_INSERT_SHIFT]]
+; SSE2-NEXT: [[RETVAL_SROA_3_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_2]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_3_0_INSERT_EXT]], 16
+; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i8 [[ADD12_1]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 8
+; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_MASK:%.*]] = or disjoint i64 [[RETVAL_SROA_4_0_INSERT_INSERT]], [[RETVAL_SROA_3_0_INSERT_SHIFT]]
+; SSE2-NEXT: [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i8 [[ADD12]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_0_0_INSERT_MASK:%.*]] = or i64 [[RETVAL_SROA_2_0_INSERT_MASK]], [[RETVAL_SROA_2_0_INSERT_SHIFT]]
+; SSE2-NEXT: [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_0_0_INSERT_MASK]], [[RETVAL_SROA_0_0_INSERT_EXT]]
+; SSE2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[RETVAL_SROA_0_0_INSERT_INSERT]], 0
+; SSE2-NEXT: [[RETVAL_SROA_17_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_15]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_17_8_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_17_8_INSERT_EXT]], 56
+; SSE2-NEXT: [[RETVAL_SROA_16_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_14]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_16_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_16_8_INSERT_EXT]], 48
+; SSE2-NEXT: [[RETVAL_SROA_16_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_17_8_INSERT_SHIFT]], [[RETVAL_SROA_16_8_INSERT_SHIFT]]
+; SSE2-NEXT: [[RETVAL_SROA_15_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_13]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_15_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_15_8_INSERT_EXT]], 40
+; SSE2-NEXT: [[RETVAL_SROA_15_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_16_8_INSERT_INSERT]], [[RETVAL_SROA_15_8_INSERT_SHIFT]]
+; SSE2-NEXT: [[RETVAL_SROA_14_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_12]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_14_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_14_8_INSERT_EXT]], 32
+; SSE2-NEXT: [[RETVAL_SROA_14_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_15_8_INSERT_INSERT]], [[RETVAL_SROA_14_8_INSERT_SHIFT]]
+; SSE2-NEXT: [[RETVAL_SROA_13_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_11]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_13_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_13_8_INSERT_EXT]], 24
+; SSE2-NEXT: [[RETVAL_SROA_13_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_14_8_INSERT_INSERT]], [[RETVAL_SROA_13_8_INSERT_SHIFT]]
+; SSE2-NEXT: [[RETVAL_SROA_12_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_10]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_12_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_12_8_INSERT_EXT]], 16
+; SSE2-NEXT: [[RETVAL_SROA_11_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_9]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_11_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_11_8_INSERT_EXT]], 8
+; SSE2-NEXT: [[RETVAL_SROA_11_8_INSERT_MASK:%.*]] = or disjoint i64 [[RETVAL_SROA_13_8_INSERT_INSERT]], [[RETVAL_SROA_12_8_INSERT_SHIFT]]
+; SSE2-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = zext i8 [[ADD12_8]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_9_8_INSERT_MASK:%.*]] = or i64 [[RETVAL_SROA_11_8_INSERT_MASK]], [[RETVAL_SROA_11_8_INSERT_SHIFT]]
+; SSE2-NEXT: [[RETVAL_SROA_9_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_9_8_INSERT_MASK]], [[RETVAL_SROA_9_8_INSERT_EXT]]
+; SSE2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_9_8_INSERT_INSERT]], 1
+; SSE2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
+;
+; SSE4-LABEL: @avgr_16_u8_alt(
+; SSE4-NEXT: entry:
+; SSE4-NEXT: [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0:%.*]], 8
+; SSE4-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 16
+; SSE4-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 24
+; SSE4-NEXT: [[A_SROA_5_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32
+; SSE4-NEXT: [[A_SROA_6_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 40
+; SSE4-NEXT: [[A_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
+; SSE4-NEXT: [[A_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 56
+; SSE4-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE0]] to i8
+; SSE4-NEXT: [[A_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_2_0_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[A_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_3_0_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[A_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_4_0_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[A_SROA_5_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_5_0_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[A_SROA_6_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_6_0_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[A_SROA_7_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_7_0_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[A_SROA_8_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_8_0_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0:%.*]], 8
+; SSE4-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 16
+; SSE4-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 24
+; SSE4-NEXT: [[B_SROA_5_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32
+; SSE4-NEXT: [[B_SROA_6_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 40
+; SSE4-NEXT: [[B_SROA_7_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
+; SSE4-NEXT: [[B_SROA_8_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 56
+; SSE4-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE0]] to i8
+; SSE4-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_2_0_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_3_0_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[B_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_4_0_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[B_SROA_5_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_5_0_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[B_SROA_6_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_6_0_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[B_SROA_7_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_7_0_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[B_SROA_8_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_8_0_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[TMP0:%.*]] = insertelement <8 x i8> poison, i8 [[A_SROA_8_0_EXTRACT_TRUNC]], i64 0
+; SSE4-NEXT: [[TMP1:%.*]] = insertelement <8 x i8> [[TMP0]], i8 [[A_SROA_7_0_EXTRACT_TRUNC]], i64 1
+; SSE4-NEXT: [[TMP2:%.*]] = insertelement <8 x i8> [[TMP1]], i8 [[A_SROA_6_0_EXTRACT_TRUNC]], i64 2
+; SSE4-NEXT: [[TMP3:%.*]] = insertelement <8 x i8> [[TMP2]], i8 [[A_SROA_5_0_EXTRACT_TRUNC]], i64 3
+; SSE4-NEXT: [[TMP4:%.*]] = insertelement <8 x i8> [[TMP3]], i8 [[A_SROA_4_0_EXTRACT_TRUNC]], i64 4
+; SSE4-NEXT: [[TMP5:%.*]] = insertelement <8 x i8> [[TMP4]], i8 [[A_SROA_3_0_EXTRACT_TRUNC]], i64 5
+; SSE4-NEXT: [[TMP6:%.*]] = insertelement <8 x i8> [[TMP5]], i8 [[A_SROA_2_0_EXTRACT_TRUNC]], i64 6
+; SSE4-NEXT: [[TMP7:%.*]] = insertelement <8 x i8> [[TMP6]], i8 [[A_SROA_0_0_EXTRACT_TRUNC]], i64 7
+; SSE4-NEXT: [[TMP8:%.*]] = lshr <8 x i8> [[TMP7]], splat (i8 1)
+; SSE4-NEXT: [[TMP9:%.*]] = insertelement <8 x i8> poison, i8 [[B_SROA_8_0_EXTRACT_TRUNC]], i64 0
+; SSE4-NEXT: [[TMP10:%.*]] = insertelement <8 x i8> [[TMP9]], i8 [[B_SROA_7_0_EXTRACT_TRUNC]], i64 1
+; SSE4-NEXT: [[TMP11:%.*]] = insertelement <8 x i8> [[TMP10]], i8 [[B_SROA_6_0_EXTRACT_TRUNC]], i64 2
+; SSE4-NEXT: [[TMP12:%.*]] = insertelement <8 x i8> [[TMP11]], i8 [[B_SROA_5_0_EXTRACT_TRUNC]], i64 3
+; SSE4-NEXT: [[TMP13:%.*]] = insertelement <8 x i8> [[TMP12]], i8 [[B_SROA_4_0_EXTRACT_TRUNC]], i64 4
+; SSE4-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> [[TMP13]], i8 [[B_SROA_3_0_EXTRACT_TRUNC]], i64 5
+; SSE4-NEXT: [[TMP15:%.*]] = insertelement <8 x i8> [[TMP14]], i8 [[B_SROA_2_0_EXTRACT_TRUNC]], i64 6
+; SSE4-NEXT: [[TMP16:%.*]] = insertelement <8 x i8> [[TMP15]], i8 [[B_SROA_0_0_EXTRACT_TRUNC]], i64 7
+; SSE4-NEXT: [[TMP17:%.*]] = lshr <8 x i8> [[TMP16]], splat (i8 1)
+; SSE4-NEXT: [[TMP18:%.*]] = add nuw <8 x i8> [[TMP17]], [[TMP8]]
+; SSE4-NEXT: [[TMP19:%.*]] = or <8 x i8> [[TMP16]], [[TMP7]]
+; SSE4-NEXT: [[TMP20:%.*]] = and <8 x i8> [[TMP19]], splat (i8 1)
+; SSE4-NEXT: [[TMP21:%.*]] = add nuw <8 x i8> [[TMP18]], [[TMP20]]
+; SSE4-NEXT: [[TMP22:%.*]] = zext <8 x i8> [[TMP21]] to <8 x i64>
+; SSE4-NEXT: [[TMP23:%.*]] = shl nuw <8 x i64> [[TMP22]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
+; SSE4-NEXT: [[TMP24:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP23]])
+; SSE4-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0
+; SSE4-NEXT: [[A_SROA_11_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1:%.*]], 8
+; SSE4-NEXT: [[A_SROA_12_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 16
+; SSE4-NEXT: [[A_SROA_13_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 24
+; SSE4-NEXT: [[A_SROA_14_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32
+; SSE4-NEXT: [[A_SROA_15_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 40
+; SSE4-NEXT: [[A_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
+; SSE4-NEXT: [[A_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 56
+; SSE4-NEXT: [[A_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE1]] to i8
+; SSE4-NEXT: [[A_SROA_11_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_11_8_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[A_SROA_12_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_12_8_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[A_SROA_13_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_13_8_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[A_SROA_14_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_14_8_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[A_SROA_15_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_15_8_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[A_SROA_16_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_16_8_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[A_SROA_17_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_17_8_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[B_SROA_11_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1:%.*]], 8
+; SSE4-NEXT: [[B_SROA_12_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 16
+; SSE4-NEXT: [[B_SROA_13_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 24
+; SSE4-NEXT: [[B_SROA_14_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32
+; SSE4-NEXT: [[B_SROA_15_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 40
+; SSE4-NEXT: [[B_SROA_16_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
+; SSE4-NEXT: [[B_SROA_17_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 56
+; SSE4-NEXT: [[B_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE1]] to i8
+; SSE4-NEXT: [[B_SROA_11_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_11_8_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[B_SROA_12_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_12_8_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[B_SROA_13_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_13_8_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[B_SROA_14_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_14_8_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[B_SROA_15_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_15_8_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[B_SROA_16_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_16_8_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[B_SROA_17_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_17_8_EXTRACT_SHIFT]] to i8
+; SSE4-NEXT: [[TMP25:%.*]] = insertelement <8 x i8> poison, i8 [[A_SROA_17_8_EXTRACT_TRUNC]], i64 0
+; SSE4-NEXT: [[TMP26:%.*]] = insertelement <8 x i8> [[TMP25]], i8 [[A_SROA_16_8_EXTRACT_TRUNC]], i64 1
+; SSE4-NEXT: [[TMP27:%.*]] = insertelement <8 x i8> [[TMP26]], i8 [[A_SROA_15_8_EXTRACT_TRUNC]], i64 2
+; SSE4-NEXT: [[TMP28:%.*]] = insertelement <8 x i8> [[TMP27]], i8 [[A_SROA_14_8_EXTRACT_TRUNC]], i64 3
+; SSE4-NEXT: [[TMP29:%.*]] = insertelement <8 x i8> [[TMP28]], i8 [[A_SROA_13_8_EXTRACT_TRUNC]], i64 4
+; SSE4-NEXT: [[TMP30:%.*]] = insertelement <8 x i8> [[TMP29]], i8 [[A_SROA_12_8_EXTRACT_TRUNC]], i64 5
+; SSE4-NEXT: [[TMP31:%.*]] = insertelement <8 x i8> [[TMP30]], i8 [[A_SROA_11_8_EXTRACT_TRUNC]], i64 6
+; SSE4-NEXT: [[TMP32:%.*]] = insertelement <8 x i8> [[TMP31]], i8 [[A_SROA_9_8_EXTRACT_TRUNC]], i64 7
+; SSE4-NEXT: [[TMP33:%.*]] = lshr <8 x i8> [[TMP32]], splat (i8 1)
+; SSE4-NEXT: [[TMP34:%.*]] = insertelement <8 x i8> poison, i8 [[B_SROA_17_8_EXTRACT_TRUNC]], i64 0
+; SSE4-NEXT: [[TMP35:%.*]] = insertelement <8 x i8> [[TMP34]], i8 [[B_SROA_16_8_EXTRACT_TRUNC]], i64 1
+; SSE4-NEXT: [[TMP36:%.*]] = insertelement <8 x i8> [[TMP35]], i8 [[B_SROA_15_8_EXTRACT_TRUNC]], i64 2
+; SSE4-NEXT: [[TMP37:%.*]] = insertelement <8 x i8> [[TMP36]], i8 [[B_SROA_14_8_EXTRACT_TRUNC]], i64 3
+; SSE4-NEXT: [[TMP38:%.*]] = insertelement <8 x i8> [[TMP37]], i8 [[B_SROA_13_8_EXTRACT_TRUNC]], i64 4
+; SSE4-NEXT: [[TMP39:%.*]] = insertelement <8 x i8> [[TMP38]], i8 [[B_SROA_12_8_EXTRACT_TRUNC]], i64 5
+; SSE4-NEXT: [[TMP40:%.*]] = insertelement <8 x i8> [[TMP39]], i8 [[B_SROA_11_8_EXTRACT_TRUNC]], i64 6
+; SSE4-NEXT: [[TMP41:%.*]] = insertelement <8 x i8> [[TMP40]], i8 [[B_SROA_9_8_EXTRACT_TRUNC]], i64 7
+; SSE4-NEXT: [[TMP42:%.*]] = lshr <8 x i8> [[TMP41]], splat (i8 1)
+; SSE4-NEXT: [[TMP43:%.*]] = add nuw <8 x i8> [[TMP42]], [[TMP33]]
+; SSE4-NEXT: [[TMP44:%.*]] = or <8 x i8> [[TMP41]], [[TMP32]]
+; SSE4-NEXT: [[TMP45:%.*]] = and <8 x i8> [[TMP44]], splat (i8 1)
+; SSE4-NEXT: [[TMP46:%.*]] = add nuw <8 x i8> [[TMP43]], [[TMP45]]
+; SSE4-NEXT: [[TMP47:%.*]] = zext <8 x i8> [[TMP46]] to <8 x i64>
+; SSE4-NEXT: [[TMP48:%.*]] = shl nuw <8 x i64> [[TMP47]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
+; SSE4-NEXT: [[TMP49:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP48]])
+; SSE4-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP49]], 1
+; SSE4-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
+;
+; AVX-LABEL: @avgr_16_u8_alt(
+; AVX-NEXT: entry:
+; AVX-NEXT: [[TMP0:%.*]] = insertelement <8 x i64> poison, i64 [[A_COERCE0:%.*]], i64 0
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TMP0]], <8 x i64> poison, <8 x i32> zeroinitializer
+; AVX-NEXT: [[TMP2:%.*]] = lshr <8 x i64> [[TMP1]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
+; AVX-NEXT: [[TMP3:%.*]] = trunc <8 x i64> [[TMP2]] to <8 x i8>
+; AVX-NEXT: [[TMP4:%.*]] = insertelement <8 x i64> poison, i64 [[B_COERCE0:%.*]], i64 0
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> poison, <8 x i32> zeroinitializer
+; AVX-NEXT: [[TMP6:%.*]] = lshr <8 x i64> [[TMP5]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
+; AVX-NEXT: [[TMP7:%.*]] = trunc <8 x i64> [[TMP6]] to <8 x i8>
+; AVX-NEXT: [[TMP8:%.*]] = lshr <8 x i8> [[TMP3]], splat (i8 1)
+; AVX-NEXT: [[TMP9:%.*]] = lshr <8 x i8> [[TMP7]], splat (i8 1)
+; AVX-NEXT: [[TMP10:%.*]] = add nuw <8 x i8> [[TMP9]], [[TMP8]]
+; AVX-NEXT: [[TMP11:%.*]] = or <8 x i8> [[TMP7]], [[TMP3]]
+; AVX-NEXT: [[TMP12:%.*]] = and <8 x i8> [[TMP11]], splat (i8 1)
+; AVX-NEXT: [[TMP13:%.*]] = add nuw <8 x i8> [[TMP10]], [[TMP12]]
+; AVX-NEXT: [[TMP14:%.*]] = zext <8 x i8> [[TMP13]] to <8 x i64>
+; AVX-NEXT: [[TMP15:%.*]] = shl nuw <8 x i64> [[TMP14]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
+; AVX-NEXT: [[TMP16:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP15]])
+; AVX-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP16]], 0
+; AVX-NEXT: [[TMP17:%.*]] = insertelement <8 x i64> poison, i64 [[A_COERCE1:%.*]], i64 0
+; AVX-NEXT: [[TMP18:%.*]] = shufflevector <8 x i64> [[TMP17]], <8 x i64> poison, <8 x i32> zeroinitializer
+; AVX-NEXT: [[TMP19:%.*]] = lshr <8 x i64> [[TMP18]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
+; AVX-NEXT: [[TMP20:%.*]] = trunc <8 x i64> [[TMP19]] to <8 x i8>
+; AVX-NEXT: [[TMP21:%.*]] = insertelement <8 x i64> poison, i64 [[B_COERCE1:%.*]], i64 0
+; AVX-NEXT: [[TMP22:%.*]] = shufflevector <8 x i64> [[TMP21]], <8 x i64> poison, <8 x i32> zeroinitializer
+; AVX-NEXT: [[TMP23:%.*]] = lshr <8 x i64> [[TMP22]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
+; AVX-NEXT: [[TMP24:%.*]] = trunc <8 x i64> [[TMP23]] to <8 x i8>
+; AVX-NEXT: [[TMP25:%.*]] = lshr <8 x i8> [[TMP20]], splat (i8 1)
+; AVX-NEXT: [[TMP26:%.*]] = lshr <8 x i8> [[TMP24]], splat (i8 1)
+; AVX-NEXT: [[TMP27:%.*]] = add nuw <8 x i8> [[TMP26]], [[TMP25]]
+; AVX-NEXT: [[TMP28:%.*]] = or <8 x i8> [[TMP24]], [[TMP20]]
+; AVX-NEXT: [[TMP29:%.*]] = and <8 x i8> [[TMP28]], splat (i8 1)
+; AVX-NEXT: [[TMP30:%.*]] = add nuw <8 x i8> [[TMP27]], [[TMP29]]
+; AVX-NEXT: [[TMP31:%.*]] = zext <8 x i8> [[TMP30]] to <8 x i64>
+; AVX-NEXT: [[TMP32:%.*]] = shl nuw <8 x i64> [[TMP31]], <i64 56, i64 48, i64 40, i64 32, i64 24, i64 16, i64 8, i64 0>
+; AVX-NEXT: [[TMP33:%.*]] = tail call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP32]])
+; AVX-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP33]], 1
+; AVX-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
+;
+entry:
+ %retval = alloca %"struct.std::array16", align 1
+ %a = alloca %"struct.std::array16", align 1
+ %b = alloca %"struct.std::array16", align 1
+ store i64 %a.coerce0, ptr %a, align 1
+ %0 = getelementptr inbounds nuw i8, ptr %a, i64 8
+ store i64 %a.coerce1, ptr %0, align 1
+ store i64 %b.coerce0, ptr %b, align 1
+ %1 = getelementptr inbounds nuw i8, ptr %b, i64 8
+ store i64 %b.coerce1, ptr %1, align 1
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp samesign ult i64 %i.0, 16
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond
+ %.fca.0.load = load i64, ptr %retval, align 1
+ %.fca.0.insert = insertvalue { i64, i64 } poison, i64 %.fca.0.load, 0
+ %.fca.1.gep = getelementptr inbounds nuw i8, ptr %retval, i64 8
+ %.fca.1.load = load i64, ptr %.fca.1.gep, align 1
+ %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %.fca.1.load, 1
+ ret { i64, i64 } %.fca.1.insert
+
+for.body: ; preds = %for.cond
+ %arrayidx.i = getelementptr inbounds nuw i8, ptr %a, i64 %i.0
+ %2 = load i8, ptr %arrayidx.i, align 1
+ %arrayidx.i22 = getelementptr inbounds nuw i8, ptr %b, i64 %i.0
+ %3 = load i8, ptr %arrayidx.i22, align 1
+ %shr = lshr i8 %2, 1
+ %shr5 = lshr i8 %3, 1
+ %narrow = add nuw i8 %shr, %shr5
+ %or21 = or i8 %2, %3
+ %4 = and i8 %or21, 1
+ %add12 = add i8 %narrow, %4
+ %arrayidx.i23 = getelementptr inbounds nuw i8, ptr %retval, i64 %i.0
+ store i8 %add12, ptr %arrayidx.i23, align 1
+ %inc = add nuw nsw i64 %i.0, 1
+ br label %for.cond
+}
+
+define { i64, i64 } @avgr_8_u16(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) {
+; SSE2-LABEL: @avgr_8_u16(
+; SSE2-NEXT: entry:
+; SSE2-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i32
+; SSE2-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 16
+; SSE2-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32
+; SSE2-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
+; SSE2-NEXT: [[TMP2:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i32
+; SSE2-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP2]], 16
+; SSE2-NEXT: [[A_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32
+; SSE2-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
+; SSE2-NEXT: [[TMP4:%.*]] = trunc i64 [[B_COERCE0:%.*]] to i32
+; SSE2-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP4]], 16
+; SSE2-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32
+; SSE2-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
+; SSE2-NEXT: [[TMP6:%.*]] = trunc i64 [[B_COERCE1:%.*]] to i32
+; SSE2-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 16
+; SSE2-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32
+; SSE2-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
+; SSE2-NEXT: [[CONV:%.*]] = and i64 [[A_COERCE0]], 65535
+; SSE2-NEXT: [[CONV2:%.*]] = and i64 [[B_COERCE0]], 65535
+; SSE2-NEXT: [[ADD:%.*]] = add nuw nsw i64 [[CONV]], 1
+; SSE2-NEXT: [[ADD3:%.*]] = add nuw nsw i64 [[ADD]], [[CONV2]]
+; SSE2-NEXT: [[SHR:%.*]] = lshr i64 [[ADD3]], 1
+; SSE2-NEXT: [[ADD_1:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; SSE2-NEXT: [[ADD3_1:%.*]] = add nuw nsw i32 [[ADD_1]], [[TMP5]]
+; SSE2-NEXT: [[CONV_2:%.*]] = and i64 [[A_SROA_3_0_EXTRACT_SHIFT]], 65535
+; SSE2-NEXT: [[CONV2_2:%.*]] = and i64 [[B_SROA_3_0_EXTRACT_SHIFT]], 65535
+; SSE2-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[CONV_2]], 1
+; SSE2-NEXT: [[ADD3_2:%.*]] = add nuw nsw i64 [[ADD_2]], [[CONV2_2]]
+; SSE2-NEXT: [[ADD_3:%.*]] = add nuw nsw i64 [[A_SROA_4_0_EXTRACT_SHIFT]], 1
+; SSE2-NEXT: [[ADD3_3:%.*]] = add nuw nsw i64 [[ADD_3]], [[B_SROA_4_0_EXTRACT_SHIFT]]
+; SSE2-NEXT: [[CONV_4:%.*]] = and i64 [[A_COERCE1]], 65535
+; SSE2-NEXT: [[CONV2_4:%.*]] = and i64 [[B_COERCE1]], 65535
+; SSE2-NEXT: [[ADD_4:%.*]] = add nuw nsw i64 [[CONV_4]], 1
+; SSE2-NEXT: [[ADD3_4:%.*]] = add nuw nsw i64 [[ADD_4]], [[CONV2_4]]
+; SSE2-NEXT: [[SHR_4:%.*]] = lshr i64 [[ADD3_4]], 1
+; SSE2-NEXT: [[ADD_5:%.*]] = add nuw nsw i32 [[TMP3]], 1
+; SSE2-NEXT: [[ADD3_5:%.*]] = add nuw nsw i32 [[ADD_5]], [[TMP7]]
+; SSE2-NEXT: [[CONV_6:%.*]] = and i64 [[A_SROA_8_8_EXTRACT_SHIFT]], 65535
+; SSE2-NEXT: [[CONV2_6:%.*]] = and i64 [[B_SROA_8_8_EXTRACT_SHIFT]], 65535
+; SSE2-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[CONV_6]], 1
+; SSE2-NEXT: [[ADD3_6:%.*]] = add nuw nsw i64 [[ADD_6]], [[CONV2_6]]
+; SSE2-NEXT: [[ADD_7:%.*]] = add nuw nsw i64 [[A_SROA_9_8_EXTRACT_SHIFT]], 1
+; SSE2-NEXT: [[ADD3_7:%.*]] = add nuw nsw i64 [[ADD_7]], [[B_SROA_9_8_EXTRACT_SHIFT]]
+; SSE2-NEXT: [[TMP8:%.*]] = shl nuw i64 [[ADD3_3]], 47
+; SSE2-NEXT: [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = and i64 [[TMP8]], -281474976710656
+; SSE2-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[ADD3_2]], 31
+; SSE2-NEXT: [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = and i64 [[TMP9]], 281470681743360
+; SSE2-NEXT: [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_4_0_INSERT_EXT]], [[RETVAL_SROA_3_0_INSERT_SHIFT]]
+; SSE2-NEXT: [[TMP10:%.*]] = shl nuw i32 [[ADD3_1]], 15
+; SSE2-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], -65536
+; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = zext i32 [[TMP11]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[RETVAL_SROA_2_0_INSERT_SHIFT]]
+; SSE2-NEXT: [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_2_0_INSERT_INSERT]], [[SHR]]
+; SSE2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[RETVAL_SROA_0_0_INSERT_INSERT]], 0
+; SSE2-NEXT: [[TMP12:%.*]] = shl nuw i64 [[ADD3_7]], 47
+; SSE2-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = and i64 [[TMP12]], -281474976710656
+; SSE2-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[ADD3_6]], 31
+; SSE2-NEXT: [[RETVAL_SROA_8_8_INSERT_SHIFT:%.*]] = and i64 [[TMP13]], 281470681743360
+; SSE2-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_9_8_INSERT_EXT]], [[RETVAL_SROA_8_8_INSERT_SHIFT]]
+; SSE2-NEXT: [[TMP14:%.*]] = shl nuw i32 [[ADD3_5]], 15
+; SSE2-NEXT: [[TMP15:%.*]] = and i32 [[TMP14]], -65536
+; SSE2-NEXT: [[RETVAL_SROA_7_8_INSERT_SHIFT:%.*]] = zext i32 [[TMP15]] to i64
+; SSE2-NEXT: [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[RETVAL_SROA_7_8_INSERT_SHIFT]]
+; SSE2-NEXT: [[RETVAL_SROA_5_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[SHR_4]]
+; SSE2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], 1
+; SSE2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
+;
+; SSE4-LABEL: @avgr_8_u16(
+; SSE4-NEXT: entry:
+; SSE4-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i32
+; SSE4-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32
+; SSE4-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
+; SSE4-NEXT: [[TMP1:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i32
+; SSE4-NEXT: [[A_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32
+; SSE4-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE0:%.*]], i64 0
+; SSE4-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[B_COERCE1:%.*]], i64 1
+; SSE4-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i32>
+; SSE4-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32
+; SSE4-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32
+; SSE4-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[A_COERCE0]], i64 0
+; SSE4-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[A_COERCE1]], i64 1
+; SSE4-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], splat (i64 65535)
+; SSE4-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP3]], splat (i64 65535)
+; SSE4-NEXT: [[CONV_6:%.*]] = and i64 [[A_SROA_8_8_EXTRACT_SHIFT]], 65535
+; SSE4-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
+; SSE4-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP1]], i64 1
+; SSE4-NEXT: [[TMP11:%.*]] = lshr <2 x i32> [[TMP10]], splat (i32 16)
+; SSE4-NEXT: [[TMP12:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 16)
+; SSE4-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
+; SSE4-NEXT: [[CONV_2:%.*]] = and i64 [[A_SROA_3_0_EXTRACT_SHIFT]], 65535
+; SSE4-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
+; SSE4-NEXT: [[CONV2_2:%.*]] = and i64 [[B_SROA_3_0_EXTRACT_SHIFT]], 65535
+; SSE4-NEXT: [[TMP13:%.*]] = add nuw nsw <2 x i64> [[TMP7]], splat (i64 1)
+; SSE4-NEXT: [[TMP14:%.*]] = add nuw nsw <2 x i64> [[TMP13]], [[TMP8]]
+; SSE4-NEXT: [[TMP15:%.*]] = lshr <2 x i64> [[TMP14]], splat (i64 1)
+; SSE4-NEXT: [[TMP16:%.*]] = add nuw nsw <2 x i32> [[TMP11]], splat (i32 1)
+; SSE4-NEXT: [[TMP17:%.*]] = add nuw nsw <2 x i32> [[TMP16]], [[TMP12]]
+; SSE4-NEXT: [[CONV2_6:%.*]] = and i64 [[B_SROA_8_8_EXTRACT_SHIFT]], 65535
+; SSE4-NEXT: [[ADD_3:%.*]] = add nuw nsw i64 [[A_SROA_4_0_EXTRACT_SHIFT]], 1
+; SSE4-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[CONV_6]], 1
+; SSE4-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
+; SSE4-NEXT: [[ADD3_6:%.*]] = add nuw nsw i64 [[ADD_6]], [[CONV2_6]]
+; SSE4-NEXT: [[ADD3_3:%.*]] = add nuw nsw i64 [[ADD_3]], [[B_SROA_4_0_EXTRACT_SHIFT]]
+; SSE4-NEXT: [[ADD_7:%.*]] = add nuw nsw i64 [[A_SROA_9_8_EXTRACT_SHIFT]], 1
+; SSE4-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[CONV_2]], 1
+; SSE4-NEXT: [[ADD3_7:%.*]] = add nuw nsw i64 [[ADD_7]], [[B_SROA_9_8_EXTRACT_SHIFT]]
+; SSE4-NEXT: [[ADD3_2:%.*]] = add nuw nsw i64 [[ADD_2]], [[CONV2_2]]
+; SSE4-NEXT: [[TMP18:%.*]] = shl nuw i64 [[ADD3_7]], 47
+; SSE4-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[ADD3_2]], 31
+; SSE4-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = and i64 [[TMP18]], -281474976710656
+; SSE4-NEXT: [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = and i64 [[TMP19]], 281470681743360
+; SSE4-NEXT: [[TMP20:%.*]] = shl nuw nsw i64 [[ADD3_6]], 31
+; SSE4-NEXT: [[TMP21:%.*]] = shl nuw i64 [[ADD3_3]], 47
+; SSE4-NEXT: [[RETVAL_SROA_8_8_INSERT_SHIFT:%.*]] = and i64 [[TMP20]], 281470681743360
+; SSE4-NEXT: [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = and i64 [[TMP21]], -281474976710656
+; SSE4-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_9_8_INSERT_EXT]], [[RETVAL_SROA_8_8_INSERT_SHIFT]]
+; SSE4-NEXT: [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_4_0_INSERT_EXT]], [[RETVAL_SROA_3_0_INSERT_SHIFT]]
+; SSE4-NEXT: [[TMP22:%.*]] = shl nuw <2 x i32> [[TMP17]], splat (i32 15)
+; SSE4-NEXT: [[TMP23:%.*]] = and <2 x i32> [[TMP22]], splat (i32 -65536)
+; SSE4-NEXT: [[TMP24:%.*]] = zext <2 x i32> [[TMP23]] to <2 x i64>
+; SSE4-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> poison, i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], i64 0
+; SSE4-NEXT: [[TMP26:%.*]] = insertelement <2 x i64> [[TMP25]], i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], i64 1
+; SSE4-NEXT: [[TMP27:%.*]] = or disjoint <2 x i64> [[TMP26]], [[TMP24]]
+; SSE4-NEXT: [[TMP28:%.*]] = or disjoint <2 x i64> [[TMP27]], [[TMP15]]
+; SSE4-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP28]], i64 0
+; SSE4-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP29]], 0
+; SSE4-NEXT: [[TMP30:%.*]] = extractelement <2 x i64> [[TMP28]], i64 1
+; SSE4-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP30]], 1
+; SSE4-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
+;
+; AVX2-LABEL: @avgr_8_u16(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i32
+; AVX2-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32
+; AVX2-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
+; AVX2-NEXT: [[TMP1:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i32
+; AVX2-NEXT: [[A_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32
+; AVX2-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[B_COERCE0:%.*]], i64 0
+; AVX2-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[B_COERCE1:%.*]], i64 1
+; AVX2-NEXT: [[TMP4:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i32>
+; AVX2-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32
+; AVX2-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32
+; AVX2-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[A_COERCE0]], i64 0
+; AVX2-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[A_COERCE1]], i64 1
+; AVX2-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], splat (i64 65535)
+; AVX2-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP3]], splat (i64 65535)
+; AVX2-NEXT: [[CONV_6:%.*]] = and i64 [[A_SROA_8_8_EXTRACT_SHIFT]], 65535
+; AVX2-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
+; AVX2-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP1]], i64 1
+; AVX2-NEXT: [[TMP11:%.*]] = lshr <2 x i32> [[TMP10]], splat (i32 16)
+; AVX2-NEXT: [[TMP12:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 16)
+; AVX2-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
+; AVX2-NEXT: [[CONV_2:%.*]] = and i64 [[A_SROA_3_0_EXTRACT_SHIFT]], 65535
+; AVX2-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
+; AVX2-NEXT: [[CONV2_2:%.*]] = and i64 [[B_SROA_3_0_EXTRACT_SHIFT]], 65535
+; AVX2-NEXT: [[TMP13:%.*]] = add nuw nsw <2 x i64> [[TMP7]], splat (i64 1)
+; AVX2-NEXT: [[TMP14:%.*]] = add nuw nsw <2 x i64> [[TMP13]], [[TMP8]]
+; AVX2-NEXT: [[TMP15:%.*]] = lshr <2 x i64> [[TMP14]], splat (i64 1)
+; AVX2-NEXT: [[TMP16:%.*]] = add nuw nsw <2 x i32> [[TMP11]], splat (i32 1)
+; AVX2-NEXT: [[TMP17:%.*]] = add nuw nsw <2 x i32> [[TMP16]], [[TMP12]]
+; AVX2-NEXT: [[CONV2_6:%.*]] = and i64 [[B_SROA_8_8_EXTRACT_SHIFT]], 65535
+; AVX2-NEXT: [[ADD_3:%.*]] = add nuw nsw i64 [[A_SROA_4_0_EXTRACT_SHIFT]], 1
+; AVX2-NEXT: [[ADD_6:%.*]] = add nuw nsw i64 [[CONV_6]], 1
+; AVX2-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
+; AVX2-NEXT: [[ADD3_6:%.*]] = add nuw nsw i64 [[ADD_6]], [[CONV2_6]]
+; AVX2-NEXT: [[ADD3_3:%.*]] = add nuw nsw i64 [[ADD_3]], [[B_SROA_4_0_EXTRACT_SHIFT]]
+; AVX2-NEXT: [[ADD_7:%.*]] = add nuw nsw i64 [[A_SROA_9_8_EXTRACT_SHIFT]], 1
+; AVX2-NEXT: [[ADD_2:%.*]] = add nuw nsw i64 [[CONV_2]], 1
+; AVX2-NEXT: [[ADD3_7:%.*]] = add nuw nsw i64 [[ADD_7]], [[B_SROA_9_8_EXTRACT_SHIFT]]
+; AVX2-NEXT: [[ADD3_2:%.*]] = add nuw nsw i64 [[ADD_2]], [[CONV2_2]]
+; AVX2-NEXT: [[TMP18:%.*]] = shl nuw i64 [[ADD3_7]], 47
+; AVX2-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[ADD3_2]], 31
+; AVX2-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = and i64 [[TMP18]], -281474976710656
+; AVX2-NEXT: [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = and i64 [[TMP19]], 281470681743360
+; AVX2-NEXT: [[TMP20:%.*]] = shl nuw nsw i64 [[ADD3_6]], 31
+; AVX2-NEXT: [[TMP21:%.*]] = shl nuw i64 [[ADD3_3]], 47
+; AVX2-NEXT: [[RETVAL_SROA_8_8_INSERT_SHIFT:%.*]] = and i64 [[TMP20]], 281470681743360
+; AVX2-NEXT: [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = and i64 [[TMP21]], -281474976710656
+; AVX2-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_9_8_INSERT_EXT]], [[RETVAL_SROA_8_8_INSERT_SHIFT]]
+; AVX2-NEXT: [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or disjoint i64 [[RETVAL_SROA_4_0_INSERT_EXT]], [[RETVAL_SROA_3_0_INSERT_SHIFT]]
+; AVX2-NEXT: [[TMP22:%.*]] = shl nuw <2 x i32> [[TMP17]], splat (i32 15)
+; AVX2-NEXT: [[TMP23:%.*]] = and <2 x i32> [[TMP22]], splat (i32 -65536)
+; AVX2-NEXT: [[TMP24:%.*]] = zext <2 x i32> [[TMP23]] to <2 x i64>
+; AVX2-NEXT: [[TMP25:%.*]] = insertelement <2 x i64> poison, i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], i64 0
+; AVX2-NEXT: [[TMP26:%.*]] = insertelement <2 x i64> [[TMP25]], i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], i64 1
+; AVX2-NEXT: [[TMP27:%.*]] = or disjoint <2 x i64> [[TMP26]], [[TMP24]]
+; AVX2-NEXT: [[TMP28:%.*]] = or disjoint <2 x i64> [[TMP27]], [[TMP15]]
+; AVX2-NEXT: [[TMP29:%.*]] = extractelement <2 x i64> [[TMP28]], i64 0
+; AVX2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP29]], 0
+; AVX2-NEXT: [[TMP30:%.*]] = extractelement <2 x i64> [[TMP28]], i64 1
+; AVX2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP30]], 1
+; AVX2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
+;
+; AVX512-LABEL: @avgr_8_u16(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[TMP0:%.*]] = trunc i64 [[A_COERCE0:%.*]] to i32
+; AVX512-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[A_COERCE0]], i64 0
+; AVX512-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[B_COERCE1:%.*]], i64 1
+; AVX512-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[TMP2]], <i64 48, i64 32>
+; AVX512-NEXT: [[TMP4:%.*]] = trunc i64 [[A_COERCE1:%.*]] to i32
+; AVX512-NEXT: [[A_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32
+; AVX512-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[B_COERCE0:%.*]], i64 0
+; AVX512-NEXT: [[TMP6:%.*]] = trunc <2 x i64> [[TMP5]] to <2 x i32>
+; AVX512-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[A_COERCE1]], i64 1
+; AVX512-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], splat (i64 65535)
+; AVX512-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP5]], splat (i64 65535)
+; AVX512-NEXT: [[CONV_6:%.*]] = and i64 [[A_SROA_8_8_EXTRACT_SHIFT]], 65535
+; AVX512-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP7]], <i64 32, i64 0>
+; AVX512-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP0]], i64 0
+; AVX512-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP4]], i64 1
+; AVX512-NEXT: [[TMP13:%.*]] = lshr <2 x i32> [[TMP12]], splat (i32 16)
+; AVX512-NEXT: [[TMP14:%.*]] = lshr <2 x i64> [[TMP5]], <i64 32, i64 0>
+; AVX512-NEXT: [[TMP15:%.*]] = lshr <2 x i32> [[TMP6]], splat (i32 16)
+; AVX512-NEXT: [[TMP16:%.*]] = and <2 x i64> [[TMP10]], <i64 65535, i64 poison>
+; AVX512-NEXT: [[TMP17:%.*]] = lshr <2 x i64> [[TMP10]], <i64 65535, i64 48>
+; AVX512-NEXT: [[TMP18:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP17]], <2 x i32> <i32 0, i32 3>
+; AVX512-NEXT: [[TMP19:%.*]] = and <2 x i64> [[TMP14]], <i64 65535, i64 poison>
+; AVX512-NEXT: [[TMP20:%.*]] = lshr <2 x i64> [[TMP14]], <i64 65535, i64 48>
+; AVX512-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP19]], <2 x i64> [[TMP20]], <2 x i32> <i32 0, i32 3>
+; AVX512-NEXT: [[TMP22:%.*]] = add nuw nsw <2 x i64> [[TMP8]], splat (i64 1)
+; AVX512-NEXT: [[TMP23:%.*]] = add nuw nsw <2 x i64> [[TMP22]], [[TMP9]]
+; AVX512-NEXT: [[TMP24:%.*]] = lshr <2 x i64> [[TMP23]], splat (i64 1)
+; AVX512-NEXT: [[TMP25:%.*]] = add nuw nsw <2 x i32> [[TMP13]], splat (i32 1)
+; AVX512-NEXT: [[TMP26:%.*]] = add nuw nsw <2 x i32> [[TMP25]], [[TMP15]]
+; AVX512-NEXT: [[TMP27:%.*]] = add nuw nsw <2 x i64> [[TMP3]], <i64 1, i64 poison>
+; AVX512-NEXT: [[TMP28:%.*]] = and <2 x i64> [[TMP3]], <i64 poison, i64 65535>
+; AVX512-NEXT: [[TMP29:%.*]] = shufflevector <2 x i64> [[TMP27]], <2 x i64> [[TMP28]], <2 x i32> <i32 0, i32 3>
+; AVX512-NEXT: [[TMP30:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[CONV_6]], i64 1
+; AVX512-NEXT: [[TMP31:%.*]] = lshr <2 x i64> [[TMP30]], <i64 48, i64 1>
+; AVX512-NEXT: [[TMP32:%.*]] = add nuw nsw <2 x i64> [[TMP30]], <i64 poison, i64 1>
+; AVX512-NEXT: [[TMP33:%.*]] = shufflevector <2 x i64> [[TMP31]], <2 x i64> [[TMP32]], <2 x i32> <i32 0, i32 3>
+; AVX512-NEXT: [[TMP34:%.*]] = add nuw nsw <2 x i64> [[TMP29]], [[TMP33]]
+; AVX512-NEXT: [[TMP35:%.*]] = add nuw nsw <2 x i64> [[TMP18]], splat (i64 1)
+; AVX512-NEXT: [[TMP36:%.*]] = add nuw nsw <2 x i64> [[TMP35]], [[TMP21]]
+; AVX512-NEXT: [[TMP37:%.*]] = shl nuw <2 x i64> [[TMP36]], <i64 31, i64 47>
+; AVX512-NEXT: [[TMP38:%.*]] = and <2 x i64> [[TMP37]], <i64 281470681743360, i64 -281474976710656>
+; AVX512-NEXT: [[TMP39:%.*]] = shl nuw <2 x i64> [[TMP34]], <i64 47, i64 31>
+; AVX512-NEXT: [[TMP40:%.*]] = and <2 x i64> [[TMP39]], <i64 -281474976710656, i64 281470681743360>
+; AVX512-NEXT: [[TMP41:%.*]] = or disjoint <2 x i64> [[TMP38]], [[TMP40]]
+; AVX512-NEXT: [[TMP42:%.*]] = shl nuw <2 x i32> [[TMP26]], splat (i32 15)
+; AVX512-NEXT: [[TMP43:%.*]] = and <2 x i32> [[TMP42]], splat (i32 -65536)
+; AVX512-NEXT: [[TMP44:%.*]] = zext <2 x i32> [[TMP43]] to <2 x i64>
+; AVX512-NEXT: [[TMP45:%.*]] = or disjoint <2 x i64> [[TMP41]], [[TMP44]]
+; AVX512-NEXT: [[TMP46:%.*]] = or disjoint <2 x i64> [[TMP45]], [[TMP24]]
+; AVX512-NEXT: [[TMP47:%.*]] = extractelement <2 x i64> [[TMP46]], i64 0
+; AVX512-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP47]], 0
+; AVX512-NEXT: [[TMP48:%.*]] = extractelement <2 x i64> [[TMP46]], i64 1
+; AVX512-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP48]], 1
+; AVX512-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
+;
+entry:
+ %retval = alloca %"struct.std::array8", align 2
+ %a = alloca %"struct.std::array8", align 2
+ %b = alloca %"struct.std::array8", align 2
+ store i64 %a.coerce0, ptr %a, align 2
+ %0 = getelementptr inbounds nuw i8, ptr %a, i64 8
+ store i64 %a.coerce1, ptr %0, align 2
+ store i64 %b.coerce0, ptr %b, align 2
+ %1 = getelementptr inbounds nuw i8, ptr %b, i64 8
+ store i64 %b.coerce1, ptr %1, align 2
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp samesign ult i64 %i.0, 8
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond
+ %.fca.0.load = load i64, ptr %retval, align 2
+ %.fca.0.insert = insertvalue { i64, i64 } poison, i64 %.fca.0.load, 0
+ %.fca.1.gep = getelementptr inbounds nuw i8, ptr %retval, i64 8
+ %.fca.1.load = load i64, ptr %.fca.1.gep, align 2
+ %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %.fca.1.load, 1
+ ret { i64, i64 } %.fca.1.insert
+
+for.body: ; preds = %for.cond
+ %arrayidx.i = getelementptr inbounds nuw [2 x i8], ptr %a, i64 %i.0
+ %2 = load i16, ptr %arrayidx.i, align 2
+ %conv = zext i16 %2 to i32
+ %arrayidx.i10 = getelementptr inbounds nuw [2 x i8], ptr %b, i64 %i.0
+ %3 = load i16, ptr %arrayidx.i10, align 2
+ %conv2 = zext i16 %3 to i32
+ %add = add nuw nsw i32 %conv, %conv2
+ %add3 = add nuw nsw i32 %add, 1
+ %shr = lshr i32 %add3, 1
+ %conv4 = trunc nuw i32 %shr to i16
+ %arrayidx.i11 = getelementptr inbounds nuw [2 x i8], ptr %retval, i64 %i.0
+ store i16 %conv4, ptr %arrayidx.i11, align 2
+ %inc = add nuw nsw i64 %i.0, 1
+ br label %for.cond
+}
+
+define { i64, i64 } @avgr_8_u16_alt(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) {
+; SSE2-LABEL: @avgr_8_u16_alt(
+; SSE2-NEXT: entry:
+; SSE2-NEXT: [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0:%.*]], 16
+; SSE2-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32
+; SSE2-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
+; SSE2-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE0]] to i16
+; SSE2-NEXT: [[A_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_2_0_EXTRACT_SHIFT]] to i16
+; SSE2-NEXT: [[A_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_3_0_EXTRACT_SHIFT]] to i16
+; SSE2-NEXT: [[A_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_4_0_EXTRACT_SHIFT]] to i16
+; SSE2-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0:%.*]], 16
+; SSE2-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32
+; SSE2-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
+; SSE2-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE0]] to i16
+; SSE2-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_2_0_EXTRACT_SHIFT]] to i16
+; SSE2-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_3_0_EXTRACT_SHIFT]] to i16
+; SSE2-NEXT: [[B_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_4_0_EXTRACT_SHIFT]] to i16
+; SSE2-NEXT: [[SHR:%.*]] = lshr i16 [[A_SROA_0_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR_1:%.*]] = lshr i16 [[A_SROA_2_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR_2:%.*]] = lshr i16 [[A_SROA_3_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR_3:%.*]] = lshr i16 [[A_SROA_4_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5:%.*]] = lshr i16 [[B_SROA_0_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_1:%.*]] = lshr i16 [[B_SROA_2_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_2:%.*]] = lshr i16 [[B_SROA_3_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_3:%.*]] = lshr i16 [[B_SROA_4_0_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW:%.*]] = add nuw i16 [[SHR5]], [[SHR]]
+; SSE2-NEXT: [[NARROW_1:%.*]] = add nuw i16 [[SHR5_1]], [[SHR_1]]
+; SSE2-NEXT: [[NARROW_2:%.*]] = add nuw i16 [[SHR5_2]], [[SHR_2]]
+; SSE2-NEXT: [[NARROW_3:%.*]] = add nuw i16 [[SHR5_3]], [[SHR_3]]
+; SSE2-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B_SROA_4_0_EXTRACT_TRUNC]], i64 0
+; SSE2-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[B_SROA_3_0_EXTRACT_TRUNC]], i64 1
+; SSE2-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[B_SROA_2_0_EXTRACT_TRUNC]], i64 2
+; SSE2-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[B_SROA_0_0_EXTRACT_TRUNC]], i64 3
+; SSE2-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> poison, i16 [[A_SROA_4_0_EXTRACT_TRUNC]], i64 0
+; SSE2-NEXT: [[TMP5:%.*]] = insertelement <4 x i16> [[TMP4]], i16 [[A_SROA_3_0_EXTRACT_TRUNC]], i64 1
+; SSE2-NEXT: [[TMP6:%.*]] = insertelement <4 x i16> [[TMP5]], i16 [[A_SROA_2_0_EXTRACT_TRUNC]], i64 2
+; SSE2-NEXT: [[TMP7:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[A_SROA_0_0_EXTRACT_TRUNC]], i64 3
+; SSE2-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP3]], [[TMP7]]
+; SSE2-NEXT: [[TMP9:%.*]] = and <4 x i16> [[TMP8]], splat (i16 1)
+; SSE2-NEXT: [[TMP10:%.*]] = insertelement <4 x i16> poison, i16 [[NARROW_3]], i64 0
+; SSE2-NEXT: [[TMP11:%.*]] = insertelement <4 x i16> [[TMP10]], i16 [[NARROW_2]], i64 1
+; SSE2-NEXT: [[TMP12:%.*]] = insertelement <4 x i16> [[TMP11]], i16 [[NARROW_1]], i64 2
+; SSE2-NEXT: [[TMP13:%.*]] = insertelement <4 x i16> [[TMP12]], i16 [[NARROW]], i64 3
+; SSE2-NEXT: [[TMP14:%.*]] = add nuw <4 x i16> [[TMP13]], [[TMP9]]
+; SSE2-NEXT: [[TMP15:%.*]] = shufflevector <4 x i16> [[TMP14]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE2-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP15]] to i64
+; SSE2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP16]], 0
+; SSE2-NEXT: [[A_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1:%.*]], 16
+; SSE2-NEXT: [[A_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32
+; SSE2-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
+; SSE2-NEXT: [[A_SROA_5_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE1]] to i16
+; SSE2-NEXT: [[A_SROA_7_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_7_8_EXTRACT_SHIFT]] to i16
+; SSE2-NEXT: [[A_SROA_8_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_8_8_EXTRACT_SHIFT]] to i16
+; SSE2-NEXT: [[A_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_9_8_EXTRACT_SHIFT]] to i16
+; SSE2-NEXT: [[B_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1:%.*]], 16
+; SSE2-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32
+; SSE2-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
+; SSE2-NEXT: [[B_SROA_5_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE1]] to i16
+; SSE2-NEXT: [[B_SROA_7_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_7_8_EXTRACT_SHIFT]] to i16
+; SSE2-NEXT: [[B_SROA_8_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_8_8_EXTRACT_SHIFT]] to i16
+; SSE2-NEXT: [[B_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_9_8_EXTRACT_SHIFT]] to i16
+; SSE2-NEXT: [[SHR_4:%.*]] = lshr i16 [[A_SROA_5_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR_5:%.*]] = lshr i16 [[A_SROA_7_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR_6:%.*]] = lshr i16 [[A_SROA_8_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR_7:%.*]] = lshr i16 [[A_SROA_9_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_4:%.*]] = lshr i16 [[B_SROA_5_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_5:%.*]] = lshr i16 [[B_SROA_7_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_6:%.*]] = lshr i16 [[B_SROA_8_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[SHR5_7:%.*]] = lshr i16 [[B_SROA_9_8_EXTRACT_TRUNC]], 1
+; SSE2-NEXT: [[NARROW_4:%.*]] = add nuw i16 [[SHR5_4]], [[SHR_4]]
+; SSE2-NEXT: [[NARROW_5:%.*]] = add nuw i16 [[SHR5_5]], [[SHR_5]]
+; SSE2-NEXT: [[NARROW_6:%.*]] = add nuw i16 [[SHR5_6]], [[SHR_6]]
+; SSE2-NEXT: [[NARROW_7:%.*]] = add nuw i16 [[SHR5_7]], [[SHR_7]]
+; SSE2-NEXT: [[TMP17:%.*]] = insertelement <4 x i16> poison, i16 [[B_SROA_9_8_EXTRACT_TRUNC]], i64 0
+; SSE2-NEXT: [[TMP18:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[B_SROA_8_8_EXTRACT_TRUNC]], i64 1
+; SSE2-NEXT: [[TMP19:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[B_SROA_7_8_EXTRACT_TRUNC]], i64 2
+; SSE2-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> [[TMP19]], i16 [[B_SROA_5_8_EXTRACT_TRUNC]], i64 3
+; SSE2-NEXT: [[TMP21:%.*]] = insertelement <4 x i16> poison, i16 [[A_SROA_9_8_EXTRACT_TRUNC]], i64 0
+; SSE2-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> [[TMP21]], i16 [[A_SROA_8_8_EXTRACT_TRUNC]], i64 1
+; SSE2-NEXT: [[TMP23:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[A_SROA_7_8_EXTRACT_TRUNC]], i64 2
+; SSE2-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> [[TMP23]], i16 [[A_SROA_5_8_EXTRACT_TRUNC]], i64 3
+; SSE2-NEXT: [[TMP25:%.*]] = or <4 x i16> [[TMP20]], [[TMP24]]
+; SSE2-NEXT: [[TMP26:%.*]] = and <4 x i16> [[TMP25]], splat (i16 1)
+; SSE2-NEXT: [[TMP27:%.*]] = insertelement <4 x i16> poison, i16 [[NARROW_7]], i64 0
+; SSE2-NEXT: [[TMP28:%.*]] = insertelement <4 x i16> [[TMP27]], i16 [[NARROW_6]], i64 1
+; SSE2-NEXT: [[TMP29:%.*]] = insertelement <4 x i16> [[TMP28]], i16 [[NARROW_5]], i64 2
+; SSE2-NEXT: [[TMP30:%.*]] = insertelement <4 x i16> [[TMP29]], i16 [[NARROW_4]], i64 3
+; SSE2-NEXT: [[TMP31:%.*]] = add nuw <4 x i16> [[TMP30]], [[TMP26]]
+; SSE2-NEXT: [[TMP32:%.*]] = shufflevector <4 x i16> [[TMP31]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE2-NEXT: [[TMP33:%.*]] = bitcast <4 x i16> [[TMP32]] to i64
+; SSE2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP33]], 1
+; SSE2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
+;
+; SSE4-LABEL: @avgr_8_u16_alt(
+; SSE4-NEXT: entry:
+; SSE4-NEXT: [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0:%.*]], 16
+; SSE4-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32
+; SSE4-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
+; SSE4-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE0]] to i16
+; SSE4-NEXT: [[A_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_2_0_EXTRACT_SHIFT]] to i16
+; SSE4-NEXT: [[A_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_3_0_EXTRACT_SHIFT]] to i16
+; SSE4-NEXT: [[A_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_4_0_EXTRACT_SHIFT]] to i16
+; SSE4-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0:%.*]], 16
+; SSE4-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32
+; SSE4-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
+; SSE4-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE0]] to i16
+; SSE4-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_2_0_EXTRACT_SHIFT]] to i16
+; SSE4-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_3_0_EXTRACT_SHIFT]] to i16
+; SSE4-NEXT: [[B_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_4_0_EXTRACT_SHIFT]] to i16
+; SSE4-NEXT: [[SHR:%.*]] = lshr i16 [[A_SROA_0_0_EXTRACT_TRUNC]], 1
+; SSE4-NEXT: [[SHR_1:%.*]] = lshr i16 [[A_SROA_2_0_EXTRACT_TRUNC]], 1
+; SSE4-NEXT: [[SHR_2:%.*]] = lshr i16 [[A_SROA_3_0_EXTRACT_TRUNC]], 1
+; SSE4-NEXT: [[SHR_3:%.*]] = lshr i16 [[A_SROA_4_0_EXTRACT_TRUNC]], 1
+; SSE4-NEXT: [[SHR5:%.*]] = lshr i16 [[B_SROA_0_0_EXTRACT_TRUNC]], 1
+; SSE4-NEXT: [[SHR5_1:%.*]] = lshr i16 [[B_SROA_2_0_EXTRACT_TRUNC]], 1
+; SSE4-NEXT: [[SHR5_2:%.*]] = lshr i16 [[B_SROA_3_0_EXTRACT_TRUNC]], 1
+; SSE4-NEXT: [[SHR5_3:%.*]] = lshr i16 [[B_SROA_4_0_EXTRACT_TRUNC]], 1
+; SSE4-NEXT: [[NARROW:%.*]] = add nuw i16 [[SHR5]], [[SHR]]
+; SSE4-NEXT: [[NARROW_1:%.*]] = add nuw i16 [[SHR5_1]], [[SHR_1]]
+; SSE4-NEXT: [[NARROW_2:%.*]] = add nuw i16 [[SHR5_2]], [[SHR_2]]
+; SSE4-NEXT: [[NARROW_3:%.*]] = add nuw i16 [[SHR5_3]], [[SHR_3]]
+; SSE4-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[B_SROA_4_0_EXTRACT_TRUNC]], i64 0
+; SSE4-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[B_SROA_3_0_EXTRACT_TRUNC]], i64 1
+; SSE4-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[B_SROA_2_0_EXTRACT_TRUNC]], i64 2
+; SSE4-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[B_SROA_0_0_EXTRACT_TRUNC]], i64 3
+; SSE4-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> poison, i16 [[A_SROA_4_0_EXTRACT_TRUNC]], i64 0
+; SSE4-NEXT: [[TMP5:%.*]] = insertelement <4 x i16> [[TMP4]], i16 [[A_SROA_3_0_EXTRACT_TRUNC]], i64 1
+; SSE4-NEXT: [[TMP6:%.*]] = insertelement <4 x i16> [[TMP5]], i16 [[A_SROA_2_0_EXTRACT_TRUNC]], i64 2
+; SSE4-NEXT: [[TMP7:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[A_SROA_0_0_EXTRACT_TRUNC]], i64 3
+; SSE4-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP3]], [[TMP7]]
+; SSE4-NEXT: [[TMP9:%.*]] = and <4 x i16> [[TMP8]], splat (i16 1)
+; SSE4-NEXT: [[TMP10:%.*]] = insertelement <4 x i16> poison, i16 [[NARROW_3]], i64 0
+; SSE4-NEXT: [[TMP11:%.*]] = insertelement <4 x i16> [[TMP10]], i16 [[NARROW_2]], i64 1
+; SSE4-NEXT: [[TMP12:%.*]] = insertelement <4 x i16> [[TMP11]], i16 [[NARROW_1]], i64 2
+; SSE4-NEXT: [[TMP13:%.*]] = insertelement <4 x i16> [[TMP12]], i16 [[NARROW]], i64 3
+; SSE4-NEXT: [[TMP14:%.*]] = add nuw <4 x i16> [[TMP13]], [[TMP9]]
+; SSE4-NEXT: [[TMP15:%.*]] = shufflevector <4 x i16> [[TMP14]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE4-NEXT: [[TMP16:%.*]] = bitcast <4 x i16> [[TMP15]] to i64
+; SSE4-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP16]], 0
+; SSE4-NEXT: [[A_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1:%.*]], 16
+; SSE4-NEXT: [[A_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32
+; SSE4-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
+; SSE4-NEXT: [[A_SROA_5_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE1]] to i16
+; SSE4-NEXT: [[A_SROA_7_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_7_8_EXTRACT_SHIFT]] to i16
+; SSE4-NEXT: [[A_SROA_8_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_8_8_EXTRACT_SHIFT]] to i16
+; SSE4-NEXT: [[A_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_9_8_EXTRACT_SHIFT]] to i16
+; SSE4-NEXT: [[B_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1:%.*]], 16
+; SSE4-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32
+; SSE4-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
+; SSE4-NEXT: [[B_SROA_5_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE1]] to i16
+; SSE4-NEXT: [[B_SROA_7_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_7_8_EXTRACT_SHIFT]] to i16
+; SSE4-NEXT: [[B_SROA_8_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_8_8_EXTRACT_SHIFT]] to i16
+; SSE4-NEXT: [[B_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_9_8_EXTRACT_SHIFT]] to i16
+; SSE4-NEXT: [[SHR_4:%.*]] = lshr i16 [[A_SROA_5_8_EXTRACT_TRUNC]], 1
+; SSE4-NEXT: [[SHR_5:%.*]] = lshr i16 [[A_SROA_7_8_EXTRACT_TRUNC]], 1
+; SSE4-NEXT: [[SHR_6:%.*]] = lshr i16 [[A_SROA_8_8_EXTRACT_TRUNC]], 1
+; SSE4-NEXT: [[SHR_7:%.*]] = lshr i16 [[A_SROA_9_8_EXTRACT_TRUNC]], 1
+; SSE4-NEXT: [[SHR5_4:%.*]] = lshr i16 [[B_SROA_5_8_EXTRACT_TRUNC]], 1
+; SSE4-NEXT: [[SHR5_5:%.*]] = lshr i16 [[B_SROA_7_8_EXTRACT_TRUNC]], 1
+; SSE4-NEXT: [[SHR5_6:%.*]] = lshr i16 [[B_SROA_8_8_EXTRACT_TRUNC]], 1
+; SSE4-NEXT: [[SHR5_7:%.*]] = lshr i16 [[B_SROA_9_8_EXTRACT_TRUNC]], 1
+; SSE4-NEXT: [[NARROW_4:%.*]] = add nuw i16 [[SHR5_4]], [[SHR_4]]
+; SSE4-NEXT: [[NARROW_5:%.*]] = add nuw i16 [[SHR5_5]], [[SHR_5]]
+; SSE4-NEXT: [[NARROW_6:%.*]] = add nuw i16 [[SHR5_6]], [[SHR_6]]
+; SSE4-NEXT: [[NARROW_7:%.*]] = add nuw i16 [[SHR5_7]], [[SHR_7]]
+; SSE4-NEXT: [[TMP17:%.*]] = insertelement <4 x i16> poison, i16 [[B_SROA_9_8_EXTRACT_TRUNC]], i64 0
+; SSE4-NEXT: [[TMP18:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[B_SROA_8_8_EXTRACT_TRUNC]], i64 1
+; SSE4-NEXT: [[TMP19:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[B_SROA_7_8_EXTRACT_TRUNC]], i64 2
+; SSE4-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> [[TMP19]], i16 [[B_SROA_5_8_EXTRACT_TRUNC]], i64 3
+; SSE4-NEXT: [[TMP21:%.*]] = insertelement <4 x i16> poison, i16 [[A_SROA_9_8_EXTRACT_TRUNC]], i64 0
+; SSE4-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> [[TMP21]], i16 [[A_SROA_8_8_EXTRACT_TRUNC]], i64 1
+; SSE4-NEXT: [[TMP23:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[A_SROA_7_8_EXTRACT_TRUNC]], i64 2
+; SSE4-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> [[TMP23]], i16 [[A_SROA_5_8_EXTRACT_TRUNC]], i64 3
+; SSE4-NEXT: [[TMP25:%.*]] = or <4 x i16> [[TMP20]], [[TMP24]]
+; SSE4-NEXT: [[TMP26:%.*]] = and <4 x i16> [[TMP25]], splat (i16 1)
+; SSE4-NEXT: [[TMP27:%.*]] = insertelement <4 x i16> poison, i16 [[NARROW_7]], i64 0
+; SSE4-NEXT: [[TMP28:%.*]] = insertelement <4 x i16> [[TMP27]], i16 [[NARROW_6]], i64 1
+; SSE4-NEXT: [[TMP29:%.*]] = insertelement <4 x i16> [[TMP28]], i16 [[NARROW_5]], i64 2
+; SSE4-NEXT: [[TMP30:%.*]] = insertelement <4 x i16> [[TMP29]], i16 [[NARROW_4]], i64 3
+; SSE4-NEXT: [[TMP31:%.*]] = add nuw <4 x i16> [[TMP30]], [[TMP26]]
+; SSE4-NEXT: [[TMP32:%.*]] = shufflevector <4 x i16> [[TMP31]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SSE4-NEXT: [[TMP33:%.*]] = bitcast <4 x i16> [[TMP32]] to i64
+; SSE4-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP33]], 1
+; SSE4-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
+;
+; AVX2-LABEL: @avgr_8_u16_alt(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: [[A_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0:%.*]], 16
+; AVX2-NEXT: [[A_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 32
+; AVX2-NEXT: [[A_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE0]], 48
+; AVX2-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE0]] to i16
+; AVX2-NEXT: [[A_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_2_0_EXTRACT_SHIFT]] to i16
+; AVX2-NEXT: [[A_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_3_0_EXTRACT_SHIFT]] to i16
+; AVX2-NEXT: [[A_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_4_0_EXTRACT_SHIFT]] to i16
+; AVX2-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0:%.*]], 16
+; AVX2-NEXT: [[B_SROA_3_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 32
+; AVX2-NEXT: [[B_SROA_4_0_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE0]], 48
+; AVX2-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE0]] to i16
+; AVX2-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_2_0_EXTRACT_SHIFT]] to i16
+; AVX2-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_3_0_EXTRACT_SHIFT]] to i16
+; AVX2-NEXT: [[B_SROA_4_0_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_4_0_EXTRACT_SHIFT]] to i16
+; AVX2-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> poison, i16 [[A_SROA_4_0_EXTRACT_TRUNC]], i64 0
+; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> [[TMP0]], i16 [[A_SROA_3_0_EXTRACT_TRUNC]], i64 1
+; AVX2-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[A_SROA_2_0_EXTRACT_TRUNC]], i64 2
+; AVX2-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[A_SROA_0_0_EXTRACT_TRUNC]], i64 3
+; AVX2-NEXT: [[TMP4:%.*]] = lshr <4 x i16> [[TMP3]], splat (i16 1)
+; AVX2-NEXT: [[TMP5:%.*]] = insertelement <4 x i16> poison, i16 [[B_SROA_4_0_EXTRACT_TRUNC]], i64 0
+; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i16> [[TMP5]], i16 [[B_SROA_3_0_EXTRACT_TRUNC]], i64 1
+; AVX2-NEXT: [[TMP7:%.*]] = insertelement <4 x i16> [[TMP6]], i16 [[B_SROA_2_0_EXTRACT_TRUNC]], i64 2
+; AVX2-NEXT: [[TMP8:%.*]] = insertelement <4 x i16> [[TMP7]], i16 [[B_SROA_0_0_EXTRACT_TRUNC]], i64 3
+; AVX2-NEXT: [[TMP9:%.*]] = lshr <4 x i16> [[TMP8]], splat (i16 1)
+; AVX2-NEXT: [[TMP10:%.*]] = add nuw <4 x i16> [[TMP9]], [[TMP4]]
+; AVX2-NEXT: [[TMP11:%.*]] = or <4 x i16> [[TMP8]], [[TMP3]]
+; AVX2-NEXT: [[TMP12:%.*]] = and <4 x i16> [[TMP11]], splat (i16 1)
+; AVX2-NEXT: [[TMP13:%.*]] = add nuw <4 x i16> [[TMP10]], [[TMP12]]
+; AVX2-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[TMP13]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; AVX2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP15]], 0
+; AVX2-NEXT: [[A_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1:%.*]], 16
+; AVX2-NEXT: [[A_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 32
+; AVX2-NEXT: [[A_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[A_COERCE1]], 48
+; AVX2-NEXT: [[A_SROA_5_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_COERCE1]] to i16
+; AVX2-NEXT: [[A_SROA_7_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_7_8_EXTRACT_SHIFT]] to i16
+; AVX2-NEXT: [[A_SROA_8_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_8_8_EXTRACT_SHIFT]] to i16
+; AVX2-NEXT: [[A_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[A_SROA_9_8_EXTRACT_SHIFT]] to i16
+; AVX2-NEXT: [[B_SROA_7_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1:%.*]], 16
+; AVX2-NEXT: [[B_SROA_8_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 32
+; AVX2-NEXT: [[B_SROA_9_8_EXTRACT_SHIFT:%.*]] = lshr i64 [[B_COERCE1]], 48
+; AVX2-NEXT: [[B_SROA_5_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_COERCE1]] to i16
+; AVX2-NEXT: [[B_SROA_7_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_7_8_EXTRACT_SHIFT]] to i16
+; AVX2-NEXT: [[B_SROA_8_8_EXTRACT_TRUNC:%.*]] = trunc i64 [[B_SROA_8_8_EXTRACT_SHIFT]] to i16
+; AVX2-NEXT: [[B_SROA_9_8_EXTRACT_TRUNC:%.*]] = trunc nuw i64 [[B_SROA_9_8_EXTRACT_SHIFT]] to i16
+; AVX2-NEXT: [[TMP16:%.*]] = insertelement <4 x i16> poison, i16 [[A_SROA_9_8_EXTRACT_TRUNC]], i64 0
+; AVX2-NEXT: [[TMP17:%.*]] = insertelement <4 x i16> [[TMP16]], i16 [[A_SROA_8_8_EXTRACT_TRUNC]], i64 1
+; AVX2-NEXT: [[TMP18:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[A_SROA_7_8_EXTRACT_TRUNC]], i64 2
+; AVX2-NEXT: [[TMP19:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[A_SROA_5_8_EXTRACT_TRUNC]], i64 3
+; AVX2-NEXT: [[TMP20:%.*]] = lshr <4 x i16> [[TMP19]], splat (i16 1)
+; AVX2-NEXT: [[TMP21:%.*]] = insertelement <4 x i16> poison, i16 [[B_SROA_9_8_EXTRACT_TRUNC]], i64 0
+; AVX2-NEXT: [[TMP22:%.*]] = insertelement <4 x i16> [[TMP21]], i16 [[B_SROA_8_8_EXTRACT_TRUNC]], i64 1
+; AVX2-NEXT: [[TMP23:%.*]] = insertelement <4 x i16> [[TMP22]], i16 [[B_SROA_7_8_EXTRACT_TRUNC]], i64 2
+; AVX2-NEXT: [[TMP24:%.*]] = insertelement <4 x i16> [[TMP23]], i16 [[B_SROA_5_8_EXTRACT_TRUNC]], i64 3
+; AVX2-NEXT: [[TMP25:%.*]] = lshr <4 x i16> [[TMP24]], splat (i16 1)
+; AVX2-NEXT: [[TMP26:%.*]] = add nuw <4 x i16> [[TMP25]], [[TMP20]]
+; AVX2-NEXT: [[TMP27:%.*]] = or <4 x i16> [[TMP24]], [[TMP19]]
+; AVX2-NEXT: [[TMP28:%.*]] = and <4 x i16> [[TMP27]], splat (i16 1)
+; AVX2-NEXT: [[TMP29:%.*]] = add nuw <4 x i16> [[TMP26]], [[TMP28]]
+; AVX2-NEXT: [[TMP30:%.*]] = shufflevector <4 x i16> [[TMP29]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX2-NEXT: [[TMP31:%.*]] = bitcast <4 x i16> [[TMP30]] to i64
+; AVX2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP31]], 1
+; AVX2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
+;
+; AVX512-LABEL: @avgr_8_u16_alt(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[A_COERCE0:%.*]], i64 0
+; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <4 x i32> zeroinitializer
+; AVX512-NEXT: [[TMP2:%.*]] = lshr <4 x i64> [[TMP1]], <i64 48, i64 32, i64 16, i64 0>
+; AVX512-NEXT: [[TMP3:%.*]] = trunc <4 x i64> [[TMP2]] to <4 x i16>
+; AVX512-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> poison, i64 [[B_COERCE0:%.*]], i64 0
+; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <4 x i32> zeroinitializer
+; AVX512-NEXT: [[TMP6:%.*]] = lshr <4 x i64> [[TMP5]], <i64 48, i64 32, i64 16, i64 0>
+; AVX512-NEXT: [[TMP7:%.*]] = trunc <4 x i64> [[TMP6]] to <4 x i16>
+; AVX512-NEXT: [[TMP8:%.*]] = lshr <4 x i16> [[TMP3]], splat (i16 1)
+; AVX512-NEXT: [[TMP9:%.*]] = lshr <4 x i16> [[TMP7]], splat (i16 1)
+; AVX512-NEXT: [[TMP10:%.*]] = add nuw <4 x i16> [[TMP9]], [[TMP8]]
+; AVX512-NEXT: [[TMP11:%.*]] = or <4 x i16> [[TMP7]], [[TMP3]]
+; AVX512-NEXT: [[TMP12:%.*]] = and <4 x i16> [[TMP11]], splat (i16 1)
+; AVX512-NEXT: [[TMP13:%.*]] = add nuw <4 x i16> [[TMP10]], [[TMP12]]
+; AVX512-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[TMP13]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[TMP15:%.*]] = bitcast <4 x i16> [[TMP14]] to i64
+; AVX512-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP15]], 0
+; AVX512-NEXT: [[TMP16:%.*]] = insertelement <4 x i64> poison, i64 [[A_COERCE1:%.*]], i64 0
+; AVX512-NEXT: [[TMP17:%.*]] = shufflevector <4 x i64> [[TMP16]], <4 x i64> poison, <4 x i32> zeroinitializer
+; AVX512-NEXT: [[TMP18:%.*]] = lshr <4 x i64> [[TMP17]], <i64 48, i64 32, i64 16, i64 0>
+; AVX512-NEXT: [[TMP19:%.*]] = trunc <4 x i64> [[TMP18]] to <4 x i16>
+; AVX512-NEXT: [[TMP20:%.*]] = insertelement <4 x i64> poison, i64 [[B_COERCE1:%.*]], i64 0
+; AVX512-NEXT: [[TMP21:%.*]] = shufflevector <4 x i64> [[TMP20]], <4 x i64> poison, <4 x i32> zeroinitializer
+; AVX512-NEXT: [[TMP22:%.*]] = lshr <4 x i64> [[TMP21]], <i64 48, i64 32, i64 16, i64 0>
+; AVX512-NEXT: [[TMP23:%.*]] = trunc <4 x i64> [[TMP22]] to <4 x i16>
+; AVX512-NEXT: [[TMP24:%.*]] = lshr <4 x i16> [[TMP19]], splat (i16 1)
+; AVX512-NEXT: [[TMP25:%.*]] = lshr <4 x i16> [[TMP23]], splat (i16 1)
+; AVX512-NEXT: [[TMP26:%.*]] = add nuw <4 x i16> [[TMP25]], [[TMP24]]
+; AVX512-NEXT: [[TMP27:%.*]] = or <4 x i16> [[TMP23]], [[TMP19]]
+; AVX512-NEXT: [[TMP28:%.*]] = and <4 x i16> [[TMP27]], splat (i16 1)
+; AVX512-NEXT: [[TMP29:%.*]] = add nuw <4 x i16> [[TMP26]], [[TMP28]]
+; AVX512-NEXT: [[TMP30:%.*]] = shufflevector <4 x i16> [[TMP29]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; AVX512-NEXT: [[TMP31:%.*]] = bitcast <4 x i16> [[TMP30]] to i64
+; AVX512-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP31]], 1
+; AVX512-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]]
+;
+entry:
+ %retval = alloca %"struct.std::array8", align 2
+ %a = alloca %"struct.std::array8", align 2
+ %b = alloca %"struct.std::array8", align 2
+ store i64 %a.coerce0, ptr %a, align 2
+ %0 = getelementptr inbounds nuw i8, ptr %a, i64 8
+ store i64 %a.coerce1, ptr %0, align 2
+ store i64 %b.coerce0, ptr %b, align 2
+ %1 = getelementptr inbounds nuw i8, ptr %b, i64 8
+ store i64 %b.coerce1, ptr %1, align 2
+ br label %for.cond
+
+for.cond: ; preds = %for.body, %entry
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %cmp = icmp samesign ult i64 %i.0, 8
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond
+ %.fca.0.load = load i64, ptr %retval, align 2
+ %.fca.0.insert = insertvalue { i64, i64 } poison, i64 %.fca.0.load, 0
+ %.fca.1.gep = getelementptr inbounds nuw i8, ptr %retval, i64 8
+ %.fca.1.load = load i64, ptr %.fca.1.gep, align 2
+ %.fca.1.insert = insertvalue { i64, i64 } %.fca.0.insert, i64 %.fca.1.load, 1
+ ret { i64, i64 } %.fca.1.insert
+
+for.body: ; preds = %for.cond
+ %arrayidx.i = getelementptr inbounds nuw [2 x i8], ptr %a, i64 %i.0
+ %2 = load i16, ptr %arrayidx.i, align 2
+ %arrayidx.i22 = getelementptr inbounds nuw [2 x i8], ptr %b, i64 %i.0
+ %3 = load i16, ptr %arrayidx.i22, align 2
+ %shr = lshr i16 %2, 1
+ %shr5 = lshr i16 %3, 1
+ %narrow = add nuw i16 %shr, %shr5
+ %or21 = or i16 %2, %3
+ %4 = and i16 %or21, 1
+ %add12 = add i16 %narrow, %4
+ %arrayidx.i23 = getelementptr inbounds nuw [2 x i8], ptr %retval, i64 %i.0
+ store i16 %add12, ptr %arrayidx.i23, align 2
+ %inc = add nuw nsw i64 %i.0, 1
+ br label %for.cond
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll
index dcfebe32302be..b2e07e77b05c5 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll
@@ -48,6 +48,8 @@ define dso_local void @test(ptr %start, ptr %end) #0 {
; AVX2: vector.ph:
; AVX2-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP3]], 24
; AVX2-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], 9223372036854775776
+; AVX2-NEXT: [[TMP4:%.*]] = shl i64 [[N_VEC]], 2
+; AVX2-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]]
; AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
; AVX2: vector.body:
; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -83,8 +85,6 @@ define dso_local void @test(ptr %start, ptr %end) #0 {
; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; AVX2-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; AVX2: vec.epilog.iter.check:
-; AVX2-NEXT: [[TMP26:%.*]] = shl i64 [[N_VEC]], 2
-; AVX2-NEXT: [[IND_END11:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP26]]
; AVX2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
; AVX2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[BB12_PREHEADER1]], label [[BB12_PREHEADER11]], !prof [[PROF3:![0-9]+]]
; AVX2: vec.epilog.ph:
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
index 541f2cbe29702..92569b4846f79 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@@ -279,15 +279,14 @@ define i1 @cmp_lt_gt(double %a, double %b, double %c) {
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i64 0
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = fdiv <2 x double> [[TMP3]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = fcmp olt <2 x double> [[TMP7]], splat (double 0x3EB0C6F7A0B5ED8D)
+; CHECK-NEXT: [[TMP8:%.*]] = fcmp uge <2 x double> [[TMP7]], splat (double 0x3EB0C6F7A0B5ED8D)
; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x i1> [[TMP8]], <2 x i1> poison, <2 x i32> <i32 1, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i1> [[TMP8]], [[SHIFT]]
-; CHECK-NEXT: [[OR_COND:%.*]] = extractelement <2 x i1> [[TMP9]], i64 0
+; CHECK-NEXT: [[OR_COND:%.*]] = or <2 x i1> [[TMP8]], [[SHIFT]]
; CHECK-NEXT: [[TMP10:%.*]] = fcmp ule <2 x double> [[TMP7]], splat (double 1.000000e+00)
; CHECK-NEXT: [[SHIFT2:%.*]] = shufflevector <2 x i1> [[TMP10]], <2 x i1> poison, <2 x i32> <i32 1, i32 poison>
; CHECK-NEXT: [[TMP11:%.*]] = or <2 x i1> [[TMP10]], [[SHIFT2]]
-; CHECK-NEXT: [[OR_COND1_NOT:%.*]] = extractelement <2 x i1> [[TMP11]], i64 0
-; CHECK-NEXT: [[RETVAL_0:%.*]] = select i1 [[OR_COND]], i1 false, i1 [[OR_COND1_NOT]]
+; CHECK-NEXT: [[OR_COND1_NOT:%.*]] = and <2 x i1> [[OR_COND]], [[TMP11]]
+; CHECK-NEXT: [[RETVAL_0:%.*]] = extractelement <2 x i1> [[OR_COND1_NOT]], i64 0
; CHECK-NEXT: ret i1 [[RETVAL_0]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-bswap.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-bswap.ll
new file mode 100644
index 0000000000000..2f2b31bc1bed0
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-bswap.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -slp-vectorize-non-power-of-2 -mtriple=x86_64-unknown-linux-gnu -mcpu=tigerlake < %s | FileCheck %s
+
+define i32 @test(i8 %0) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: i8 [[TMP0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x i8> poison, i8 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <3 x i8> [[TMP2]], <3 x i8> poison, <3 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = and <3 x i8> [[TMP3]], splat (i8 1)
+; CHECK-NEXT: [[TMP5:%.*]] = zext <3 x i8> [[TMP4]] to <3 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw <3 x i32> [[TMP5]], <i32 16, i32 8, i32 0>
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[TMP6]])
+; CHECK-NEXT: ret i32 [[TMP7]]
+;
+ %2 = and i8 %0, 1
+ %3 = and i8 %0, 1
+ %.sroa.5.0.insert.ext = zext nneg i8 %3 to i32
+ %.sroa.5.0.insert.shift = shl nuw nsw i32 %.sroa.5.0.insert.ext, 16
+ %.sroa.3.0.insert.ext = zext nneg i8 %2 to i32
+ %.sroa.3.0.insert.shift = shl nuw nsw i32 %.sroa.3.0.insert.ext, 8
+ %.sroa.3.0.insert.insert = or disjoint i32 %.sroa.5.0.insert.shift, %.sroa.3.0.insert.shift
+ %4 = and i8 %0, 1
+ %.sroa.0.0.insert.ext = zext nneg i8 %4 to i32
+ %.sroa.0.0.insert.insert = or disjoint i32 %.sroa.3.0.insert.insert, %.sroa.0.0.insert.ext
+ ret i32 %.sroa.0.0.insert.insert
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-bswap-with-larger-reduced-type.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-bswap-with-larger-reduced-type.ll
new file mode 100644
index 0000000000000..04976d1183ff3
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-bswap-with-larger-reduced-type.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define i80 @test(i8 %0, i80 %.sroa.0.7.insert.ext) {
+; CHECK-LABEL: define i80 @test(
+; CHECK-SAME: i8 [[TMP0:%.*]], i80 [[DOTSROA_0_7_INSERT_EXT:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i8> [[TMP2]] to i32
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i128
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i128 [[TMP5]] to i80
+; CHECK-NEXT: [[OP_RDX:%.*]] = or disjoint i80 [[TMP6]], [[DOTSROA_0_7_INSERT_EXT]]
+; CHECK-NEXT: ret i80 [[OP_RDX]]
+;
+entry:
+ %.sroa.0.6.insert.ext = zext i8 %0 to i80
+ %.sroa.0.3.insert.ext = zext i8 %0 to i80
+ %.sroa.0.3.insert.shift = shl nuw nsw i80 %.sroa.0.3.insert.ext, 24
+ %.sroa.0.2.insert.ext = zext i8 %0 to i80
+ %.sroa.0.2.insert.shift = shl nuw nsw i80 %.sroa.0.2.insert.ext, 16
+ %.sroa.0.1.insert.ext = zext i8 %0 to i80
+ %.sroa.0.1.insert.shift = shl nuw nsw i80 %.sroa.0.1.insert.ext, 8
+ %1 = or disjoint i80 %.sroa.0.7.insert.ext, %.sroa.0.6.insert.ext
+ %.sroa.0.3.insert.insert = or disjoint i80 %.sroa.0.3.insert.shift, %.sroa.0.2.insert.shift
+ %.sroa.0.8.insert.mask = or disjoint i80 %1, %.sroa.0.3.insert.insert
+ %.sroa.0.4.insert.mask69 = or disjoint i80 %.sroa.0.1.insert.shift, %.sroa.0.8.insert.mask
+ ret i80 %.sroa.0.4.insert.mask69
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/non-power-of-2-bswap.ll b/llvm/test/Transforms/SLPVectorizer/non-power-of-2-bswap.ll
index f0369e06d07fc..923663c2adb22 100644
--- a/llvm/test/Transforms/SLPVectorizer/non-power-of-2-bswap.ll
+++ b/llvm/test/Transforms/SLPVectorizer/non-power-of-2-bswap.ll
@@ -4,24 +4,14 @@
define i64 @bswap_i24(ptr noalias %p, ptr noalias %p1) {
; CHECK-LABEL: define i64 @bswap_i24(
; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[P1:%.*]]) {
-; CHECK-NEXT: [[G2:%.*]] = getelementptr i8, ptr [[P]], i32 2
-; CHECK-NEXT: [[T2:%.*]] = load i8, ptr [[G2]], align 1
-; CHECK-NEXT: [[G12:%.*]] = getelementptr i8, ptr [[P1]], i32 2
-; CHECK-NEXT: [[T12:%.*]] = load i8, ptr [[G12]], align 1
-; CHECK-NEXT: [[A2:%.*]] = add i8 [[T2]], [[T12]]
-; CHECK-NEXT: [[Z2:%.*]] = zext i8 [[A2]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i8>, ptr [[P]], align 1
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[P1]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i8> [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = zext <2 x i8> [[TMP3]] to <2 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = shl <2 x i32> [[TMP4]], <i32 16, i32 8>
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <3 x i8>, ptr [[P]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <3 x i8>, ptr [[P1]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = add <3 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = zext <3 x i8> [[TMP3]] to <3 x i32>
+; CHECK-NEXT: [[TMP5:%.*]] = shl <3 x i32> [[TMP4]], <i32 16, i32 8, i32 0>
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.or.v3i32(<3 x i32> [[TMP5]])
; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
-; CHECK-NEXT: [[OR01:%.*]] = or disjoint i64 [[TMP7]], [[TMP9]]
-; CHECK-NEXT: [[OR012:%.*]] = or disjoint i64 [[OR01]], [[Z2]]
-; CHECK-NEXT: ret i64 [[OR012]]
+; CHECK-NEXT: ret i64 [[TMP9]]
;
%g1 = getelementptr i8, ptr %p, i32 1
%g2 = getelementptr i8, ptr %p, i32 2
diff --git a/llvm/test/Transforms/SimplifyCFG/extract-cost.ll b/llvm/test/Transforms/SimplifyCFG/extract-cost.ll
index f99856b95328f..4290d9069e5bb 100644
--- a/llvm/test/Transforms/SimplifyCFG/extract-cost.ll
+++ b/llvm/test/Transforms/SimplifyCFG/extract-cost.ll
@@ -37,12 +37,8 @@ define i1 @PR32078(<4 x i32> %a, <4 x i32> %b) {
; CHECK-NEXT: [[CMP0_NOT:%.*]] = xor i1 [[CMP0]], true
; CHECK-NEXT: [[CMP1_NOT:%.*]] = xor i1 [[CMP1]], true
; CHECK-NEXT: [[BRMERGE:%.*]] = select i1 [[CMP0_NOT]], i1 true, i1 [[CMP1_NOT]]
-; CHECK-NEXT: br i1 [[BRMERGE]], label [[EXIT:%.*]], label [[CMP1_TRUE:%.*]]
-; CHECK: cmp1_true:
; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP2]], i1 [[CMP3]], i1 false
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: [[R:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[SPEC_SELECT]], [[CMP1_TRUE]] ]
+; CHECK-NEXT: [[R:%.*]] = select i1 [[BRMERGE]], i1 false, i1 [[SPEC_SELECT]]
; CHECK-NEXT: ret i1 [[R]]
;
entry:
diff --git a/llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll b/llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll
index 8ca034e5454a8..929d135ba6220 100644
--- a/llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll
+++ b/llvm/test/Transforms/SimplifyCFG/switch-transformations-no-lut.ll
@@ -427,13 +427,9 @@ define i1 @single_value_with_mask(i32 %x) {
; TTINOLUT-NEXT: [[SWITCH_SHIFTED:%.*]] = lshr i64 4294967333, [[SWITCH_MASKINDEX]]
; TTINOLUT-NEXT: [[SWITCH_LOBIT:%.*]] = trunc i64 [[SWITCH_SHIFTED]] to i1
; TTINOLUT-NEXT: [[OR_COND:%.*]] = select i1 [[TMP0]], i1 [[SWITCH_LOBIT]], i1 false
-; TTINOLUT-NEXT: br i1 [[OR_COND]], label %[[END:.*]], label %[[DEFAULT:.*]]
-; TTINOLUT: [[DEFAULT]]:
; TTINOLUT-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 80
; TTINOLUT-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i1 false, i1 true
-; TTINOLUT-NEXT: br label %[[END]]
-; TTINOLUT: [[END]]:
-; TTINOLUT-NEXT: [[RES:%.*]] = phi i1 [ [[SEL]], %[[DEFAULT]] ], [ false, %[[ENTRY]] ]
+; TTINOLUT-NEXT: [[RES:%.*]] = select i1 [[OR_COND]], i1 false, i1 [[SEL]]
; TTINOLUT-NEXT: ret i1 [[RES]]
;
entry:
diff --git a/llvm/test/Transforms/StructurizeCFG/hoist-zerocost-nested.ll b/llvm/test/Transforms/StructurizeCFG/hoist-zerocost-nested.ll
new file mode 100644
index 0000000000000..c349709ea0f05
--- /dev/null
+++ b/llvm/test/Transforms/StructurizeCFG/hoist-zerocost-nested.ll
@@ -0,0 +1,151 @@
+; RUN: opt -passes=fix-irreducible,unify-loop-exits,structurizecfg -S %s | FileCheck %s
+;
+target triple = "amdgcn-amd-amdhsa"
+;
+; Reduced from rocPRIM block_sort_kernel compiled with code coverage.
+; The sort comparator `operator<` for custom_type uses nested short-circuit:
+; less<T>(x) || (equal_to<T>(x) && less<U>(y))
+; With PGO instrumentation, each functor call gets a counter block,
+; creating a deeply nested CFG that StructurizeCFG must linearize.
+;
+; The bug: hoistZeroCostElseBlockPhiValues() hoists shufflevector out of
+; the "swap" block into a dominator, and simplifyHoistedPhis() then
+; fills poison entries in Flow phis indiscriminately — causing the
+; hoisted shufflevector result to reach the "no-swap" path, swapping
+; two sort keys that should not be swapped.
+;
+; After structurizecfg, the shufflevector that performs the conditional
+; swap must NOT appear before the first comparison block.
+
+ at counter_less = external addrspace(1) global [4 x i64]
+ at counter_eq = external addrspace(1) global [2 x i64]
+ at counter_swap = external addrspace(1) global [2 x i64]
+
+; Two consecutive compare-and-swap iterations from a sorting network.
+; Each iteration compares key_a vs key_b using:
+; less(a.x, b.x) || (equal(a.x, b.x) && less(a.y, b.y))
+; If true, swap both keys and their index vector.
+;
+; iter1 compares keys[0] vs keys[1]
+; iter2 compares keys[1] vs keys[2], using the (possibly swapped) result of iter1
+;
+; The values_vec (<4 x i32>) tracks the value indices associated with the keys.
+; A shufflevector <0,2,1,3> swaps the middle two elements (the values for the two keys being compared).
+define amdgpu_kernel void @sort_two_iters(
+ i1 %do_iter1, i1 %do_iter2,
+ <2 x float> %key_a, <2 x float> %key_b, <2 x float> %key_c,
+ <4 x i32> %values_vec,
+ ptr addrspace(1) %out_values, ptr addrspace(1) %out_key
+) {
+entry:
+ %cnt_less0 = load i64, ptr addrspace(1) @counter_less, align 8
+ %cnt_eq0 = load i64, ptr addrspace(1) @counter_eq, align 8
+ %cnt_swap0 = load i64, ptr addrspace(1) @counter_swap, align 8
+ br i1 %do_iter1, label %iter1.cmp_x, label %iter1.done
+
+; --- Iteration 1: compare key_a vs key_b ---
+iter1.cmp_x:
+ ; PGO: counter for less<float>(a.x, b.x)
+ %cnt_less1 = add i64 %cnt_less0, 1
+ store i64 %cnt_less1, ptr addrspace(1) @counter_less, align 8
+ %ax = extractelement <2 x float> %key_a, i64 0
+ %bx = extractelement <2 x float> %key_b, i64 0
+ %x_less = fcmp olt float %ax, %bx
+ br i1 %x_less, label %iter1.do_swap, label %iter1.check_eq
+
+iter1.check_eq:
+ ; PGO: counter for equal_to<float>(a.x, b.x)
+ %cnt_eq1 = add i64 %cnt_eq0, 1
+ store i64 %cnt_eq1, ptr addrspace(1) @counter_eq, align 8
+ %x_eq = fcmp oeq float %ax, %bx
+ br i1 %x_eq, label %iter1.cmp_y, label %iter1.done
+
+iter1.cmp_y:
+ ; PGO: counter for less<float>(a.y, b.y)
+ %cnt_less2 = add i64 %cnt_less0, 2
+ store i64 %cnt_less2, ptr addrspace(1) @counter_less, align 8
+ %ay = extractelement <2 x float> %key_a, i64 1
+ %by = extractelement <2 x float> %key_b, i64 1
+ %y_less = fcmp olt float %ay, %by
+ br i1 %y_less, label %iter1.do_swap, label %iter1.done
+
+iter1.do_swap:
+ ; PGO: counter for swap
+ %cnt_swap1 = add i64 %cnt_swap0, 1
+ store i64 %cnt_swap1, ptr addrspace(1) @counter_swap, align 8
+ ; Swap middle two elements of values_vec (the indices for the two compared keys)
+ %swapped1 = shufflevector <4 x i32> %values_vec, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ br label %iter1.done
+
+iter1.done:
+ ; Select swapped or original values, and swapped keys
+ %vals1 = phi <4 x i32> [ %swapped1, %iter1.do_swap ], [ %values_vec, %iter1.check_eq ], [ %values_vec, %iter1.cmp_y ], [ %values_vec, %entry ]
+ %cnt_eq_out1 = phi i64 [ %cnt_eq0, %iter1.do_swap ], [ %cnt_eq1, %iter1.check_eq ], [ %cnt_eq1, %iter1.cmp_y ], [ %cnt_eq0, %entry ]
+ %cnt_less_out1 = phi i64 [ %cnt_less1, %iter1.do_swap ], [ %cnt_less1, %iter1.check_eq ], [ %cnt_less2, %iter1.cmp_y ], [ %cnt_less0, %entry ]
+ ; Swapped keys: if swap happened, key_a and key_b switch
+ %key_lo1 = phi <2 x float> [ %key_b, %iter1.do_swap ], [ %key_a, %iter1.check_eq ], [ %key_a, %iter1.cmp_y ], [ %key_a, %entry ]
+ %key_hi1 = phi <2 x float> [ %key_a, %iter1.do_swap ], [ %key_b, %iter1.check_eq ], [ %key_b, %iter1.cmp_y ], [ %key_b, %entry ]
+ br i1 %do_iter2, label %iter2.cmp_x, label %iter2.done
+
+; --- Iteration 2: compare key_hi1 (from iter1) vs key_c ---
+iter2.cmp_x:
+ ; PGO: counter for less<float>
+ %cnt_less3 = add i64 %cnt_less_out1, 1
+ store i64 %cnt_less3, ptr addrspace(1) @counter_less, align 8
+ %cx = extractelement <2 x float> %key_c, i64 0
+ %hi1x = extractelement <2 x float> %key_hi1, i64 0
+ %x_less2 = fcmp olt float %hi1x, %cx
+ br i1 %x_less2, label %iter2.do_swap, label %iter2.check_eq
+
+iter2.check_eq:
+ ; PGO: counter for equal_to<float>
+ %cnt_eq2 = add i64 %cnt_eq_out1, 1
+ store i64 %cnt_eq2, ptr addrspace(1) @counter_eq, align 8
+ %x_eq2 = fcmp oeq float %hi1x, %cx
+ br i1 %x_eq2, label %iter2.cmp_y, label %iter2.done
+
+iter2.cmp_y:
+ ; PGO: counter for less<float>
+ %cnt_less4 = add i64 %cnt_less_out1, 2
+ store i64 %cnt_less4, ptr addrspace(1) @counter_less, align 8
+ %hi1y = extractelement <2 x float> %key_hi1, i64 1
+ %cy = extractelement <2 x float> %key_c, i64 1
+ %y_less2 = fcmp olt float %hi1y, %cy
+ br i1 %y_less2, label %iter2.do_swap, label %iter2.done
+
+iter2.do_swap:
+ ; PGO: counter for swap
+ %cnt_swap2 = add i64 %cnt_swap0, 2
+ store i64 %cnt_swap2, ptr addrspace(1) @counter_swap, align 8
+ ; Swap the first two elements of vals1 (the values that resulted from iter1)
+ %swapped2 = shufflevector <4 x i32> %vals1, <4 x i32> poison, <4 x i32> <i32 1, i32 0, i32 2, i32 3>
+ br label %iter2.done
+
+iter2.done:
+ %vals2 = phi <4 x i32> [ %swapped2, %iter2.do_swap ], [ %vals1, %iter2.check_eq ], [ %vals1, %iter2.cmp_y ], [ %vals1, %iter1.done ]
+ store <4 x i32> %vals2, ptr addrspace(1) %out_values, align 16
+ ret void
+}
+
+; After structurizecfg, verify that the shufflevector for iter1's swap
+; is NOT hoisted into iter1.cmp_x. It must stay in iter1.do_swap (or
+; a Flow block gated by the swap predicate) so that the no-swap path
+; never sees the swapped values.
+;
+; CHECK-LABEL: @sort_two_iters
+;
+; The shufflevector must NOT appear in iter1.cmp_x (it would be hoisted there by the bug):
+; CHECK: iter1.cmp_x:
+; CHECK-NOT: shufflevector
+; CHECK: br i1
+;
+; It should remain in iter1.do_swap:
+; CHECK: iter1.do_swap:
+; CHECK: %swapped1 = shufflevector <4 x i32> %values_vec, <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+;
+; The vals1 phi at iter1.done must select between the Flow block result
+; (which should carry both swapped and unswapped correctly) and the
+; entry bypass. It must NOT directly carry the hoisted shufflevector result
+; from a block that's reachable without going through the swap predicate.
+; CHECK: iter1.done:
+; CHECK: %vals1 = phi <4 x i32>
diff --git a/llvm/test/Verifier/atomics.ll b/llvm/test/Verifier/atomics.ll
index 17bf5a0528d73..2b1f971e6e91f 100644
--- a/llvm/test/Verifier/atomics.ll
+++ b/llvm/test/Verifier/atomics.ll
@@ -1,6 +1,6 @@
; RUN: not opt -passes=verify < %s 2>&1 | FileCheck %s
-; CHECK: atomic store operand must have integer, pointer, floating point, or vector type!
-; CHECK: atomic load operand must have integer, pointer, floating point, or vector type!
+; CHECK: atomic store operand must have integer, byte, pointer, floating point, or vector type!
+; CHECK: atomic load operand must have integer, byte, pointer, floating point, or vector type!
%ty = type { i32 };
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 36d1a34a2fcdf..af704d67b91f2 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -827,6 +827,8 @@ def host_unwind_supports_jit():
if "MemoryWithOrigins" in config.llvm_use_sanitizer:
config.available_features.add("use_msan_with_origins")
+if "Undefined" in config.llvm_use_sanitizer:
+ config.available_features.add("ubsan")
# Restrict the size of the on-disk CAS for tests. This allows testing in
# constrained environments (e.g. small TMPDIR). It also prevents leaving
diff --git a/llvm/test/tools/llvm-dwarfdump/X86/callsite-in-lexical-block.s b/llvm/test/tools/llvm-dwarfdump/X86/callsite-in-lexical-block.s
new file mode 100644
index 0000000000000..364e69523da0b
--- /dev/null
+++ b/llvm/test/tools/llvm-dwarfdump/X86/callsite-in-lexical-block.s
@@ -0,0 +1,104 @@
+## Test that llvm-dwarfdump --verify correctly handles DW_TAG_call_site
+## nested inside a DW_TAG_lexical_block. Previously, the parent-walking
+## loop in verifyDebugInfoCallSite() used Die.getParent() instead of
+## Curr.getParent(), causing an infinite loop when the call_site's
+## immediate parent was not a subprogram (e.g., a lexical_block).
+
+# RUN: yaml2obj %s -o - | llvm-dwarfdump --verify - 2>&1 | FileCheck %s
+
+# CHECK: No errors.
+
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_EXEC
+ Machine: EM_X86_64
+DWARF:
+ debug_str:
+ - "callee"
+ - "caller"
+
+ debug_abbrev:
+ - Table:
+ - Code: 0x00000001
+ Tag: DW_TAG_compile_unit
+ Children: DW_CHILDREN_yes
+ Attributes:
+ - Attribute: DW_AT_low_pc
+ Form: DW_FORM_addr
+ - Code: 0x00000002
+ Tag: DW_TAG_subprogram
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_low_pc
+ Form: DW_FORM_addr
+ - Attribute: DW_AT_high_pc
+ Form: DW_FORM_data4
+ - Code: 0x00000003
+ Tag: DW_TAG_subprogram
+ Children: DW_CHILDREN_yes
+ Attributes:
+ - Attribute: DW_AT_name
+ Form: DW_FORM_strp
+ - Attribute: DW_AT_low_pc
+ Form: DW_FORM_addr
+ - Attribute: DW_AT_high_pc
+ Form: DW_FORM_data4
+ - Attribute: DW_AT_call_all_calls
+ Form: DW_FORM_flag_present
+ - Code: 0x00000004
+ Tag: DW_TAG_lexical_block
+ Children: DW_CHILDREN_yes
+ Attributes:
+ - Attribute: DW_AT_low_pc
+ Form: DW_FORM_addr
+ - Attribute: DW_AT_high_pc
+ Form: DW_FORM_data4
+ - Code: 0x00000005
+ Tag: DW_TAG_call_site
+ Children: DW_CHILDREN_no
+ Attributes:
+ - Attribute: DW_AT_call_origin
+ Form: DW_FORM_ref4
+
+ debug_info:
+ - Version: 5
+ UnitType: DW_UT_compile
+ AbbrOffset: 0x00000000
+ AddrSize: 8
+ Entries:
+ # DW_TAG_compile_unit
+ - AbbrCode: 0x00000001
+ Values:
+ - Value: 0x0000000000000000 # DW_AT_low_pc
+ # DW_TAG_subprogram (callee) - DIE offset 0x15, referenced by DW_AT_call_origin below
+ - AbbrCode: 0x00000002
+ Values:
+ - Value: 0x0000000000000000 # DW_AT_name -> "callee"
+ - Value: 0x0000000000003000 # DW_AT_low_pc
+ - Value: 0x0000000000000010 # DW_AT_high_pc
+ # DW_TAG_subprogram (caller, with DW_AT_call_all_calls)
+ - AbbrCode: 0x00000003
+ Values:
+ - Value: 0x0000000000000007 # DW_AT_name -> "caller"
+ - Value: 0x0000000000001000 # DW_AT_low_pc
+ - Value: 0x0000000000000100 # DW_AT_high_pc
+ # DW_TAG_lexical_block (child of caller)
+ - AbbrCode: 0x00000004
+ Values:
+ - Value: 0x0000000000001000 # DW_AT_low_pc
+ - Value: 0x0000000000000080 # DW_AT_high_pc
+ # DW_TAG_call_site (child of lexical_block)
+ - AbbrCode: 0x00000005
+ Values:
+ - Value: 0x0000000000000015 # DW_AT_call_origin -> callee subprogram
+ - AbbrCode: 0x00000000 # end lexical_block
+ Values: []
+ - AbbrCode: 0x00000000 # end caller subprogram
+ Values: []
+ - AbbrCode: 0x00000000 # end compile_unit
+ Values: []
+...
diff --git a/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-bindings.py b/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-bindings.py
deleted file mode 100644
index d3a1cdd6591ad..0000000000000
--- a/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-bindings.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# RUN: env PYTHONPATH=%llvm_lib_dir %python %s %S/../Inputs/input.ll %ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json | FileCheck %s
-
-import sys
-import ir2vec
-
-ll_file = sys.argv[1]
-vocab_path = sys.argv[2]
-
-tool = ir2vec.initEmbedding(filename=ll_file, mode="sym", vocabPath=vocab_path)
-
-if tool is not None:
- print("SUCCESS: Tool initialized")
- print(f"Tool type: {type(tool).__name__}")
-
- # Test getFuncNames
- print("\n=== Function Names ===")
- func_names = tool.getFuncNames()
- for func_name in sorted(func_names):
- print(f"Function: {func_name}")
-
- # Test getFuncEmbMap
- print("\n=== Function Embeddings ===")
- func_emb_map = tool.getFuncEmbMap()
-
- # Sorting the function names for fixed-ordered output
- for func_name in sorted(func_emb_map.keys()):
- emb = func_emb_map[func_name]
- print(f"Function: {func_name}")
- print(f" Embedding: {emb.tolist()}")
-
- # Test getFuncEmb for individual functions
- print("\n=== Single Function Embeddings ===")
-
- # Test valid function names
- for func_name in ["add", "multiply", "conditional"]:
- func_emb = tool.getFuncEmb(func_name)
- print(f"Function: {func_name}")
- print(f" Embedding: {func_emb.tolist()}")
-
- # Test getBBEmbMap
- print("\n=== Basic Block Embeddings ===")
-
- # Test valid function names in sorted order
- for func_name in sorted(["add", "multiply", "conditional"]):
- bb_emb_map = tool.getBBEmbMap(func_name)
- print(f"Function: {func_name}")
- for bb_name in sorted(bb_emb_map.keys()):
- emb = bb_emb_map[bb_name]
- print(f" BB: {bb_name}")
- print(f" Embedding: {emb.tolist()}")
-
- # Test getInstEmbMap
- print("\n=== Instruction Embeddings ===")
-
- # Test valid function names in sorted order
- for func_name in sorted(["add", "multiply", "conditional"]):
- inst_emb_map = tool.getInstEmbMap(func_name)
- print(f"Function: {func_name}")
- for inst_str in sorted(inst_emb_map.keys()):
- emb = inst_emb_map[inst_str]
- print(f" Inst: {inst_str}")
- print(f" Embedding: {emb.tolist()}")
-
-# CHECK: SUCCESS: Tool initialized
-# CHECK: Tool type: IR2VecTool
-# CHECK: === Function Names ===
-# CHECK: Function: add
-# CHECK: Function: conditional
-# CHECK: Function: multiply
-# CHECK-NOT: Function: external_func
-# CHECK: === Function Embeddings ===
-# CHECK: Function: add
-# CHECK-NEXT: Embedding: [38.0, 40.0, 42.0]
-# CHECK: Function: conditional
-# CHECK-NEXT: Embedding: [413.20000000298023, 421.20000000298023, 429.20000000298023]
-# CHECK: Function: multiply
-# CHECK-NEXT: Embedding: [50.0, 52.0, 54.0]
-# CHECK: === Single Function Embeddings ===
-# CHECK: Function: add
-# CHECK-NEXT: Embedding: [38.0, 40.0, 42.0]
-# CHECK: Function: multiply
-# CHECK-NEXT: Embedding: [50.0, 52.0, 54.0]
-# CHECK: Function: conditional
-# CHECK-NEXT: Embedding: [413.20000000298023, 421.20000000298023, 429.20000000298023]
-# CHECK: === Basic Block Embeddings ===
-# CHECK: Function: add
-# CHECK: BB: entry
-# CHECK-NEXT: Embedding: [38.0, 40.0, 42.0]
-# CHECK: Function: conditional
-# CHECK: BB: entry
-# CHECK-NEXT: Embedding: [161.20000000298023, 163.20000000298023, 165.20000000298023]
-# CHECK: BB: exit
-# CHECK-NEXT: Embedding: [164.0, 166.0, 168.0]
-# CHECK: BB: negative
-# CHECK-NEXT: Embedding: [47.0, 49.0, 51.0]
-# CHECK: BB: positive
-# CHECK-NEXT: Embedding: [41.0, 43.0, 45.0]
-# CHECK: Function: multiply
-# CHECK: BB: entry
-# CHECK-NEXT: Embedding: [50.0, 52.0, 54.0]
-# CHECK: === Instruction Embeddings ===
-# CHECK: Function: add
-# CHECK: Inst: %sum = add i32 %a, %b
-# CHECK-NEXT: Embedding: [37.0, 38.0, 39.0]
-# CHECK: Inst: ret i32 %sum
-# CHECK-NEXT: Embedding: [1.0, 2.0, 3.0]
-# CHECK: Function: conditional
-# CHECK: Inst: %cmp = icmp sgt i32 %n, 0
-# CHECK-NEXT: Embedding: [157.20000000298023, 158.20000000298023, 159.20000000298023]
-# CHECK: Inst: %neg_val = sub i32 %n, 10
-# CHECK-NEXT: Embedding: [43.0, 44.0, 45.0]
-# CHECK: Inst: %pos_val = add i32 %n, 10
-# CHECK-NEXT: Embedding: [37.0, 38.0, 39.0]
-# CHECK: Inst: %result = phi i32 [ %pos_val, %positive ], [ %neg_val, %negative ]
-# CHECK-NEXT: Embedding: [163.0, 164.0, 165.0]
-# CHECK: Inst: br i1 %cmp, label %positive, label %negative
-# CHECK-NEXT: Embedding: [4.0, 5.0, 6.0]
-# CHECK: Inst: br label %exit
-# CHECK-NEXT: Embedding: [4.0, 5.0, 6.0]
-# CHECK: Inst: ret i32 %result
-# CHECK-NEXT: Embedding: [1.0, 2.0, 3.0]
-# CHECK: Function: multiply
-# CHECK: Inst: %prod = mul i32 %x, %y
-# CHECK-NEXT: Embedding: [49.0, 50.0, 51.0]
-# CHECK: Inst: ret i32 %prod
-# CHECK-NEXT: Embedding: [1.0, 2.0, 3.0]
diff --git a/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-exceptions.py b/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-exceptions.py
deleted file mode 100644
index af96be07c2364..0000000000000
--- a/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-exceptions.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# RUN: env PYTHONPATH=%llvm_lib_dir %python %s | FileCheck %s
-
-import ir2vec
-
-
-def test_invalid_file():
- """Test that invalid file path raises ValueError"""
- try:
- tool = ir2vec.initEmbedding(
- filename="/this/does/not/exist.ll",
- mode="sym",
- vocabPath="/also/fake/vocab.json",
- )
- return "FAIL: No exception raised"
- except ValueError as e:
- return f"PASS: ValueError raised - {str(e)[:40]}"
- except Exception as e:
- return f"FAIL: Wrong exception - {type(e).__name__}"
-
-
-def test_empty_filename():
- """Test that empty filename raises ValueError"""
- try:
- tool = ir2vec.initEmbedding(filename="", mode="sym", vocabPath="dummy.json")
- return "FAIL: No exception raised"
- except ValueError:
- return "PASS: ValueError raised for empty filename"
- except Exception as e:
- return f"FAIL: Wrong exception - {type(e).__name__}"
-
-
-result1 = test_invalid_file()
-print(f"Test 1: {result1}")
-# CHECK: Test 1: PASS: ValueError raised
-
-result2 = test_empty_filename()
-print(f"Test 2: {result2}")
-# CHECK: Test 2: PASS: ValueError raised
diff --git a/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getBBEmbMap.py b/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getBBEmbMap.py
new file mode 100644
index 0000000000000..333feadc6c932
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getBBEmbMap.py
@@ -0,0 +1,25 @@
+# RUN: env PYTHONPATH=%llvm_lib_dir %python %s %S/../Inputs/input.ll %ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json | FileCheck %s
+
+import sys
+import ir2vec
+
+ll_file = sys.argv[1]
+vocab_path = sys.argv[2]
+
+tool = ir2vec.initEmbedding(filename=ll_file, mode="sym", vocabPath=vocab_path)
+
+# Success case
+bb_map = tool.getBBEmbMap("conditional")
+for bb in sorted(bb_map.keys()):
+ print(f"BB: {bb}, EMB: {bb_map[bb].tolist()}")
+# CHECK: BB: entry, EMB: [161.20000000298023, 163.20000000298023, 165.20000000298023]
+# CHECK: BB: exit, EMB: [164.0, 166.0, 168.0]
+# CHECK: BB: negative, EMB: [47.0, 49.0, 51.0]
+# CHECK: BB: positive, EMB: [41.0, 43.0, 45.0]
+
+# Error: Function not found
+try:
+ tool.getBBEmbMap("nonexistent")
+except ValueError:
+ print("ERROR: Function not found")
+# CHECK: ERROR: Function not found
diff --git a/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getFuncEmb.py b/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getFuncEmb.py
new file mode 100644
index 0000000000000..61b9464c89757
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getFuncEmb.py
@@ -0,0 +1,21 @@
+# RUN: env PYTHONPATH=%llvm_lib_dir %python %s %S/../Inputs/input.ll %ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json | FileCheck %s
+
+import sys
+import ir2vec
+
+ll_file = sys.argv[1]
+vocab_path = sys.argv[2]
+
+tool = ir2vec.initEmbedding(filename=ll_file, mode="sym", vocabPath=vocab_path)
+
+# Success case
+emb = tool.getFuncEmb("add")
+print(f"SUCCESS: {emb.tolist()}")
+# CHECK: SUCCESS: [38.0, 40.0, 42.0]
+
+# Error: Function not found
+try:
+ tool.getFuncEmb("nonexistent")
+except ValueError:
+ print("ERROR: Function not found")
+# CHECK: ERROR: Function not found
diff --git a/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getFuncEmbMap.py b/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getFuncEmbMap.py
new file mode 100644
index 0000000000000..7600d5e4a2986
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getFuncEmbMap.py
@@ -0,0 +1,18 @@
+# RUN: env PYTHONPATH=%llvm_lib_dir %python %s %S/../Inputs/input.ll %ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json | FileCheck %s
+
+import sys
+import ir2vec
+
+ll_file = sys.argv[1]
+vocab_path = sys.argv[2]
+
+tool = ir2vec.initEmbedding(filename=ll_file, mode="sym", vocabPath=vocab_path)
+
+# Success case
+emb_map = tool.getFuncEmbMap()
+for name in sorted(emb_map.keys()):
+ print(f"FUNC: {name}, EMB: {emb_map[name].tolist()}")
+
+# CHECK: FUNC: add, EMB: [38.0, 40.0, 42.0]
+# CHECK: FUNC: conditional, EMB: [413.20000000298023, 421.20000000298023, 429.20000000298023]
+# CHECK: FUNC: multiply, EMB: [50.0, 52.0, 54.0]
diff --git a/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getFuncNames.py b/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getFuncNames.py
new file mode 100644
index 0000000000000..432d80e97edb9
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getFuncNames.py
@@ -0,0 +1,19 @@
+# RUN: env PYTHONPATH=%llvm_lib_dir %python %s %S/../Inputs/input.ll %ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json | FileCheck %s
+
+import sys
+import ir2vec
+
+ll_file = sys.argv[1]
+vocab_path = sys.argv[2]
+
+tool = ir2vec.initEmbedding(filename=ll_file, mode="sym", vocabPath=vocab_path)
+
+# Success case
+func_names = tool.getFuncNames()
+for name in sorted(func_names):
+ print(f"FUNC: {name}")
+
+# CHECK: FUNC: add
+# CHECK: FUNC: conditional
+# CHECK: FUNC: multiply
+# CHECK-NOT: FUNC: external_func
diff --git a/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getInstEmbMap.py b/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getInstEmbMap.py
new file mode 100644
index 0000000000000..3157ae34cfd3c
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-getInstEmbMap.py
@@ -0,0 +1,27 @@
+# RUN: env PYTHONPATH=%llvm_lib_dir %python %s %S/../Inputs/input.ll %ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json | FileCheck %s
+
+import sys
+import ir2vec
+
+ll_file = sys.argv[1]
+vocab_path = sys.argv[2]
+
+tool = ir2vec.initEmbedding(filename=ll_file, mode="sym", vocabPath=vocab_path)
+
+# Success case
+inst_map = tool.getInstEmbMap("add")
+for inst in sorted(inst_map.keys()):
+ print(f"INST: {inst}")
+ print(f" EMB: {inst_map[inst].tolist()}")
+
+# CHECK: INST: %sum = add i32 %a, %b
+# CHECK: EMB: [37.0, 38.0, 39.0]
+# CHECK: INST: ret i32 %sum
+# CHECK: EMB: [1.0, 2.0, 3.0]
+
+# Error: Function not found
+try:
+ tool.getInstEmbMap("nonexistent")
+except ValueError:
+ print("ERROR: Function not found")
+# CHECK: ERROR: Function not found
diff --git a/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-initEmbedding.py b/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-initEmbedding.py
new file mode 100644
index 0000000000000..c19935a0c6b7d
--- /dev/null
+++ b/llvm/test/tools/llvm-ir2vec/bindings/ir2vec-initEmbedding.py
@@ -0,0 +1,62 @@
+# RUN: env PYTHONPATH=%llvm_lib_dir %python %s %S/../Inputs/input.ll %ir2vec_test_vocab_dir/dummy_3D_nonzero_opc_vocab.json | FileCheck %s
+
+import sys
+import ir2vec
+
+ll_file = sys.argv[1]
+vocab_path = sys.argv[2]
+
+# Success case
+tool = ir2vec.initEmbedding(filename=ll_file, mode="sym", vocabPath=vocab_path)
+print(f"SUCCESS: {type(tool).__name__}")
+# CHECK: SUCCESS: IR2VecTool
+
+# Error: Invalid mode
+try:
+ ir2vec.initEmbedding(filename=ll_file, mode="invalid", vocabPath=vocab_path)
+except ValueError:
+ print("ERROR: Invalid mode")
+# CHECK: ERROR: Invalid mode
+
+# Error: Empty vocab path
+try:
+ ir2vec.initEmbedding(filename=ll_file, mode="sym", vocabPath="")
+except ValueError:
+ print("ERROR: Empty vocab path")
+# CHECK: ERROR: Empty vocab path
+
+# Error: Invalid file
+try:
+ ir2vec.initEmbedding(filename="/bad.ll", mode="sym", vocabPath=vocab_path)
+except ValueError:
+ print("ERROR: Invalid file")
+# CHECK: ERROR: Invalid file
+
+# Error: Empty filename
+try:
+ ir2vec.initEmbedding(filename="", mode="sym", vocabPath=vocab_path)
+except ValueError:
+ print("ERROR: Empty filename")
+# CHECK: ERROR: Empty filename
+
+# Error: Invalid vocab file
+try:
+ ir2vec.initEmbedding(filename=ll_file, mode="sym", vocabPath="/bad.json")
+except ValueError:
+ print("ERROR: Invalid vocab")
+# CHECK: ERROR: Invalid vocab
+
+# Error: Malformed JSON vocab
+import tempfile
+import os
+
+with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
+ f.write("{ this is not valid json }")
+ bad_vocab = f.name
+try:
+ ir2vec.initEmbedding(filename=ll_file, mode="sym", vocabPath=bad_vocab)
+except ValueError:
+ print("ERROR: Invalid vocab file")
+finally:
+ os.unlink(bad_vocab)
+# CHECK: ERROR: Invalid vocab file
diff --git a/llvm/test/tools/llvm-ir2vec/entities.ll b/llvm/test/tools/llvm-ir2vec/entities.ll
index 6b2e2e4db2ff6..002614164c128 100644
--- a/llvm/test/tools/llvm-ir2vec/entities.ll
+++ b/llvm/test/tools/llvm-ir2vec/entities.ll
@@ -1,6 +1,6 @@
; RUN: llvm-ir2vec entities | FileCheck %s
-CHECK: 111
+CHECK: 112
CHECK-NEXT: Ret 0
CHECK-NEXT: UncondBr 1
CHECK-NEXT: CondBr 2
@@ -77,38 +77,39 @@ CHECK-NEXT: MetadataTy 72
CHECK-NEXT: VectorTy 73
CHECK-NEXT: TokenTy 74
CHECK-NEXT: IntegerTy 75
-CHECK-NEXT: FunctionTy 76
-CHECK-NEXT: PointerTy 77
-CHECK-NEXT: StructTy 78
-CHECK-NEXT: ArrayTy 79
-CHECK-NEXT: UnknownTy 80
-CHECK-NEXT: Function 81
-CHECK-NEXT: Pointer 82
-CHECK-NEXT: Constant 83
-CHECK-NEXT: Variable 84
-CHECK-NEXT: FCMP_false 85
-CHECK-NEXT: FCMP_oeq 86
-CHECK-NEXT: FCMP_ogt 87
-CHECK-NEXT: FCMP_oge 88
-CHECK-NEXT: FCMP_olt 89
-CHECK-NEXT: FCMP_ole 90
-CHECK-NEXT: FCMP_one 91
-CHECK-NEXT: FCMP_ord 92
-CHECK-NEXT: FCMP_uno 93
-CHECK-NEXT: FCMP_ueq 94
-CHECK-NEXT: FCMP_ugt 95
-CHECK-NEXT: FCMP_uge 96
-CHECK-NEXT: FCMP_ult 97
-CHECK-NEXT: FCMP_ule 98
-CHECK-NEXT: FCMP_une 99
-CHECK-NEXT: FCMP_true 100
-CHECK-NEXT: ICMP_eq 101
-CHECK-NEXT: ICMP_ne 102
-CHECK-NEXT: ICMP_ugt 103
-CHECK-NEXT: ICMP_uge 104
-CHECK-NEXT: ICMP_ult 105
-CHECK-NEXT: ICMP_ule 106
-CHECK-NEXT: ICMP_sgt 107
-CHECK-NEXT: ICMP_sge 108
-CHECK-NEXT: ICMP_slt 109
-CHECK-NEXT: ICMP_sle 110
+CHECK-NEXT: ByteTy 76
+CHECK-NEXT: FunctionTy 77
+CHECK-NEXT: PointerTy 78
+CHECK-NEXT: StructTy 79
+CHECK-NEXT: ArrayTy 80
+CHECK-NEXT: UnknownTy 81
+CHECK-NEXT: Function 82
+CHECK-NEXT: Pointer 83
+CHECK-NEXT: Constant 84
+CHECK-NEXT: Variable 85
+CHECK-NEXT: FCMP_false 86
+CHECK-NEXT: FCMP_oeq 87
+CHECK-NEXT: FCMP_ogt 88
+CHECK-NEXT: FCMP_oge 89
+CHECK-NEXT: FCMP_olt 90
+CHECK-NEXT: FCMP_ole 91
+CHECK-NEXT: FCMP_one 92
+CHECK-NEXT: FCMP_ord 93
+CHECK-NEXT: FCMP_uno 94
+CHECK-NEXT: FCMP_ueq 95
+CHECK-NEXT: FCMP_ugt 96
+CHECK-NEXT: FCMP_uge 97
+CHECK-NEXT: FCMP_ult 98
+CHECK-NEXT: FCMP_ule 99
+CHECK-NEXT: FCMP_une 100
+CHECK-NEXT: FCMP_true 101
+CHECK-NEXT: ICMP_eq 102
+CHECK-NEXT: ICMP_ne 103
+CHECK-NEXT: ICMP_ugt 104
+CHECK-NEXT: ICMP_uge 105
+CHECK-NEXT: ICMP_ult 106
+CHECK-NEXT: ICMP_ule 107
+CHECK-NEXT: ICMP_sgt 108
+CHECK-NEXT: ICMP_sge 109
+CHECK-NEXT: ICMP_slt 110
+CHECK-NEXT: ICMP_sle 111
diff --git a/llvm/test/tools/llvm-ir2vec/triplets.ll b/llvm/test/tools/llvm-ir2vec/triplets.ll
index 7632e236c4107..45a158c926900 100644
--- a/llvm/test/tools/llvm-ir2vec/triplets.ll
+++ b/llvm/test/tools/llvm-ir2vec/triplets.ll
@@ -26,40 +26,40 @@ entry:
; TRIPLETS: MAX_RELATION=3
; TRIPLETS-NEXT: 13 75 0
-; TRIPLETS-NEXT: 13 84 2
-; TRIPLETS-NEXT: 13 84 3
+; TRIPLETS-NEXT: 13 85 2
+; TRIPLETS-NEXT: 13 85 3
; TRIPLETS-NEXT: 13 0 1
; TRIPLETS-NEXT: 0 70 0
-; TRIPLETS-NEXT: 0 84 2
+; TRIPLETS-NEXT: 0 85 2
; TRIPLETS-NEXT: 17 75 0
-; TRIPLETS-NEXT: 17 84 2
-; TRIPLETS-NEXT: 17 84 3
+; TRIPLETS-NEXT: 17 85 2
+; TRIPLETS-NEXT: 17 85 3
; TRIPLETS-NEXT: 17 0 1
; TRIPLETS-NEXT: 0 70 0
-; TRIPLETS-NEXT: 0 84 2
-; TRIPLETS-NEXT: 31 77 0
-; TRIPLETS-NEXT: 31 83 2
+; TRIPLETS-NEXT: 0 85 2
+; TRIPLETS-NEXT: 31 78 0
+; TRIPLETS-NEXT: 31 84 2
; TRIPLETS-NEXT: 31 31 1
-; TRIPLETS-NEXT: 31 77 0
-; TRIPLETS-NEXT: 31 83 2
+; TRIPLETS-NEXT: 31 78 0
+; TRIPLETS-NEXT: 31 84 2
; TRIPLETS-NEXT: 31 33 1
; TRIPLETS-NEXT: 33 70 0
-; TRIPLETS-NEXT: 33 84 2
-; TRIPLETS-NEXT: 33 82 3
+; TRIPLETS-NEXT: 33 85 2
+; TRIPLETS-NEXT: 33 83 3
; TRIPLETS-NEXT: 33 33 1
; TRIPLETS-NEXT: 33 70 0
-; TRIPLETS-NEXT: 33 84 2
-; TRIPLETS-NEXT: 33 82 3
+; TRIPLETS-NEXT: 33 85 2
+; TRIPLETS-NEXT: 33 83 3
; TRIPLETS-NEXT: 33 32 1
; TRIPLETS-NEXT: 32 75 0
-; TRIPLETS-NEXT: 32 82 2
+; TRIPLETS-NEXT: 32 83 2
; TRIPLETS-NEXT: 32 32 1
; TRIPLETS-NEXT: 32 75 0
-; TRIPLETS-NEXT: 32 82 2
+; TRIPLETS-NEXT: 32 83 2
; TRIPLETS-NEXT: 32 13 1
; TRIPLETS-NEXT: 13 75 0
-; TRIPLETS-NEXT: 13 84 2
-; TRIPLETS-NEXT: 13 84 3
+; TRIPLETS-NEXT: 13 85 2
+; TRIPLETS-NEXT: 13 85 3
; TRIPLETS-NEXT: 13 0 1
; TRIPLETS-NEXT: 0 70 0
-; TRIPLETS-NEXT: 0 84 2
+; TRIPLETS-NEXT: 0 85 2
diff --git a/llvm/test/tools/llvm-mc/x86-asm-syntax.test b/llvm/test/tools/llvm-mc/x86-asm-syntax.test
new file mode 100644
index 0000000000000..2452d572701f8
--- /dev/null
+++ b/llvm/test/tools/llvm-mc/x86-asm-syntax.test
@@ -0,0 +1,10 @@
+# REQUIRES: x86-registered-target
+# RUN: llvm-mc -triple=x86_64 -x86-asm-syntax=intel %s | FileCheck %s --check-prefix=INTEL
+# RUN: llvm-mc -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=0 %s | FileCheck %s --check-prefix=ATT
+
+## -x86-asm-syntax=intel without --output-asm-variant defaults to Intel output.
+# INTEL: mov ebx, eax
+
+## Explicit --output-asm-variant=0 overrides to AT&T output.
+# ATT: movl %eax, %ebx
+mov ebx, eax
diff --git a/llvm/tools/llvm-c-test/echo.cpp b/llvm/tools/llvm-c-test/echo.cpp
index 2f29933c8fe44..7b045314207b8 100644
--- a/llvm/tools/llvm-c-test/echo.cpp
+++ b/llvm/tools/llvm-c-test/echo.cpp
@@ -89,6 +89,8 @@ struct TypeCloner {
return LLVMPPCFP128TypeInContext(Ctx);
case LLVMLabelTypeKind:
return LLVMLabelTypeInContext(Ctx);
+ case LLVMByteTypeKind:
+ return LLVMByteTypeInContext(Ctx, LLVMGetByteTypeWidth(Src));
case LLVMIntegerTypeKind:
return LLVMIntTypeInContext(Ctx, LLVMGetIntTypeWidth(Src));
case LLVMFunctionTypeKind: {
@@ -568,10 +570,10 @@ struct FunCloner {
}
case LLVMCondBr: {
LLVMValueRef Cond = LLVMGetCondition(Src);
- LLVMValueRef Else = LLVMGetOperand(Src, 1);
- LLVMBasicBlockRef ElseBB = DeclareBB(LLVMValueAsBasicBlock(Else));
- LLVMValueRef Then = LLVMGetOperand(Src, 2);
+ LLVMValueRef Then = LLVMGetOperand(Src, 1);
LLVMBasicBlockRef ThenBB = DeclareBB(LLVMValueAsBasicBlock(Then));
+ LLVMValueRef Else = LLVMGetOperand(Src, 2);
+ LLVMBasicBlockRef ElseBB = DeclareBB(LLVMValueAsBasicBlock(Else));
Dst = LLVMBuildCondBr(Builder, CloneValue(Cond), ThenBB, ElseBB);
break;
}
diff --git a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp
index 9f9d59acba214..e78edf854d17c 100644
--- a/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp
+++ b/llvm/tools/llvm-gpu-loader/llvm-gpu-loader.cpp
@@ -268,7 +268,9 @@ int main(int argc, const char **argv, const char **envp) {
void *DevEnvp = copyEnvironment(envp, Device);
void *DevRet;
+ int Zero = 0;
OFFLOAD_ERR(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, sizeof(int), &DevRet));
+ OFFLOAD_ERR(olMemcpy(Queue, DevRet, Device, &Zero, Host, sizeof(int)));
ol_kernel_launch_size_args_t BeginLaunch{1, {1, 1, 1}, {1, 1, 1}, 0};
BeginArgs BeginArgs = {DevArgc, DevArgv, DevEnvp};
@@ -291,6 +293,7 @@ int main(int argc, const char **argv, const char **envp) {
OFFLOAD_ERR(olMemcpy(Queue, &Ret, Host, DevRet, Device, sizeof(int)));
OFFLOAD_ERR(olSyncQueue(Queue));
+ OFFLOAD_ERR(olMemFree(DevRet));
OFFLOAD_ERR(olMemFree(DevArgv));
OFFLOAD_ERR(olMemFree(DevEnvp));
OFFLOAD_ERR(olDestroyQueue(Queue));
diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp
index f6fc41d750b50..b264f9e0d3e93 100644
--- a/llvm/tools/llvm-link/llvm-link.cpp
+++ b/llvm/tools/llvm-link/llvm-link.cpp
@@ -75,9 +75,12 @@ static cl::opt<std::string>
OutputFilename("o", cl::desc("Override output filename"), cl::init("-"),
cl::value_desc("filename"), cl::cat(LinkCategory));
-static cl::opt<bool> Internalize("internalize",
- cl::desc("Internalize linked symbols"),
- cl::cat(LinkCategory));
+static cl::opt<bool>
+ Internalize("internalize",
+ cl::desc("Internalize linked symbols - maintains existing "
+ "linkage for the first input and converts linkage in"
+ " all other inputs to `internal`"),
+ cl::cat(LinkCategory));
static cl::opt<bool>
DisableDITypeMap("disable-debug-info-type-map",
diff --git a/llvm/tools/llvm-mc/llvm-mc.cpp b/llvm/tools/llvm-mc/llvm-mc.cpp
index f166143b756e8..be5d524886978 100644
--- a/llvm/tools/llvm-mc/llvm-mc.cpp
+++ b/llvm/tools/llvm-mc/llvm-mc.cpp
@@ -585,14 +585,17 @@ int main(int argc, char **argv) {
TheTarget->createNullTargetStreamer(*FFS);
Str = std::move(FFS);
} else if (FileType == OFT_AssemblyFile) {
- IP.reset(TheTarget->createMCInstPrinter(
- Triple(TripleName), OutputAsmVariant, *MAI, *MCII, *MRI));
+ unsigned AsmVariant = OutputAsmVariant.getNumOccurrences()
+ ? OutputAsmVariant
+ : MAI->getAssemblerDialect();
+ IP.reset(TheTarget->createMCInstPrinter(Triple(TripleName), AsmVariant,
+ *MAI, *MCII, *MRI));
if (!IP) {
WithColor::error()
<< "unable to create instruction printer for target triple '"
- << TheTriple.normalize() << "' with assembly variant "
- << OutputAsmVariant << ".\n";
+ << TheTriple.normalize() << "' with assembly variant " << AsmVariant
+ << "\n";
return 1;
}
diff --git a/llvm/unittests/ADT/CMakeLists.txt b/llvm/unittests/ADT/CMakeLists.txt
index af503d9b82843..2c82dbbb46fac 100644
--- a/llvm/unittests/ADT/CMakeLists.txt
+++ b/llvm/unittests/ADT/CMakeLists.txt
@@ -65,6 +65,7 @@ add_llvm_unittest(ADTTests
PriorityWorklistTest.cpp
RadixTreeTest.cpp
RangeAdapterTest.cpp
+ RepeatedTest.cpp
RewriteBufferTest.cpp
SCCIteratorTest.cpp
STLExtrasTest.cpp
diff --git a/llvm/unittests/ADT/RepeatedTest.cpp b/llvm/unittests/ADT/RepeatedTest.cpp
new file mode 100644
index 0000000000000..f55be4b22ee5f
--- /dev/null
+++ b/llvm/unittests/ADT/RepeatedTest.cpp
@@ -0,0 +1,100 @@
+//===- RepeatedTest.cpp - Repeated unit tests -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Repeated.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+using ::testing::Each;
+using ::testing::ElementsAre;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
+
+namespace llvm {
+namespace {
+
+TEST(RepeatedTest, Construction) {
+ {
+ Repeated<int> Rep(5, 42);
+ EXPECT_EQ(Rep.value(), 42);
+ EXPECT_THAT(Rep, SizeIs(5));
+ EXPECT_EQ(Rep[0], 42);
+ EXPECT_EQ(Rep[4], 42);
+ EXPECT_THAT(Rep, ElementsAre(42, 42, 42, 42, 42));
+ }
+
+ {
+ Repeated<std::string> Rep(3, "hello");
+ EXPECT_EQ(Rep.value(), "hello");
+ EXPECT_THAT(Rep, SizeIs(3));
+ }
+
+ {
+ // Move-only type.
+ Repeated<std::unique_ptr<int>> Rep(1, std::make_unique<int>(42));
+ EXPECT_EQ(*Rep.value(), 42);
+ EXPECT_THAT(Rep, SizeIs(1));
+ }
+
+ {
+ // Empty Rep.
+ Repeated<int> EmptyRep(0, 42);
+ EXPECT_THAT(EmptyRep, IsEmpty());
+ }
+}
+
+TEST(RepeatedTest, CTAD) {
+ static_assert(std::is_same_v<decltype(Repeated(3, 42)), Repeated<int>>);
+ std::string S = "world";
+ Repeated RepStr(2, S);
+ static_assert(std::is_same_v<decltype(RepStr), Repeated<std::string>>);
+ static_assert(
+ std::is_same_v<decltype(Repeated(1, "literal")), Repeated<const char *>>);
+ SUCCEED();
+}
+
+TEST(RepeatedTest, IteratorRandomAccess) {
+ Repeated<int> Rep(10, 7);
+ RepeatedIterator<int> It = Rep.begin();
+
+ EXPECT_EQ(*It, 7);
+ EXPECT_EQ(*(It + 5), 7);
+
+ It += 10;
+ EXPECT_EQ(It, Rep.end());
+ --It;
+ EXPECT_LT(It, Rep.end());
+ EXPECT_EQ(Rep.end() - Rep.begin(), 10);
+ ++It;
+ EXPECT_EQ(It, Rep.end());
+}
+
+TEST(RepeatedTest, ReverseIterator) {
+ Repeated<int> Rep(5, 42);
+ std::vector<int> Reversed(Rep.rbegin(), Rep.rend());
+ EXPECT_THAT(Reversed, SizeIs(5));
+ EXPECT_THAT(Reversed, Each(42));
+}
+
+TEST(RepeatedTest, IteratorTraits) {
+ using It = RepeatedIterator<int>;
+ static_assert(std::is_default_constructible_v<It>);
+ static_assert(std::is_same_v<std::iterator_traits<It>::iterator_category,
+ std::random_access_iterator_tag>);
+ static_assert(std::is_same_v<std::iterator_traits<It>::value_type, int>);
+ static_assert(
+ std::is_same_v<std::iterator_traits<It>::difference_type, ptrdiff_t>);
+ SUCCEED();
+}
+
+} // anonymous namespace
+} // namespace llvm
diff --git a/llvm/unittests/Analysis/BasicAliasAnalysisTest.cpp b/llvm/unittests/Analysis/BasicAliasAnalysisTest.cpp
index bae1f1c508af3..0541f257b20d2 100644
--- a/llvm/unittests/Analysis/BasicAliasAnalysisTest.cpp
+++ b/llvm/unittests/Analysis/BasicAliasAnalysisTest.cpp
@@ -79,6 +79,7 @@ TEST_F(BasicAATest, AliasInstWithObjectOfImpreciseSize) {
BasicBlock *Entry(BasicBlock::Create(C, "", F));
B.SetInsertPoint(Entry);
+ B.CreateRetVoid();
Value *IncomingI32Ptr = F->arg_begin();
@@ -119,6 +120,7 @@ TEST_F(BasicAATest, AliasInstWithFullObjectOfImpreciseSize) {
AllocaInst *I8 = B.CreateAlloca(B.getInt8Ty(), B.getInt32(2));
auto *I8AtUncertainOffset =
cast<GetElementPtrInst>(B.CreatePtrAdd(I8, ArbitraryI32));
+ B.CreateRetVoid();
auto &AllAnalyses = setupAnalyses();
BasicAAResult &BasicAA = AllAnalyses.BAA;
diff --git a/llvm/unittests/Analysis/CFGTest.cpp b/llvm/unittests/Analysis/CFGTest.cpp
index 352ee6a1bf43e..7b61413cc48e6 100644
--- a/llvm/unittests/Analysis/CFGTest.cpp
+++ b/llvm/unittests/Analysis/CFGTest.cpp
@@ -392,11 +392,12 @@ TEST_F(IsPotentiallyReachableTest, BranchInsideLoop) {
TEST_F(IsPotentiallyReachableTest, ModifyTest) {
ParseAssembly(BranchInsideLoopIR);
- succ_iterator S = succ_begin(&*++M->getFunction("test")->begin());
- BasicBlock *OldBB = S[0];
- S[0] = S[1];
+ BasicBlock *LoopBB = &*++M->getFunction("test")->begin();
+ auto *T = cast<CondBrInst>(LoopBB->getTerminator());
+ BasicBlock *OldBB = T->getSuccessor(0);
+ T->setSuccessor(0, T->getSuccessor(1));
ExpectPath(false);
- S[0] = OldBB;
+ T->setSuccessor(0, OldBB);
ExpectPath(true);
}
diff --git a/llvm/unittests/Analysis/DomTreeUpdaterTest.cpp b/llvm/unittests/Analysis/DomTreeUpdaterTest.cpp
index 1a8160f14a0a8..4fe3e701d9f83 100644
--- a/llvm/unittests/Analysis/DomTreeUpdaterTest.cpp
+++ b/llvm/unittests/Analysis/DomTreeUpdaterTest.cpp
@@ -776,6 +776,7 @@ TEST(DomTreeUpdater, LazyUpdateDeduplicationTest) {
// CFG Change: remove bb0 -> bb1.
EXPECT_EQ(BB0->getTerminator()->getNumSuccessors(), 1u);
BB0->getTerminator()->eraseFromParent();
+ new UnreachableInst(Context, BB0);
// Update the DTU and simulate invalid updates.
DTU.applyUpdatesPermissive({{DominatorTree::Delete, BB0, BB1},
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp b/llvm/unittests/Analysis/IR2VecTest.cpp
index 694d04b22b720..f86eee2117e47 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -363,8 +363,8 @@ TEST_F(IR2VecTestFixture, GetInstVec_Symbolic) {
EXPECT_EQ(AddEmb.size(), 2u);
EXPECT_EQ(RetEmb.size(), 2u);
- EXPECT_TRUE(AddEmb.approximatelyEquals(Embedding(2, 25.9)));
- EXPECT_TRUE(RetEmb.approximatelyEquals(Embedding(2, 15.7)));
+ EXPECT_TRUE(AddEmb.approximatelyEquals(Embedding(2, 26.1)));
+ EXPECT_TRUE(RetEmb.approximatelyEquals(Embedding(2, 15.8)));
}
TEST_F(IR2VecTestFixture, GetInstVec_FlowAware) {
@@ -376,8 +376,8 @@ TEST_F(IR2VecTestFixture, GetInstVec_FlowAware) {
EXPECT_EQ(AddEmb.size(), 2u);
EXPECT_EQ(RetEmb.size(), 2u);
- EXPECT_TRUE(AddEmb.approximatelyEquals(Embedding(2, 25.9)));
- EXPECT_TRUE(RetEmb.approximatelyEquals(Embedding(2, 33.1)));
+ EXPECT_TRUE(AddEmb.approximatelyEquals(Embedding(2, 26.1)));
+ EXPECT_TRUE(RetEmb.approximatelyEquals(Embedding(2, 33.3)));
}
TEST_F(IR2VecTestFixture, GetBBVector_Symbolic) {
@@ -387,9 +387,9 @@ TEST_F(IR2VecTestFixture, GetBBVector_Symbolic) {
const auto &BBVec = Emb->getBBVector(*BB);
EXPECT_EQ(BBVec.size(), 2u);
- // BB vector should be sum of add and ret: {25.9, 25.9} + {15.7, 15.7} =
- // {41.6, 41.6}
- EXPECT_TRUE(BBVec.approximatelyEquals(Embedding(2, 41.6)));
+ // BB vector should be sum of add and ret: {26.1, 26.1} + {15.8, 15.8} =
+ // {41.9, 41.9}
+ EXPECT_TRUE(BBVec.approximatelyEquals(Embedding(2, 41.9)));
}
TEST_F(IR2VecTestFixture, GetBBVector_FlowAware) {
@@ -399,9 +399,9 @@ TEST_F(IR2VecTestFixture, GetBBVector_FlowAware) {
const auto &BBVec = Emb->getBBVector(*BB);
EXPECT_EQ(BBVec.size(), 2u);
- // BB vector should be sum of add and ret: {25.9, 25.9} + {33.1, 33.1} =
- // {59.0, 59.0}
- EXPECT_TRUE(BBVec.approximatelyEquals(Embedding(2, 59.0)));
+ // BB vector should be sum of add and ret: {26.1, 26.1} + {33.3, 33.3} =
+ // {59.4, 59.4}
+ EXPECT_TRUE(BBVec.approximatelyEquals(Embedding(2, 59.4)));
}
TEST_F(IR2VecTestFixture, GetFunctionVector_Symbolic) {
@@ -412,8 +412,8 @@ TEST_F(IR2VecTestFixture, GetFunctionVector_Symbolic) {
EXPECT_EQ(FuncVec.size(), 2u);
- // Function vector should match BB vector (only one BB): {41.6, 41.6}
- EXPECT_TRUE(FuncVec.approximatelyEquals(Embedding(2, 41.6)));
+ // Function vector should match BB vector (only one BB): {41.9, 41.9}
+ EXPECT_TRUE(FuncVec.approximatelyEquals(Embedding(2, 41.9)));
}
TEST_F(IR2VecTestFixture, GetFunctionVector_FlowAware) {
@@ -423,8 +423,8 @@ TEST_F(IR2VecTestFixture, GetFunctionVector_FlowAware) {
const auto &FuncVec = Emb->getFunctionVector();
EXPECT_EQ(FuncVec.size(), 2u);
- // Function vector should match BB vector (only one BB): {59.0, 59.0}
- EXPECT_TRUE(FuncVec.approximatelyEquals(Embedding(2, 59.0)));
+ // Function vector should match BB vector (only one BB): {59.4, 59.4}
+ EXPECT_TRUE(FuncVec.approximatelyEquals(Embedding(2, 59.4)));
}
TEST_F(IR2VecTestFixture, MultipleComputeEmbeddingsConsistency_Symbolic) {
@@ -480,6 +480,7 @@ static constexpr unsigned MaxPredicateKinds = Vocabulary::MaxPredicateKinds;
// names and their canonical string keys.
#define IR2VEC_HANDLE_TYPE_BIMAP(X) \
X(VoidTyID, VoidTy, "VoidTy") \
+ X(ByteTyID, ByteTy, "ByteTy") \
X(IntegerTyID, IntegerTy, "IntegerTy") \
X(FloatTyID, FloatTy, "FloatTy") \
X(PointerTyID, PointerTy, "PointerTy") \
diff --git a/llvm/unittests/Analysis/MemorySSATest.cpp b/llvm/unittests/Analysis/MemorySSATest.cpp
index a2e4f99a07d22..38c2e41776ed8 100644
--- a/llvm/unittests/Analysis/MemorySSATest.cpp
+++ b/llvm/unittests/Analysis/MemorySSATest.cpp
@@ -91,6 +91,7 @@ TEST_F(MemorySSATest, CreateALoad) {
B.CreateStore(B.getInt8(16), PointerArg);
UncondBrInst::Create(Merge, Left);
UncondBrInst::Create(Merge, Right);
+ ReturnInst::Create(C, Merge);
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -129,6 +130,7 @@ TEST_F(MemorySSATest, CreateLoadsAndStoreUpdater) {
B.CreateBr(Merge);
B.SetInsertPoint(Right);
B.CreateBr(Merge);
+ ReturnInst::Create(C, Merge);
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -218,6 +220,7 @@ TEST_F(MemorySSATest, CreateALoadUpdater) {
B.CreateBr(Merge);
B.SetInsertPoint(Right);
B.CreateBr(Merge);
+ ReturnInst::Create(C, Merge);
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -261,6 +264,7 @@ TEST_F(MemorySSATest, SinkLoad) {
B.CreateBr(Merge);
B.SetInsertPoint(Right);
B.CreateBr(Merge);
+ ReturnInst::Create(C, Merge);
// Load in left block
B.SetInsertPoint(Left, Left->begin());
@@ -312,6 +316,8 @@ TEST_F(MemorySSATest, MoveAStore) {
UncondBrInst::Create(Merge, Right);
B.SetInsertPoint(Merge);
B.CreateLoad(B.getInt8Ty(), PointerArg);
+ B.CreateRetVoid();
+
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
MemorySSAUpdater Updater(&MSSA);
@@ -347,6 +353,8 @@ TEST_F(MemorySSATest, MoveAStoreUpdater) {
UncondBrInst::Create(Merge, Right);
B.SetInsertPoint(Merge);
auto *MergeLoad = B.CreateLoad(B.getInt8Ty(), PointerArg);
+ B.CreateRetVoid();
+
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
MemorySSAUpdater Updater(&MSSA);
@@ -392,6 +400,8 @@ TEST_F(MemorySSATest, MoveAStoreUpdaterMove) {
UncondBrInst::Create(Merge, Right);
B.SetInsertPoint(Merge);
auto *MergeLoad = B.CreateLoad(B.getInt8Ty(), PointerArg);
+ B.CreateRetVoid();
+
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
MemorySSAUpdater Updater(&MSSA);
@@ -435,6 +445,8 @@ TEST_F(MemorySSATest, MoveAStoreAllAround) {
UncondBrInst::Create(Merge, Right);
B.SetInsertPoint(Merge);
auto *MergeLoad = B.CreateLoad(B.getInt8Ty(), PointerArg);
+ B.CreateRetVoid();
+
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
MemorySSAUpdater Updater(&MSSA);
@@ -488,6 +500,7 @@ TEST_F(MemorySSATest, RemoveAPhi) {
UncondBrInst::Create(Merge, Right);
B.SetInsertPoint(Merge);
LoadInst *LoadInst = B.CreateLoad(B.getInt8Ty(), PointerArg);
+ B.CreateRetVoid();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -532,6 +545,7 @@ TEST_F(MemorySSATest, RemoveMemoryAccess) {
UncondBrInst::Create(Merge, Right);
B.SetInsertPoint(Merge);
LoadInst *LoadInst = B.CreateLoad(B.getInt8Ty(), PointerArg);
+ B.CreateRetVoid();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -598,6 +612,7 @@ TEST_F(MemorySSATest, TestTripleStore) {
StoreInst *S1 = B.CreateStore(ConstantInt::get(Int8, 0), Alloca);
StoreInst *S2 = B.CreateStore(ConstantInt::get(Int8, 1), Alloca);
StoreInst *S3 = B.CreateStore(ConstantInt::get(Int8, 2), Alloca);
+ B.CreateRetVoid();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -628,6 +643,7 @@ TEST_F(MemorySSATest, TestStoreAndLoad) {
Value *Alloca = B.CreateAlloca(Int8, ConstantInt::get(Int8, 1), "A");
Instruction *SI = B.CreateStore(ConstantInt::get(Int8, 0), Alloca);
Instruction *LI = B.CreateLoad(Int8, Alloca);
+ B.CreateRetVoid();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -657,6 +673,7 @@ TEST_F(MemorySSATest, TestStoreDoubleQuery) {
Type *Int8 = Type::getInt8Ty(C);
Value *Alloca = B.CreateAlloca(Int8, ConstantInt::get(Int8, 1), "A");
StoreInst *SI = B.CreateStore(ConstantInt::get(Int8, 0), Alloca);
+ B.CreateRetVoid();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -721,6 +738,7 @@ TEST_F(MemorySSATest, PartialWalkerCacheWithPhis) {
B.SetInsertPoint(IfEnd);
Instruction *BelowPhi = B.CreateStore(Zero, AllocA);
+ B.CreateRetVoid();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -768,6 +786,7 @@ TEST_F(MemorySSATest, WalkerInvariantLoadOpt) {
Instruction *Store = B.CreateStore(One, AllocA);
Instruction *Load = B.CreateLoad(Int8, AllocA);
+ B.CreateRetVoid();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -797,6 +816,7 @@ TEST_F(MemorySSATest, WalkerReopt) {
Value *AllocaB = B.CreateAlloca(Int8, ConstantInt::get(Int8, 1), "B");
Instruction *SIB = B.CreateStore(ConstantInt::get(Int8, 0), AllocaB);
Instruction *LIA = B.CreateLoad(Int8, AllocaA);
+ B.CreateRetVoid();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -835,6 +855,7 @@ TEST_F(MemorySSATest, MoveAboveMemoryDef) {
StoreInst *StoreC = B.CreateStore(ConstantInt::get(Int8, 4), C);
StoreInst *StoreA2 = B.CreateStore(ConstantInt::get(Int8, 4), A);
LoadInst *LoadC = B.CreateLoad(Int8, C);
+ B.CreateRetVoid();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -891,6 +912,7 @@ TEST_F(MemorySSATest, Irreducible) {
B.SetInsertPoint(LoopMainBB);
B.CreateCondBr(B.getTrue(), LoopStartBB, AfterLoopBB);
B.SetInsertPoint(AfterLoopBB);
+ B.CreateRetVoid();
Argument *FirstArg = &*F->arg_begin();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -923,6 +945,7 @@ TEST_F(MemorySSATest, MoveToBeforeLiveOnEntryInvalidatesCache) {
Value *A = B.CreateAlloca(B.getInt8Ty());
StoreInst *StoreA = B.CreateStore(B.getInt8(0), A);
StoreInst *StoreB = B.CreateStore(B.getInt8(0), A);
+ B.CreateRetVoid();
setupAnalyses();
@@ -969,6 +992,7 @@ TEST_F(MemorySSATest, RemovingDefInvalidatesCache) {
StoreInst *StoreX1 = B.CreateStore(B.getInt8(0), X);
StoreInst *StoreY = B.CreateStore(B.getInt8(0), Y);
StoreInst *StoreX2 = B.CreateStore(B.getInt8(0), X);
+ B.CreateRetVoid();
setupAnalyses();
@@ -1005,6 +1029,7 @@ TEST_F(MemorySSATest, TestStoreMustAlias) {
StoreInst *SB2 = B.CreateStore(ConstantInt::get(Int8, 2), AllocaB);
StoreInst *SA3 = B.CreateStore(ConstantInt::get(Int8, 3), AllocaA);
StoreInst *SB3 = B.CreateStore(ConstantInt::get(Int8, 3), AllocaB);
+ B.CreateRetVoid();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -1056,6 +1081,7 @@ TEST_F(MemorySSATest, TestStoreMayAlias) {
StoreInst *SC2 = B.CreateStore(ConstantInt::get(Int8, 5), AllocaC);
// Store into arg2, must alias store to arg2 => must
StoreInst *SB3 = B.CreateStore(ConstantInt::get(Int8, 6), PointerB);
+ B.CreateRetVoid();
std::initializer_list<StoreInst *> Sts = {SA1, SB1, SC1, SA2, SB2, SC2, SB3};
setupAnalyses();
@@ -1126,6 +1152,7 @@ TEST_F(MemorySSATest, LifetimeMarkersAreClobbers) {
Instruction *FooStore = B.CreateStore(B.getInt8(0), Foo);
Instruction *BarStore = B.CreateStore(B.getInt8(0), Bar);
Instruction *BazMemSet = B.CreateMemSet(Baz, B.getInt8(0), 1, Align(1));
+ B.CreateRetVoid();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -1236,6 +1263,7 @@ TEST_F(MemorySSATest, TestOptimizedDefsAreProperUses) {
StoreInst *StoreA = B.CreateStore(ConstantInt::get(Int8, 0), AllocA);
StoreInst *StoreB = B.CreateStore(ConstantInt::get(Int8, 1), AllocB);
StoreInst *StoreA2 = B.CreateStore(ConstantInt::get(Int8, 2), AllocA);
+ B.CreateRetVoid();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -1319,6 +1347,7 @@ TEST_F(MemorySSATest, TestAddedEdgeToBlockWithPhiNotOpt) {
UncondBrInst::Create(Exit, Body);
B.SetInsertPoint(Exit);
StoreInst *S1 = B.CreateStore(B.getInt8(16), PointerArg);
+ B.CreateRetVoid();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -1380,6 +1409,7 @@ TEST_F(MemorySSATest, TestAddedEdgeToBlockWithPhiOpt) {
B.SetInsertPoint(Exit);
StoreInst *S2 = B.CreateStore(B.getInt8(16), PointerArg);
+ B.CreateRetVoid();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -1452,6 +1482,7 @@ TEST_F(MemorySSATest, TestAddedEdgeToBlockWithNoPhiAddNewPhis) {
B.SetInsertPoint(FBlock);
B.CreateStore(B.getInt8(16), PointerArg);
UncondBrInst::Create(EBlock, FBlock);
+ ReturnInst::Create(C, EBlock);
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -1486,6 +1517,7 @@ TEST_F(MemorySSATest, TestCallClobber) {
Instruction *StorePointer1 = B.CreateStore(B.getInt8(0), Pointer1);
Instruction *StorePointer2 = B.CreateStore(B.getInt8(0), Pointer2);
Instruction *MemSet = B.CreateMemSet(Pointer2, B.getInt8(0), 1, Align(1));
+ B.CreateRetVoid();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
@@ -1519,6 +1551,7 @@ TEST_F(MemorySSATest, TestLoadClobber) {
B.CreateLoad(B.getInt8Ty(), Pointer1, /* Volatile */ true);
Instruction *LoadPointer2 =
B.CreateLoad(B.getInt8Ty(), Pointer2, /* Volatile */ true);
+ B.CreateRetVoid();
setupAnalyses();
MemorySSA &MSSA = *Analyses->MSSA;
diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
index 5d7ecbce73750..eab4f88c7fbf7 100644
--- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
+++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp
@@ -7555,6 +7555,186 @@ TEST_F(OpenMPIRBuilderTest, CreateTaskIfCondition) {
EXPECT_EQ(OulinedFnCall->getNextNode(), TaskCompleteCall);
}
+TEST_F(OpenMPIRBuilderTest, CreateTaskAffinity) {
+ using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+ OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.Config.IsTargetDevice = false;
+ OMPBuilder.initialize();
+ F->setName("func");
+ IRBuilder<> Builder(BB);
+
+ auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
+ return Error::success();
+ };
+
+ LLVMContext &Ctx = M->getContext();
+ StructType *AffInfoTy = StructType::get(
+ Type::getInt64Ty(Ctx), Type::getInt64Ty(Ctx), Type::getInt32Ty(Ctx));
+
+ // Create [1 x AffInfoTy] as alloca (element alloca is fine too).
+ Value *CountI32 = Builder.getInt32(1);
+ AllocaInst *AffArr =
+ Builder.CreateAlloca(AffInfoTy, Builder.getInt64(1), "omp.affinity_list");
+
+ // Fill entry 0 minimally so the pointer definitely dominates use.
+ Value *Entry0 = Builder.CreateInBoundsGEP(
+ AffInfoTy, AffArr, Builder.getInt64(0), "omp.affinity.entry");
+ Builder.CreateStore(Builder.getInt64(0),
+ Builder.CreateStructGEP(AffInfoTy, Entry0, 0));
+ Builder.CreateStore(Builder.getInt64(64),
+ Builder.CreateStructGEP(AffInfoTy, Entry0, 1));
+ Builder.CreateStore(Builder.getInt32(0),
+ Builder.CreateStructGEP(AffInfoTy, Entry0, 2));
+
+ OpenMPIRBuilder::AffinityData Affinity{CountI32, AffArr};
+
+ BasicBlock *AllocaBB = Builder.GetInsertBlock();
+ BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "alloca.split");
+ OpenMPIRBuilder::LocationDescription Loc(
+ InsertPointTy(BodyBB, BodyBB->getFirstInsertionPt()), DL);
+
+ ASSERT_EXPECTED_INIT(
+ OpenMPIRBuilder::InsertPointTy, AfterIP,
+ OMPBuilder.createTask(
+ Loc, InsertPointTy(AllocaBB, AllocaBB->getFirstInsertionPt()),
+ BodyGenCB,
+ /*Tied=*/true,
+ /*Final=*/nullptr,
+ /*IfCondition=*/nullptr,
+ /*Dependencies=*/{},
+ /*Affinity=*/Affinity,
+ /*Mergeable=*/false,
+ /*EventHandle=*/nullptr,
+ /*Priority=*/nullptr));
+
+ Builder.restoreIP(AfterIP);
+ OMPBuilder.finalize();
+ Builder.CreateRetVoid();
+
+ EXPECT_FALSE(verifyModule(*M, &errs()));
+
+ Function *TaskAllocFn =
+ OMPBuilder.getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
+ Function *RegAffFn = OMPBuilder.getOrCreateRuntimeFunctionPtr(
+ OMPRTL___kmpc_omp_reg_task_with_affinity);
+
+ CallInst *TaskAllocCI = nullptr;
+ CallInst *RegAffCI = nullptr;
+
+ for (auto &I : instructions(F)) {
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ if (CI->getCalledFunction() == TaskAllocFn)
+ TaskAllocCI = CI;
+ if (CI->getCalledFunction() == RegAffFn)
+ RegAffCI = CI;
+ }
+ }
+
+ ASSERT_NE(TaskAllocCI, nullptr) << "expected __kmpc_omp_task_alloc call";
+ ASSERT_NE(RegAffCI, nullptr)
+ << "expected __kmpc_omp_reg_task_with_affinity call";
+
+ // Check reg_task_with_affinity signature:
+ // i32 __kmpc_omp_reg_task_with_affinity(ident_t*, i32 gtid,
+ // kmp_task_t*, i32 naffins,
+ // kmp_task_affinity_info_t*)
+ ASSERT_EQ(RegAffCI->arg_size(), 5u);
+ // naffins
+ EXPECT_TRUE(RegAffCI->getArgOperand(3)->getType()->isIntegerTy(32));
+ // kmp_task_affinity_info_t*
+ EXPECT_TRUE(RegAffCI->getArgOperand(4)->getType()->isPointerTy());
+}
+
+TEST_F(OpenMPIRBuilderTest, CreateIteratorLoop) {
+ using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+ {
+ OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.initialize();
+ F->setName("func.unterminated");
+ IRBuilder<> Builder(BB);
+
+ auto BodyGenCB = [&](InsertPointTy BodyIP, Value *LinearIV) -> Error {
+ Builder.restoreIP(BodyIP);
+ Builder.CreateAdd(LinearIV, Builder.getInt64(1));
+ return Error::success();
+ };
+
+ OpenMPIRBuilder::LocationDescription Loc(Builder.saveIP(), DL);
+ ASSERT_EXPECTED_INIT(InsertPointTy, AfterIP,
+ OMPBuilder.createIteratorLoop(Loc, Builder.getInt64(4),
+ BodyGenCB, "iterator"));
+
+ Builder.restoreIP(AfterIP);
+ Builder.CreateRetVoid();
+
+ EXPECT_EQ(AfterIP.getBlock()->getName(), "omp.it.cont");
+ EXPECT_FALSE(verifyFunction(*F, &errs()));
+ }
+
+ {
+ Function *F2 =
+ Function::Create(F->getFunctionType(), Function::ExternalLinkage,
+ "func.terminated", M.get());
+ BasicBlock *BB2 = BasicBlock::Create(Ctx, "", F2);
+ OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.initialize();
+ IRBuilder<> Builder(BB2);
+
+ BasicBlock *OrigSucc =
+ BasicBlock::Create(Builder.getContext(), "orig.succ", F2);
+ Builder.CreateBr(OrigSucc);
+
+ auto BodyGenCB = [&](InsertPointTy BodyIP, Value *LinearIV) -> Error {
+ Builder.restoreIP(BodyIP);
+ Builder.CreateAdd(LinearIV, Builder.getInt64(1));
+ return Error::success();
+ };
+
+ OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB2, BB2->end()),
+ DL);
+ ASSERT_EXPECTED_INIT(InsertPointTy, AfterIP,
+ OMPBuilder.createIteratorLoop(Loc, Builder.getInt64(4),
+ BodyGenCB, "iterator"));
+
+ EXPECT_EQ(AfterIP.getBlock()->getName(), "omp.it.cont");
+ auto *ContBr = dyn_cast<BranchInst>(AfterIP.getBlock()->getTerminator());
+ ASSERT_NE(ContBr, nullptr);
+ ASSERT_FALSE(ContBr->isConditional());
+ EXPECT_EQ(ContBr->getSuccessor(0), OrigSucc);
+
+ Builder.SetInsertPoint(OrigSucc);
+ Builder.CreateRetVoid();
+
+ EXPECT_FALSE(verifyFunction(*F2, &errs()));
+ }
+
+ EXPECT_FALSE(verifyModule(*M, &errs()));
+}
+
+TEST_F(OpenMPIRBuilderTest, CreateIteratorLoopInvalidLoopBody) {
+ using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
+ OpenMPIRBuilder OMPBuilder(*M);
+ OMPBuilder.initialize();
+ F->setName("func");
+ IRBuilder<> Builder(BB);
+
+ auto BodyGenCB = [&](InsertPointTy BodyIP, Value *LinearIV) -> Error {
+ Builder.restoreIP(BodyIP);
+ Builder.CreateAdd(LinearIV, Builder.getInt64(1));
+ BasicBlock *BadDest =
+ BasicBlock::Create(Builder.getContext(), "iterator.bad.dest", F);
+ Builder.CreateBr(BadDest);
+ Builder.SetInsertPoint(BadDest);
+ Builder.CreateUnreachable();
+ return Error::success();
+ };
+
+ OpenMPIRBuilder::LocationDescription Loc(Builder.saveIP(), DL);
+ OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createIteratorLoop(
+ Loc, Builder.getInt64(4), BodyGenCB, "iterator");
+ ASSERT_TRUE(errorToBool(AfterIP.takeError()));
+}
+
TEST_F(OpenMPIRBuilderTest, CreateTaskgroup) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index 21daa5967108c..8c4daf56bbfa4 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -1415,4 +1415,25 @@ TEST_F(IRBuilderTest, finalizeSubprogram) {
EXPECT_EQ(BarSP->getRetainedNodes()[0], Type);
EXPECT_TRUE(FooSP->getRetainedNodes().empty());
}
+
+TEST_F(IRBuilderTest, CreateAggregateRet) {
+ IRBuilder<> Builder(BB);
+ // Terminate the function/block created in SetUp.
+ Builder.CreateRetVoid();
+
+ Type *AggType =
+ StructType::create(Ctx, {Builder.getInt8Ty(), Builder.getInt64Ty()});
+ ConstantInt *RV0 = Builder.getInt8(5);
+ ConstantInt *RV1 = Builder.getInt64(55);
+
+ FunctionType *FTy = FunctionType::get(AggType, /*isVarArg=*/false);
+
+ Function *F1 =
+ Function::Create(FTy, Function::ExternalLinkage, "F2", M.get());
+ BasicBlock *CalleeBB = BasicBlock::Create(Ctx, "", F1);
+ IRBuilder<> CalleeBuilder(CalleeBB);
+ CalleeBuilder.CreateAggregateRet({RV0, RV1});
+
+ EXPECT_FALSE(verifyModule(*M));
+}
}
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
index ca596f7d7ba68..27f28dfe886a8 100644
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -186,18 +186,18 @@ TEST(InstructionsTest, CondBrInst) {
EXPECT_EQ(One, b1->getCondition());
++b;
- // check ELSE
- EXPECT_EQ(bb1, *b);
- EXPECT_EQ(bb1, b1->getOperand(1));
- EXPECT_EQ(bb1, b1->getSuccessor(1));
- ++b;
-
// check THEN
EXPECT_EQ(bb0, *b);
- EXPECT_EQ(bb0, b1->getOperand(2));
+ EXPECT_EQ(bb0, b1->getOperand(1));
EXPECT_EQ(bb0, b1->getSuccessor(0));
++b;
+ // check ELSE
+ EXPECT_EQ(bb1, *b);
+ EXPECT_EQ(bb1, b1->getOperand(2));
+ EXPECT_EQ(bb1, b1->getSuccessor(1));
+ ++b;
+
EXPECT_EQ(b1->op_end(), b);
// clean up
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index 271a06290cc1e..01f3a6b914b0f 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -3120,7 +3120,7 @@ define void @foo(i1 %cond0, i1 %cond2) {
// Check successors().
EXPECT_EQ(range_size(Br0->successors()), 2u);
unsigned SuccIdx = 0;
- SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB1, BB2});
+ SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB2, BB1});
for (sandboxir::BasicBlock *Succ : Br0->successors())
EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
@@ -3159,7 +3159,7 @@ define void @foo(i1 %cond0, i1 %cond2) {
EXPECT_TRUE(Br->isConditional());
EXPECT_EQ(Br->getCondition(), Cond0);
unsigned SuccIdx = 0;
- SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB2, BB1});
+ SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB1, BB2});
for (sandboxir::BasicBlock *Succ : Br->successors())
EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
EXPECT_EQ(Br->getNextNode(), Ret1);
@@ -3171,7 +3171,7 @@ define void @foo(i1 %cond0, i1 %cond2) {
EXPECT_TRUE(Br->isConditional());
EXPECT_EQ(Br->getCondition(), Cond0);
unsigned SuccIdx = 0;
- SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB2, BB1});
+ SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB1, BB2});
for (sandboxir::BasicBlock *Succ : Br->successors())
EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
EXPECT_EQ(Br->getPrevNode(), Ret2);
diff --git a/llvm/unittests/SandboxIR/TrackerTest.cpp b/llvm/unittests/SandboxIR/TrackerTest.cpp
index 9c18247b6b96d..7a69ef004ec98 100644
--- a/llvm/unittests/SandboxIR/TrackerTest.cpp
+++ b/llvm/unittests/SandboxIR/TrackerTest.cpp
@@ -131,7 +131,7 @@ define void @foo(i1 %cond) {
auto *Br = cast<sandboxir::BranchInst>(&*It++);
unsigned SuccIdx = 0;
- SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB2, BB1});
+ SmallVector<sandboxir::BasicBlock *> ExpectedSuccs({BB1, BB2});
for (auto *Succ : Br->successors())
EXPECT_EQ(Succ, ExpectedSuccs[SuccIdx++]);
diff --git a/llvm/unittests/Support/YAMLIOTest.cpp b/llvm/unittests/Support/YAMLIOTest.cpp
index 7446c07ccb9a8..ff5744bc7fa54 100644
--- a/llvm/unittests/Support/YAMLIOTest.cpp
+++ b/llvm/unittests/Support/YAMLIOTest.cpp
@@ -78,7 +78,7 @@ namespace yaml {
TEST(YAMLIO, TestMapRead) {
FooBar doc;
{
- Input yin("---\nfoo: 3\nbar: 5\n...\n");
+ Input yin("---\nfoo: 3\nbar: 5\n...");
yin >> doc;
EXPECT_FALSE(yin.error());
diff --git a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
index 593991c71d706..81982d0217f71 100644
--- a/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
+++ b/llvm/unittests/Target/AMDGPU/AMDGPUUnitTests.cpp
@@ -168,6 +168,12 @@ static void testDynamicVGPRLimits(StringRef CPUName, StringRef FS,
<< CPUName << " dynamic VGPR block size " << DynamicVGPRBlockSize
<< ":\nOcc MinVGPR MaxVGPR\n"
<< Table.str() << '\n';
+ // In dVGPR mode, max VGPR limits do not depend on occupancy:
+ EXPECT_EQ(ST.getMaxNumVGPRs(1, DynamicVGPRBlockSize),
+ ST.getMaxNumVGPRs(ST.getMaxWavesPerEU(), DynamicVGPRBlockSize));
+ EXPECT_EQ(ST.getMinNumVGPRs(1, DynamicVGPRBlockSize), 0u);
+ EXPECT_EQ(ST.getMinNumVGPRs(ST.getMaxWavesPerEU(), DynamicVGPRBlockSize),
+ 0u);
};
testWithBlockSize(16);
diff --git a/llvm/unittests/Target/DirectX/ResourceBindingAnalysisTests.cpp b/llvm/unittests/Target/DirectX/ResourceBindingAnalysisTests.cpp
index 3211c2c702aab..65e1beadf3e38 100644
--- a/llvm/unittests/Target/DirectX/ResourceBindingAnalysisTests.cpp
+++ b/llvm/unittests/Target/DirectX/ResourceBindingAnalysisTests.cpp
@@ -71,7 +71,7 @@ TEST_F(ResourceBindingAnalysisTest, TestOverlap) {
StringRef Assembly = R"(
define void @main() {
entry:
- %handleA = call target("dx.RawBuffer", float, 0, 0) @llvm.dx.resource.handlefrombinding(i32 2, i32 0, i32 -1, i32 100, ptr null)
+ %handleA = call target("dx.RawBuffer", float, 0, 0) @llvm.dx.resource.handlefrombinding(i32 2, i32 0, i32 0, i32 100, ptr null)
%handleB = call target("dx.RawBuffer", float, 0, 0) @llvm.dx.resource.handlefrombinding(i32 2, i32 4, i32 1, i32 0, ptr null)
ret void
}
diff --git a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
index 49373d367783b..400f23a932540 100644
--- a/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
+++ b/llvm/unittests/Transforms/IPO/FunctionSpecializationTest.cpp
@@ -207,7 +207,7 @@ TEST_F(FunctionSpecializationTest, SwitchInst) {
EXPECT_TRUE(Test > 0);
}
-TEST_F(FunctionSpecializationTest, BranchInst) {
+TEST_F(FunctionSpecializationTest, CondBrInst) {
const char *ModuleString = R"(
define void @foo(i32 %a, i32 %b, i1 %cond) {
entry:
diff --git a/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp b/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp
index 88eaa875a803a..6e3e71e5abb45 100644
--- a/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp
+++ b/llvm/unittests/Transforms/Scalar/LoopPassManagerTest.cpp
@@ -919,7 +919,7 @@ TEST_F(LoopPassManagerTest, LoopChildInsertion) {
const char *Name, BasicBlock *BB) {
auto *Cond = new LoadInst(Type::getInt1Ty(Context), &Ptr, Name,
/*isVolatile*/ true, BB);
- BranchInst::Create(TrueBB, FalseBB, Cond, BB);
+ CondBrInst::Create(Cond, TrueBB, FalseBB, BB);
};
// Build the pass managers and register our pipeline. We build a single loop
@@ -969,10 +969,10 @@ TEST_F(LoopPassManagerTest, LoopChildInsertion) {
NewLoop01LatchBB =
BasicBlock::Create(Context, "loop.0.1.latch", &F, &Loop02PHBB);
Loop01BB.getTerminator()->replaceUsesOfWith(&Loop01BB, NewLoop010PHBB);
- BranchInst::Create(NewLoop010BB, NewLoop010PHBB);
+ UncondBrInst::Create(NewLoop010BB, NewLoop010PHBB);
CreateCondBr(NewLoop01LatchBB, NewLoop010BB, "cond.0.1.0",
NewLoop010BB);
- BranchInst::Create(&Loop01BB, NewLoop01LatchBB);
+ UncondBrInst::Create(&Loop01BB, NewLoop01LatchBB);
AR.DT.addNewBlock(NewLoop010PHBB, &Loop01BB);
AR.DT.addNewBlock(NewLoop010BB, NewLoop010PHBB);
AR.DT.addNewBlock(NewLoop01LatchBB, NewLoop010BB);
@@ -1012,7 +1012,7 @@ TEST_F(LoopPassManagerTest, LoopChildInsertion) {
auto *NewLoop011BB = BasicBlock::Create(Context, "loop.0.1.1", &F, NewLoop01LatchBB);
NewLoop010BB->getTerminator()->replaceUsesOfWith(NewLoop01LatchBB,
NewLoop011PHBB);
- BranchInst::Create(NewLoop011BB, NewLoop011PHBB);
+ UncondBrInst::Create(NewLoop011BB, NewLoop011PHBB);
CreateCondBr(NewLoop01LatchBB, NewLoop011BB, "cond.0.1.1",
NewLoop011BB);
AR.DT.addNewBlock(NewLoop011PHBB, NewLoop010BB);
@@ -1122,7 +1122,7 @@ TEST_F(LoopPassManagerTest, LoopPeerInsertion) {
const char *Name, BasicBlock *BB) {
auto *Cond = new LoadInst(Type::getInt1Ty(Context), &Ptr, Name,
/*isVolatile*/ true, BB);
- BranchInst::Create(TrueBB, FalseBB, Cond, BB);
+ CondBrInst::Create(Cond, TrueBB, FalseBB, BB);
};
// Build the pass managers and register our pipeline. We build a single loop
@@ -1158,7 +1158,7 @@ TEST_F(LoopPassManagerTest, LoopPeerInsertion) {
L.getParentLoop()->addChildLoop(NewLoop);
auto *NewLoop01PHBB = BasicBlock::Create(Context, "loop.0.1.ph", &F, &Loop02PHBB);
auto *NewLoop01BB = BasicBlock::Create(Context, "loop.0.1", &F, &Loop02PHBB);
- BranchInst::Create(NewLoop01BB, NewLoop01PHBB);
+ UncondBrInst::Create(NewLoop01BB, NewLoop01PHBB);
CreateCondBr(&Loop02PHBB, NewLoop01BB, "cond.0.1", NewLoop01BB);
Loop00BB.getTerminator()->replaceUsesOfWith(&Loop02PHBB, NewLoop01PHBB);
AR.DT.addNewBlock(NewLoop01PHBB, &Loop00BB);
@@ -1216,13 +1216,13 @@ TEST_F(LoopPassManagerTest, LoopPeerInsertion) {
auto *NewLoop04LatchBB =
BasicBlock::Create(Context, "loop.0.4.latch", &F, &Loop0LatchBB);
Loop02BB.getTerminator()->replaceUsesOfWith(&Loop0LatchBB, NewLoop03PHBB);
- BranchInst::Create(NewLoop03BB, NewLoop03PHBB);
+ UncondBrInst::Create(NewLoop03BB, NewLoop03PHBB);
CreateCondBr(NewLoop04PHBB, NewLoop03BB, "cond.0.3", NewLoop03BB);
- BranchInst::Create(NewLoop04BB, NewLoop04PHBB);
+ UncondBrInst::Create(NewLoop04BB, NewLoop04PHBB);
CreateCondBr(&Loop0LatchBB, NewLoop040PHBB, "cond.0.4", NewLoop04BB);
- BranchInst::Create(NewLoop040BB, NewLoop040PHBB);
+ UncondBrInst::Create(NewLoop040BB, NewLoop040PHBB);
CreateCondBr(NewLoop04LatchBB, NewLoop040BB, "cond.0.4.0", NewLoop040BB);
- BranchInst::Create(NewLoop04BB, NewLoop04LatchBB);
+ UncondBrInst::Create(NewLoop04BB, NewLoop04LatchBB);
AR.DT.addNewBlock(NewLoop03PHBB, &Loop02BB);
AR.DT.addNewBlock(NewLoop03BB, NewLoop03PHBB);
AR.DT.addNewBlock(NewLoop04PHBB, NewLoop03BB);
@@ -1280,7 +1280,7 @@ TEST_F(LoopPassManagerTest, LoopPeerInsertion) {
AR.LI.addTopLevelLoop(NewLoop);
auto *NewLoop1PHBB = BasicBlock::Create(Context, "loop.1.ph", &F, &Loop2BB);
auto *NewLoop1BB = BasicBlock::Create(Context, "loop.1", &F, &Loop2BB);
- BranchInst::Create(NewLoop1BB, NewLoop1PHBB);
+ UncondBrInst::Create(NewLoop1BB, NewLoop1PHBB);
CreateCondBr(&Loop2PHBB, NewLoop1BB, "cond.1", NewLoop1BB);
Loop0BB.getTerminator()->replaceUsesOfWith(&Loop2PHBB, NewLoop1PHBB);
AR.DT.addNewBlock(NewLoop1PHBB, &Loop0BB);
@@ -1513,11 +1513,11 @@ TEST_F(LoopPassManagerTest, LoopDeletion) {
BasicBlock::Create(Context, "loop.0.3.ph", &F, &Loop0LatchBB);
auto *NewLoop03BB =
BasicBlock::Create(Context, "loop.0.3", &F, &Loop0LatchBB);
- BranchInst::Create(NewLoop03BB, NewLoop03PHBB);
+ UncondBrInst::Create(NewLoop03BB, NewLoop03PHBB);
auto *Cond =
new LoadInst(Type::getInt1Ty(Context), &Ptr, "cond.0.3",
/*isVolatile*/ true, NewLoop03BB);
- BranchInst::Create(&Loop0LatchBB, NewLoop03BB, Cond, NewLoop03BB);
+ CondBrInst::Create(Cond, &Loop0LatchBB, NewLoop03BB, NewLoop03BB);
Loop02PHBB.getTerminator()->replaceUsesOfWith(&Loop0LatchBB,
NewLoop03PHBB);
AR.DT.addNewBlock(NewLoop03PHBB, &Loop02PHBB);
diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp
index 896e1de8b32f3..8ceeacf132da1 100644
--- a/llvm/unittests/Transforms/Utils/LocalTest.cpp
+++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp
@@ -39,10 +39,10 @@ TEST(Local, RecursivelyDeleteDeadPHINodes) {
builder.SetInsertPoint(bb0);
PHINode *phi = builder.CreatePHI(Type::getInt32Ty(C), 2);
- BranchInst *br0 = builder.CreateCondBr(builder.getTrue(), bb0, bb1);
+ CondBrInst *br0 = builder.CreateCondBr(builder.getTrue(), bb0, bb1);
builder.SetInsertPoint(bb1);
- BranchInst *br1 = builder.CreateBr(bb0);
+ UncondBrInst *br1 = builder.CreateBr(bb0);
phi->addIncoming(phi, bb0);
phi->addIncoming(phi, bb1);
@@ -80,7 +80,7 @@ TEST(Local, RemoveDuplicatePHINodes) {
GlobalValue::ExternalLinkage, "F"));
BasicBlock *Entry(BasicBlock::Create(C, "", F.get()));
BasicBlock *BB(BasicBlock::Create(C, "", F.get()));
- BranchInst::Create(BB, Entry);
+ UncondBrInst::Create(BB, Entry);
B.SetInsertPoint(BB);
@@ -100,7 +100,7 @@ TEST(Local, RemoveDuplicatePHINodes) {
P1->addIncoming(P3, BB);
P2->addIncoming(P4, BB);
- BranchInst::Create(BB, BB);
+ UncondBrInst::Create(BB, BB);
// Verify that we can eliminate PHIs that become duplicates after chaning PHIs
// downstream.
@@ -219,8 +219,7 @@ TEST(Local, MergeBasicBlockIntoOnlyPred) {
BasicBlock *SinglePred = BB->getSinglePredecessor();
if (!SinglePred || SinglePred == BB || BB->hasAddressTaken())
continue;
- BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
- if (Term && !Term->isConditional())
+ if (isa<UncondBrInst>(SinglePred->getTerminator()))
MergeBasicBlockIntoOnlyPred(BB, &DTU);
}
if (DTU.hasDomTree()) {
diff --git a/llvm/unittests/Transforms/Utils/LoopUtilsTest.cpp b/llvm/unittests/Transforms/Utils/LoopUtilsTest.cpp
index a839d75cd6235..4e51061f1cdec 100644
--- a/llvm/unittests/Transforms/Utils/LoopUtilsTest.cpp
+++ b/llvm/unittests/Transforms/Utils/LoopUtilsTest.cpp
@@ -89,9 +89,8 @@ TEST(LoopUtils, DeleteDeadLoopNest) {
Function::iterator FI = F.begin();
BasicBlock *Entry = &*(FI++);
assert(Entry->getName() == "entry" && "Expecting BasicBlock entry");
- const BranchInst *BI = dyn_cast<BranchInst>(Entry->getTerminator());
+ const UncondBrInst *BI = dyn_cast<UncondBrInst>(Entry->getTerminator());
assert(BI && "Expecting valid branch instruction");
- EXPECT_EQ(BI->getNumSuccessors(), (unsigned)1);
EXPECT_EQ(BI->getSuccessor(0)->getName(), "for.end");
});
}
diff --git a/llvm/unittests/Transforms/Utils/ProfDataUtilTest.cpp b/llvm/unittests/Transforms/Utils/ProfDataUtilTest.cpp
index 46a3ecd5d3aa3..e345c07a8c3d2 100644
--- a/llvm/unittests/Transforms/Utils/ProfDataUtilTest.cpp
+++ b/llvm/unittests/Transforms/Utils/ProfDataUtilTest.cpp
@@ -44,7 +44,7 @@ define void @foo(i1 %cond0) {
Function *F = M->getFunction("foo");
auto &Entry = F->getEntryBlock();
auto &I = Entry.front();
- auto *Branch = dyn_cast<BranchInst>(&I);
+ auto *Branch = dyn_cast<CondBrInst>(&I);
EXPECT_NE(nullptr, Branch);
auto *ProfileData = Branch->getMetadata(LLVMContext::MD_prof);
EXPECT_NE(ProfileData, nullptr);
@@ -72,7 +72,7 @@ define void @foo(i1 %cond0) {
Function *F = M->getFunction("foo");
auto &Entry = F->getEntryBlock();
auto &I = Entry.front();
- auto *Branch = dyn_cast<BranchInst>(&I);
+ auto *Branch = dyn_cast<CondBrInst>(&I);
EXPECT_NE(nullptr, Branch);
auto *ProfileData = Branch->getMetadata(LLVMContext::MD_prof);
EXPECT_EQ(ProfileData, nullptr);
diff --git a/llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp b/llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp
index 503d3a0f9bfa9..0c7720e9c4f1f 100644
--- a/llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp
+++ b/llvm/unittests/Transforms/Utils/SSAUpdaterBulkTest.cpp
@@ -44,6 +44,7 @@ TEST(SSAUpdaterBulk, SimpleMerge) {
// %6 = add i32 %3, 6
// %7 = add i32 %2, %4
// %8 = sub i32 %2, %4
+ // ret void
Argument *FirstArg = &*(F->arg_begin());
BasicBlock *IfBB = BasicBlock::Create(C, "if", F);
BasicBlock *TrueBB = BasicBlock::Create(C, "true", F);
@@ -68,6 +69,7 @@ TEST(SSAUpdaterBulk, SimpleMerge) {
auto *I2 = cast<Instruction>(B.CreateAdd(AddOp2, ConstantInt::get(I32Ty, 6)));
auto *I3 = cast<Instruction>(B.CreateAdd(SubOp1, SubOp2));
auto *I4 = cast<Instruction>(B.CreateSub(SubOp1, SubOp2));
+ B.CreateRetVoid();
// Now rewrite uses in instructions %5, %6, %7. They need to use a phi, which
// SSAUpdater should insert into %merge.
diff --git a/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp b/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp
index e97d00dda7f8c..268b9313da882 100644
--- a/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp
+++ b/llvm/unittests/Transforms/Utils/ScalarEvolutionExpanderTest.cpp
@@ -82,7 +82,7 @@ TEST_F(ScalarEvolutionExpanderTest, ExpandPtrTypeSCEV) {
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", F);
BasicBlock *LoopBB = BasicBlock::Create(Context, "loop", F);
BasicBlock *ExitBB = BasicBlock::Create(Context, "exit", F);
- BranchInst::Create(LoopBB, EntryBB);
+ UncondBrInst::Create(LoopBB, EntryBB);
ReturnInst::Create(Context, nullptr, ExitBB);
// loop: ; preds = %loop, %entry
@@ -95,8 +95,8 @@ TEST_F(ScalarEvolutionExpanderTest, ExpandPtrTypeSCEV) {
// br i1 undef, label %loop, label %exit
const DataLayout &DL = F->getDataLayout();
- BranchInst *Br = BranchInst::Create(
- LoopBB, ExitBB, PoisonValue::get(Type::getInt1Ty(Context)), LoopBB);
+ CondBrInst *Br = CondBrInst::Create(
+ PoisonValue::get(Type::getInt1Ty(Context)), LoopBB, ExitBB, LoopBB);
AllocaInst *Alloca = new AllocaInst(I32Ty, DL.getAllocaAddrSpace(), "alloca",
Br->getIterator());
ConstantInt *Ci32 = ConstantInt::get(Context, APInt(32, 1));
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
index 3a585e958c3f3..472c04b17863b 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
+++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h
@@ -76,7 +76,7 @@ class VPlanTestIRBase : public testing::Test {
{}, PSE);
VPlanTransforms::handleEarlyExits(*Plan, HasUncountableExit);
- VPlanTransforms::addMiddleCheck(*Plan, true, false);
+ VPlanTransforms::addMiddleCheck(*Plan, false);
VPlanTransforms::createLoopRegions(*Plan);
return Plan;
@@ -96,7 +96,7 @@ class VPlanTestBase : public testing::Test {
FunctionType *FTy = FunctionType::get(Type::getVoidTy(C), false);
F = Function::Create(FTy, GlobalValue::ExternalLinkage, "f", M.get());
ScalarHeader = BasicBlock::Create(C, "scalar.header", F);
- BranchInst::Create(ScalarHeader, ScalarHeader);
+ UncondBrInst::Create(ScalarHeader, ScalarHeader);
}
VPlan &getPlan() {
diff --git a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
index c75d35feadc01..a6790df82d3e4 100644
--- a/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
+++ b/llvm/utils/TableGen/Common/CodeGenDAGPatterns.cpp
@@ -2076,7 +2076,10 @@ void TreePatternNode::print(raw_ostream &OS) const {
for (const ScopedName &Name : NamesAsPredicateArg)
OS << ":$pred:" << Name.getScope() << ":" << Name.getIdentifier();
}
-void TreePatternNode::dump() const { print(dbgs()); }
+void TreePatternNode::dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+}
/// isIsomorphicTo - Return true if this node is recursively
/// isomorphic to the specified node. For this comparison, the node's
diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
index cbb7f89bee679..a909d4f0378b8 100644
--- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
+++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp
@@ -51,6 +51,50 @@ static const std::set<StringRef> NoFoldSet = {
#include "X86ManualFoldTables.def"
};
+const std::set<StringRef> NoFoldSameMaskPrefixSet = {
+#define NOFOLD_SAME_MASK_PREFIX(PREFIX) #PREFIX,
+#include "X86ManualFoldTables.def"
+};
+
+const std::set<StringRef> NoFoldSameMaskSet = {
+#define NOFOLD_SAME_MASK(INSN) #INSN,
+#include "X86ManualFoldTables.def"
+};
+
+// Check if instruction is unsafe for masked-load folding.
+static bool isNoFoldMaskedInstruction(const CodeGenInstruction *Inst) {
+ StringRef Name = Inst->getName();
+
+ // First check exact instruction name
+ if (NoFoldSameMaskSet.count(Name))
+ return true;
+
+ // Then strip suffixes to get base name for prefix matching
+ // Strip k-register suffix: kz or k
+ if (Name.ends_with("kz"))
+ Name = Name.drop_back(2);
+ else if (Name.ends_with("k"))
+ Name = Name.drop_back(1);
+ else
+ return false; // Not a k-register instruction
+
+ // Strip operand form suffix (check longer patterns first)
+ if (Name.ends_with("rri"))
+ Name = Name.drop_back(3);
+ else if (Name.ends_with("rr") || Name.ends_with("ri"))
+ Name = Name.drop_back(2);
+
+ // Strip vector size suffix: Z128, Z256, or Z
+ if (Name.ends_with("Z128") || Name.ends_with("Z256"))
+ Name = Name.drop_back(4);
+ else if (Name.ends_with("Z"))
+ Name = Name.drop_back(1);
+ else
+    return false; // Not an AVX512 instruction
+
+ return NoFoldSameMaskPrefixSet.count(Name);
+}
+
static bool isExplicitAlign(const CodeGenInstruction *Inst) {
return any_of(ExplicitAlign, [Inst](const char *InstStr) {
return Inst->getName().contains(InstStr);
@@ -195,6 +239,7 @@ class X86FoldTablesEmitter {
FoldTable BroadcastTable2;
FoldTable BroadcastTable3;
FoldTable BroadcastTable4;
+ std::vector<const CodeGenInstruction *> NonFoldableWithSameMaskTable;
public:
X86FoldTablesEmitter(const RecordKeeper &R) : Records(R), Target(R) {}
@@ -230,6 +275,14 @@ class X86FoldTablesEmitter {
OS << "};\n\n";
}
+
+ void printTable(const std::vector<const CodeGenInstruction *> &Instructions,
+ StringRef TableName, raw_ostream &OS) {
+ OS << "static const unsigned " << TableName << "[] = {\n";
+ for (auto Inst : Instructions)
+ OS << " X86::" << Inst->getName() << ",\n";
+ OS << "};\n\n";
+ }
};
} // namespace
@@ -644,6 +697,13 @@ void X86FoldTablesEmitter::run(raw_ostream &OS) {
if (hasRSTRegClass(Inst) || hasPtrTailcallRegClass(Inst))
continue;
+    // Check whether this instruction has a prefix in NoFoldSameMaskPrefixSet
+    // or is in NoFoldSameMaskSet (problematic for masked-load folding); if so,
+    // add it to NonFoldableWithSameMaskTable.
+ if (isNoFoldMaskedInstruction(Inst)) {
+ NonFoldableWithSameMaskTable.push_back(Inst);
+ }
+
// Add all the memory form instructions to MemInsts, and all the register
// form instructions to RegInsts[Opc], where Opc is the opcode of each
// instructions. this helps reducing the runtime of the backend.
@@ -749,6 +809,7 @@ void X86FoldTablesEmitter::run(raw_ostream &OS) {
PRINT_TABLE(BroadcastTable2)
PRINT_TABLE(BroadcastTable3)
PRINT_TABLE(BroadcastTable4)
+ PRINT_TABLE(NonFoldableWithSameMaskTable)
}
static TableGen::Emitter::OptClass<X86FoldTablesEmitter>
diff --git a/llvm/utils/TableGen/X86ManualFoldTables.def b/llvm/utils/TableGen/X86ManualFoldTables.def
index 003712ae124c7..693b3de69ae46 100644
--- a/llvm/utils/TableGen/X86ManualFoldTables.def
+++ b/llvm/utils/TableGen/X86ManualFoldTables.def
@@ -295,3 +295,108 @@ ENTRY(MOVSDrr, MOVLPDrm, TB_NO_REVERSE)
ENTRY(VMOVSDZrr, VMOVLPDZ128rm, TB_NO_REVERSE)
ENTRY(VMOVSDrr, VMOVLPDrm, TB_NO_REVERSE)
#undef ENTRY
+// Prefixes for instructions that are unsafe for masked-load folding.
+// Folding with the same mask is only safe if every active destination
+// element reads only from source elements that are also active under the same mask.
+// These instructions perform element rearrangement/broadcasting that may cause
+// active destination elements to read from masked-off source elements.
+// Matches names of the form: OPCODE[Z|Z128|Z256][rr|ri|rri][k|kz].
+#ifndef NOFOLD_SAME_MASK_PREFIX
+#define NOFOLD_SAME_MASK_PREFIX(PREFIX)
+#endif
+NOFOLD_SAME_MASK_PREFIX(VALIGND)
+NOFOLD_SAME_MASK_PREFIX(VALIGNQ)
+NOFOLD_SAME_MASK_PREFIX(VBROADCASTF32X2)
+NOFOLD_SAME_MASK_PREFIX(VBROADCASTI32X2)
+NOFOLD_SAME_MASK_PREFIX(VBROADCASTSD)
+NOFOLD_SAME_MASK_PREFIX(VBROADCASTSS)
+NOFOLD_SAME_MASK_PREFIX(VDBPSADBW)
+NOFOLD_SAME_MASK_PREFIX(VEXPANDPD)
+NOFOLD_SAME_MASK_PREFIX(VEXPANDPS)
+NOFOLD_SAME_MASK_PREFIX(VGF2P8AFFINEINVQB)
+NOFOLD_SAME_MASK_PREFIX(VGF2P8AFFINEQB)
+NOFOLD_SAME_MASK_PREFIX(VINSERTF32X4)
+NOFOLD_SAME_MASK_PREFIX(VINSERTF32X8)
+NOFOLD_SAME_MASK_PREFIX(VINSERTF64X2)
+NOFOLD_SAME_MASK_PREFIX(VINSERTF64X4)
+NOFOLD_SAME_MASK_PREFIX(VINSERTI32X4)
+NOFOLD_SAME_MASK_PREFIX(VINSERTI32X8)
+NOFOLD_SAME_MASK_PREFIX(VINSERTI64X2)
+NOFOLD_SAME_MASK_PREFIX(VINSERTI64X4)
+NOFOLD_SAME_MASK_PREFIX(VMOVDDUP)
+NOFOLD_SAME_MASK_PREFIX(VMOVSHDUP)
+NOFOLD_SAME_MASK_PREFIX(VMOVSLDUP)
+NOFOLD_SAME_MASK_PREFIX(VMPSADBW)
+NOFOLD_SAME_MASK_PREFIX(VPACKSSDW)
+NOFOLD_SAME_MASK_PREFIX(VPACKSSWB)
+NOFOLD_SAME_MASK_PREFIX(VPACKUSDW)
+NOFOLD_SAME_MASK_PREFIX(VPACKUSWB)
+NOFOLD_SAME_MASK_PREFIX(VPALIGNR)
+NOFOLD_SAME_MASK_PREFIX(VPBROADCASTB)
+NOFOLD_SAME_MASK_PREFIX(VPBROADCASTD)
+NOFOLD_SAME_MASK_PREFIX(VPBROADCASTQ)
+NOFOLD_SAME_MASK_PREFIX(VPBROADCASTW)
+NOFOLD_SAME_MASK_PREFIX(VPCONFLICTD)
+NOFOLD_SAME_MASK_PREFIX(VPCONFLICTQ)
+NOFOLD_SAME_MASK_PREFIX(VPERMB)
+NOFOLD_SAME_MASK_PREFIX(VPERMD)
+NOFOLD_SAME_MASK_PREFIX(VPERMI2B)
+NOFOLD_SAME_MASK_PREFIX(VPERMI2D)
+NOFOLD_SAME_MASK_PREFIX(VPERMI2PD)
+NOFOLD_SAME_MASK_PREFIX(VPERMI2PS)
+NOFOLD_SAME_MASK_PREFIX(VPERMI2Q)
+NOFOLD_SAME_MASK_PREFIX(VPERMI2W)
+NOFOLD_SAME_MASK_PREFIX(VPERMPD)
+NOFOLD_SAME_MASK_PREFIX(VPERMPS)
+NOFOLD_SAME_MASK_PREFIX(VPERMQ)
+NOFOLD_SAME_MASK_PREFIX(VPERMT2B)
+NOFOLD_SAME_MASK_PREFIX(VPERMT2D)
+NOFOLD_SAME_MASK_PREFIX(VPERMT2PD)
+NOFOLD_SAME_MASK_PREFIX(VPERMT2PS)
+NOFOLD_SAME_MASK_PREFIX(VPERMT2Q)
+NOFOLD_SAME_MASK_PREFIX(VPERMT2W)
+NOFOLD_SAME_MASK_PREFIX(VPERMW)
+NOFOLD_SAME_MASK_PREFIX(VPEXPANDB)
+NOFOLD_SAME_MASK_PREFIX(VPEXPANDD)
+NOFOLD_SAME_MASK_PREFIX(VPEXPANDQ)
+NOFOLD_SAME_MASK_PREFIX(VPEXPANDW)
+NOFOLD_SAME_MASK_PREFIX(VPMULTISHIFTQB)
+NOFOLD_SAME_MASK_PREFIX(VPSHUFD)
+NOFOLD_SAME_MASK_PREFIX(VPSHUFHW)
+NOFOLD_SAME_MASK_PREFIX(VPSHUFLW)
+NOFOLD_SAME_MASK_PREFIX(VPUNPCKHBW)
+NOFOLD_SAME_MASK_PREFIX(VPUNPCKHDQ)
+NOFOLD_SAME_MASK_PREFIX(VPUNPCKHQDQ)
+NOFOLD_SAME_MASK_PREFIX(VPUNPCKHWD)
+NOFOLD_SAME_MASK_PREFIX(VPUNPCKLBW)
+NOFOLD_SAME_MASK_PREFIX(VPUNPCKLDQ)
+NOFOLD_SAME_MASK_PREFIX(VPUNPCKLQDQ)
+NOFOLD_SAME_MASK_PREFIX(VPUNPCKLWD)
+NOFOLD_SAME_MASK_PREFIX(VSHUFF32X4)
+NOFOLD_SAME_MASK_PREFIX(VSHUFF64X2)
+NOFOLD_SAME_MASK_PREFIX(VSHUFI32X4)
+NOFOLD_SAME_MASK_PREFIX(VSHUFI64X2)
+NOFOLD_SAME_MASK_PREFIX(VSHUFPD)
+NOFOLD_SAME_MASK_PREFIX(VSHUFPS)
+NOFOLD_SAME_MASK_PREFIX(VUNPCKHPD)
+NOFOLD_SAME_MASK_PREFIX(VUNPCKHPS)
+NOFOLD_SAME_MASK_PREFIX(VUNPCKLPD)
+NOFOLD_SAME_MASK_PREFIX(VUNPCKLPS)
+#undef NOFOLD_SAME_MASK_PREFIX
+#ifndef NOFOLD_SAME_MASK
+#define NOFOLD_SAME_MASK(INSN)
+#endif
+// VPERMILPD/VPERMILPS: only the rik forms are listed here; rrk forms are NOT blocked.
+NOFOLD_SAME_MASK(VPERMILPDZ128rik)
+NOFOLD_SAME_MASK(VPERMILPDZ128rikz)
+NOFOLD_SAME_MASK(VPERMILPDZ256rik)
+NOFOLD_SAME_MASK(VPERMILPDZ256rikz)
+NOFOLD_SAME_MASK(VPERMILPDZrik)
+NOFOLD_SAME_MASK(VPERMILPDZrikz)
+NOFOLD_SAME_MASK(VPERMILPSZ128rik)
+NOFOLD_SAME_MASK(VPERMILPSZ128rikz)
+NOFOLD_SAME_MASK(VPERMILPSZ256rik)
+NOFOLD_SAME_MASK(VPERMILPSZ256rikz)
+NOFOLD_SAME_MASK(VPERMILPSZrik)
+NOFOLD_SAME_MASK(VPERMILPSZrikz)
+#undef NOFOLD_SAME_MASK
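The NOFOLD_SAME_MASK_PREFIX / NOFOLD_SAME_MASK entries above follow the usual X-macro convention: the .def list expands into whatever definition the including file provides, and the #ifndef/#define guard at the top of each section makes the macro a no-op for consumers that do not care about it. A minimal, self-contained C++ sketch of such a consumer (illustrative only, not the actual X86 fold-table code) looks like this:
```cpp
// Sketch of consuming an X-macro list like the one above. A real consumer
// would `#include` the .def file between the #define and the #undef instead
// of inlining entries.
#include <string>
#include <vector>

static std::vector<std::string> collectNoFoldSameMaskPrefixes() {
  std::vector<std::string> Prefixes;
#define NOFOLD_SAME_MASK_PREFIX(PREFIX) Prefixes.push_back(#PREFIX);
  // Two entries inlined here to keep the sketch self-contained; the full
  // list lives in the .def file shown above.
  NOFOLD_SAME_MASK_PREFIX(VALIGND)
  NOFOLD_SAME_MASK_PREFIX(VPERMD)
#undef NOFOLD_SAME_MASK_PREFIX
  return Prefixes;
}
```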
diff --git a/llvm/utils/git-llvm-push b/llvm/utils/git-llvm-push
index 2297215eb4657..53203d1af230e 100644
--- a/llvm/utils/git-llvm-push
+++ b/llvm/utils/git-llvm-push
@@ -1,6 +1,9 @@
#!/usr/bin/env python3
"""A script to automate the creation and landing of a stack of Pull Requests."""
+# TODO: Remove typing workarounds when we use a newer python.
+from __future__ import annotations
+
import argparse
import json
import os
@@ -11,8 +14,6 @@ import time
import urllib.error
import urllib.request
-# TODO: Remove typing workarounds when we use a newer python.
-from typing import List, Optional, Tuple
from http.client import HTTPResponse
from dataclasses import dataclass
@@ -46,7 +47,7 @@ class PRAutomatorConfig:
upstream_remote: str
prefix: str
draft: bool
- labels: List[str]
+ labels: list[str]
no_merge: bool
auto_merge: bool
@@ -73,13 +74,13 @@ class CommandRunner:
def run_command(
self,
- command: List[str],
+ command: list[str],
check: bool = True,
capture_output: bool = False,
text: bool = False,
- stdin_input: Optional[str] = None,
+ stdin_input: str | None = None,
read_only: bool = False,
- env: Optional[dict] = None,
+ env: dict | None = None,
) -> subprocess.CompletedProcess:
if self.dry_run and not read_only:
self.print(f"[Dry Run] Would run: {' '.join(command)}")
@@ -124,7 +125,7 @@ class GitHubAPI:
)
def _request(
- self, method: str, endpoint: str, json_payload: Optional[dict] = None
+ self, method: str, endpoint: str, json_payload: dict | None = None
) -> HTTPResponse:
url = f"{GITHUB_API}{endpoint}"
self.runner.verbose_print(f"API Request: {method.upper()} {url}")
@@ -153,7 +154,7 @@ class GitHubAPI:
raise e
def _request_and_parse_json(
- self, method: str, endpoint: str, json_payload: Optional[dict] = None
+ self, method: str, endpoint: str, json_payload: dict | None = None
) -> dict:
with self._request(method, endpoint, json_payload) as response:
# Expect a 200 'OK' or 201 'Created' status on success and JSON body.
@@ -165,7 +166,7 @@ class GitHubAPI:
return {}
def _request_no_content(
- self, method: str, endpoint: str, json_payload: Optional[dict] = None
+ self, method: str, endpoint: str, json_payload: dict | None = None
) -> None:
with self._request(method, endpoint, json_payload) as response:
# Expected a 204 No Content status on success, indicating the
@@ -184,7 +185,7 @@ class GitHubAPI:
)
def _log_unexpected_status(
- self, expected_statuses: List[int], actual_status: int
+ self, expected_statuses: list[int], actual_status: int
) -> None:
if actual_status not in expected_statuses:
self.runner.print(
@@ -204,8 +205,10 @@ class GitHubAPI:
draft: bool,
) -> tuple:
if self.runner.dry_run:
- self.runner.print(f"[Dry Run] Would create pull request for '{head_branch}'...")
- return 0
+ self.runner.print(
+ f"[Dry Run] Would create pull request for '{head_branch}'..."
+ )
+ return (0, 0)
self.runner.print(f"Creating pull request for '{head_branch}'...")
data = {
@@ -218,13 +221,13 @@ class GitHubAPI:
response_data = self._request_and_parse_json(
"POST", f"/repos/{LLVM_REPO}/pulls", json_payload=data
)
- self.runner.print(f"Pull request created: {response_data.get("html_url")}")
+ self.runner.print(f"Pull request created: {response_data.get('html_url')}")
return (response_data.get("node_id"), response_data.get("number"))
def get_repo_settings(self) -> dict:
return self._request_and_parse_json("GET", f"/repos/{LLVM_REPO}")
- def _get_pr_details(self, pr_number: str) -> dict:
+ def _get_pr_details(self, pr_number: int) -> dict:
"""Fetches the JSON details for a given pull request number."""
return self._request_and_parse_json(
"GET", f"/repos/{LLVM_REPO}/pulls/{pr_number}"
@@ -233,20 +236,23 @@ class GitHubAPI:
def add_labels(
self,
pr_number: int,
- labels: List[str],
+ labels: list[str],
) -> None:
if self.runner.dry_run:
- self.runner.print(f"[Dry Run] Would set labels for #{pr_number}: {' '.join(labels)}")
+ self.runner.print(
+ f"[Dry Run] Would set labels for #{pr_number}: {' '.join(labels)}"
+ )
return None
self.runner.print(f"Setting labels for #{pr_number}: {' '.join(labels)}")
self._request_and_parse_json(
- "POST", f"/repos/{LLVM_REPO}/issues/{pr_number}/labels",
+ "POST",
+ f"/repos/{LLVM_REPO}/issues/{pr_number}/labels",
json_payload={"labels": labels},
)
- def _attempt_squash_merge(self, pr_number: str) -> bool:
+ def _attempt_squash_merge(self, pr_number: int) -> bool:
"""Attempts to squash merge a PR, returning True on success."""
try:
self._request_and_parse_json(
@@ -262,7 +268,7 @@ class GitHubAPI:
# Re-raise other HTTP errors.
raise e
- def merge_pr(self, pr_number: int) -> Optional[str]:
+ def merge_pr(self, pr_number: int) -> str | None:
if self.runner.dry_run:
self.runner.print(f"[Dry Run] Would merge #{pr_number}")
return None
@@ -313,7 +319,7 @@ class GitHubAPI:
self.runner.print("Auto-merge enabled.")
def delete_branch(
- self, branch_name: str, default_branch: Optional[str] = None
+ self, branch_name: str, default_branch: str | None = None
) -> None:
if default_branch and branch_name == default_branch:
self.runner.print(
@@ -350,7 +356,7 @@ class LLVMPRAutomator:
self.config = config
self.remote = remote
self.original_branch: str = ""
- self.created_branches: List[str] = []
+ self.created_branches: list[str] = []
self.repo_settings: dict = {}
def _get_git_env(self) -> dict:
@@ -431,7 +437,7 @@ class LLVMPRAutomator:
)
raise LlvmPrError("rebase operation failed.") from e
- def _get_commit_stack(self) -> List[str]:
+ def _get_commit_stack(self) -> list[str]:
target = f"{self.config.upstream_remote}/{self.config.base_branch}"
result = self.runner.run_command(
["git", "rev-list", "--reverse", f"{target}..HEAD"],
@@ -441,7 +447,7 @@ class LLVMPRAutomator:
)
return result.stdout.strip().splitlines()
- def _get_commit_details(self, commit_hash: str) -> Tuple[str, str]:
+ def _get_commit_details(self, commit_hash: str) -> tuple[str, str]:
# Get the subject and body from git show. Insert "\n\n" between to make
# parsing simple to do w/ split.
result = self.runner.run_command(
@@ -630,9 +636,9 @@ def main() -> None:
)
parser.add_argument(
"--labels",
- nargs='*',
+ nargs="*",
default=[DEFAULT_LABEL],
- help=f"Set the PR labels (default: {DEFAULT_LABEL})."
+ help=f"Set the PR labels (default: {DEFAULT_LABEL}).",
)
merging = parser.add_mutually_exclusive_group()
merging.add_argument(
diff --git a/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn
index 27f1f89281f8b..e4d1e9e3d6f2b 100644
--- a/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/include/clang/Config/BUILD.gn
@@ -21,6 +21,8 @@ write_cmake_config("Config") {
"CLANG_CONFIG_FILE_SYSTEM_DIR=",
"CLANG_CONFIG_FILE_USER_DIR=",
"CLANG_SPAWN_CC1=",
+ "CLANG_USE_XCSELECT=",
+ "CLANG_XCSELECT_HOST_SDK_POLICY=",
"DEFAULT_SYSROOT=",
"GCC_INSTALL_PREFIX=",
"ENABLE_LINKER_BUILD_ID=",
diff --git a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn
index 3a52be9016599..7acd63041b531 100644
--- a/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Driver/BUILD.gn
@@ -16,6 +16,8 @@ static_library("Driver") {
"//clang/lib/Basic",
"//clang/lib/Frontend",
"//clang/lib/Options",
+ "//clang/lib/ScalableStaticAnalysisFramework/Core",
+ "//clang/lib/ScalableStaticAnalysisFramework/Frontend",
"//llvm/include/llvm/Config:llvm-config",
"//llvm/lib/BinaryFormat",
"//llvm/lib/Option",
diff --git a/llvm/utils/gn/secondary/clang/lib/FrontendTool/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/FrontendTool/BUILD.gn
index 707eabf1af70b..60157daa66d40 100644
--- a/llvm/utils/gn/secondary/clang/lib/FrontendTool/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/FrontendTool/BUILD.gn
@@ -12,6 +12,8 @@ static_library("FrontendTool") {
"//clang/lib/Frontend",
"//clang/lib/Frontend/Rewrite",
"//clang/lib/Options",
+ "//clang/lib/ScalableStaticAnalysisFramework/Core",
+ "//clang/lib/ScalableStaticAnalysisFramework/Frontend",
"//llvm/lib/Option",
"//llvm/lib/Support",
]
diff --git a/llvm/utils/gn/secondary/clang/lib/ScalableStaticAnalysisFramework/Frontend/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/ScalableStaticAnalysisFramework/Frontend/BUILD.gn
new file mode 100644
index 0000000000000..96da539ae24da
--- /dev/null
+++ b/llvm/utils/gn/secondary/clang/lib/ScalableStaticAnalysisFramework/Frontend/BUILD.gn
@@ -0,0 +1,15 @@
+static_library("Frontend") {
+ output_name = "clangScalableStaticAnalysisFrameworkFrontend"
+ configs += [ "//llvm/utils/gn/build:clang_code" ]
+ deps = [
+ "//clang/lib/AST",
+ "//clang/lib/Basic",
+ "//clang/lib/Frontend",
+ "//clang/lib/ScalableStaticAnalysisFramework/Core",
+ "//clang/lib/Sema",
+ "//llvm/lib/Support",
+ ]
+ sources = [
+ "TUSummaryExtractorFrontendAction.cpp",
+ ]
+}
diff --git a/llvm/utils/gn/secondary/clang/test/BUILD.gn b/llvm/utils/gn/secondary/clang/test/BUILD.gn
index 99e509dbdf3ab..d820a23f8f7a1 100644
--- a/llvm/utils/gn/secondary/clang/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/test/BUILD.gn
@@ -57,6 +57,7 @@ write_lit_config("lit_site_cfg") {
"CLANG_ENABLE_OBJC_REWRITER=0",
"CLANG_PLUGIN_SUPPORT=0", # FIXME: Analysis/plugins need global -fPIC
"CURRENT_TOOLS_DIR=" + rebase_path("$root_out_dir/bin", dir),
+ "CLANG_USE_XCSELECT=0",
"CLANG_VENDOR_UTI=org.llvm.clang",
"ENABLE_BACKTRACES=1",
"ENABLE_SHARED=0",
@@ -163,11 +164,11 @@ group("test") {
"//clang/tools/clang-refactor",
"//clang/tools/clang-repl",
"//clang/tools/clang-scan-deps",
+ "//clang/tools/clang-ssaf-format",
+ "//clang/tools/clang-ssaf-linker",
"//clang/tools/clang-sycl-linker",
"//clang/tools/diagtool",
"//clang/tools/driver:symlinks",
- "//clang/tools/ssaf-format:clang-ssaf-format",
- "//clang/tools/ssaf-linker:clang-ssaf-linker",
"//clang/unittests",
"//clang/utils/TableGen:clang-tblgen",
"//clang/utils/hmaptool",
diff --git a/llvm/utils/gn/secondary/clang/tools/ssaf-format/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/clang-ssaf-format/BUILD.gn
similarity index 100%
rename from llvm/utils/gn/secondary/clang/tools/ssaf-format/BUILD.gn
rename to llvm/utils/gn/secondary/clang/tools/clang-ssaf-format/BUILD.gn
diff --git a/llvm/utils/gn/secondary/clang/tools/ssaf-linker/BUILD.gn b/llvm/utils/gn/secondary/clang/tools/clang-ssaf-linker/BUILD.gn
similarity index 100%
rename from llvm/utils/gn/secondary/clang/tools/ssaf-linker/BUILD.gn
rename to llvm/utils/gn/secondary/clang/tools/clang-ssaf-linker/BUILD.gn
diff --git a/llvm/utils/gn/secondary/clang/unittests/ScalableStaticAnalysisFramework/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/ScalableStaticAnalysisFramework/BUILD.gn
index 12dac075fbde0..d5bdf665bdd7f 100644
--- a/llvm/utils/gn/secondary/clang/unittests/ScalableStaticAnalysisFramework/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/ScalableStaticAnalysisFramework/BUILD.gn
@@ -8,6 +8,7 @@ unittest("ClangScalableAnalysisTests") {
"//clang/lib/Basic",
"//clang/lib/Frontend",
"//clang/lib/ScalableStaticAnalysisFramework/Core",
+ "//clang/lib/ScalableStaticAnalysisFramework/Frontend",
"//clang/lib/Serialization",
"//clang/lib/Tooling",
"//llvm/lib/Frontend/OpenMP",
@@ -25,6 +26,7 @@ unittest("ClangScalableAnalysisTests") {
"EntityLinkerTest.cpp",
"EntityNameTest.cpp",
"ErrorBuilderTest.cpp",
+ "Frontend/TUSummaryExtractorFrontendActionTest.cpp",
"ModelStringConversionsTest.cpp",
"Registries/FancyAnalysisData.cpp",
"Registries/MockSerializationFormat.cpp",
diff --git a/llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn b/llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn
index cf80b715f7097..faed86ef98839 100644
--- a/llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/include/lldb/Host/BUILD.gn
@@ -19,6 +19,7 @@ write_cmake_config("Config") {
"LLDB_ENABLE_CURSES=",
"CURSES_HAVE_NCURSES_CURSES_H=",
"LLDB_ENABLE_LUA=",
+ "LLDB_ENABLE_MTE=",
"LLDB_ENABLE_PYTHON=",
"LLDB_ENABLE_PYTHON_LIMITED_API=",
"LLDB_ENABLE_TREESITTER=",
diff --git a/llvm/utils/gn/secondary/lldb/test/BUILD.gn b/llvm/utils/gn/secondary/lldb/test/BUILD.gn
index 272a71d10ed56..cd7a17fa4416a 100644
--- a/llvm/utils/gn/secondary/lldb/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/lldb/test/BUILD.gn
@@ -62,6 +62,7 @@ write_lit_cfg("lit_api_site_cfg") {
rebase_path("$root_build_dir/include/c++/v1"),
"LIBCXX_GENERATED_INCLUDE_TARGET_DIR=",
"LLDB_BUILD_INTEL_PT=0",
+ "LLDB_ENABLE_MTE=0",
"LLDB_TEST_COMMON_ARGS=",
"LLDB_TEST_USER_ARGS=",
"LLDB_ENABLE_PYTHON=0",
@@ -123,6 +124,7 @@ write_lit_cfg("lit_shell_site_cfg") {
"LLDB_BUILD_LLDBRPC=0", # FIXME: add lldb-rpc-gen target, enable
"LLDB_ENABLE_LUA=0", # FIXME: gn arg, use in Config.h
"LLDB_ENABLE_LZMA=0", # FIXME: gn arg, use in Config.h
+ "LLDB_ENABLE_MTE=0",
"LLDB_ENABLE_PYTHON=0", # FIXME: gn arg, use in Config.h
"LLDB_HAS_LIBCXX=False", # FIXME: support this (?)
"LLDB_IS_64_BITS=1",
diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py
index 9e7d64a5ea8c5..d856642e3f447 100644
--- a/llvm/utils/lit/lit/TestRunner.py
+++ b/llvm/utils/lit/lit/TestRunner.py
@@ -135,7 +135,9 @@ def addProcess(self, proc):
return
needToRunKill = False
with self._lock:
- self._procs.append(proc)
+ # Just store the pid rather than the whole proc object; holding the
+ # proc object keeps resources (e.g. pipes) open unnecessarily.
+ self._procs.append(proc.pid)
# Avoid re-entering the lock by finding out if kill needs to be run
# again here but call it if necessary once we have left the lock.
# We could use a reentrant lock here instead but this code seems
@@ -175,8 +177,8 @@ def _kill(self):
the initial call to _kill()
"""
with self._lock:
- for p in self._procs:
- lit.util.killProcessAndChildren(p.pid)
+ for pid in self._procs:
+ lit.util.killProcessAndChildren(pid)
# Empty the list and note that we've done a pass over the list
self._procs = [] # Python2 doesn't have list.clear()
self._doneKillPass = True
diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat
index a5c461662bb37..94ec24f21639f 100644
--- a/llvm/utils/release/build_llvm_release.bat
+++ b/llvm/utils/release/build_llvm_release.bat
@@ -149,6 +149,18 @@ if "%skip-checkout%" == "true" (
curl -O https://gitlab.gnome.org/GNOME/libxml2/-/archive/v2.9.12/libxml2-v2.9.12.tar.gz || exit /b 1
tar zxf libxml2-v2.9.12.tar.gz
+REM FIXME: It would be preferable to use zlib-ng here since it is better
+REM maintained and performs better than zlib, but lld tests currently
+REM assume the original zlib is used. They need to be fixed first:
+REM https://github.com/llvm/llvm-project/pull/186630#discussion_r2939953952
+set zlib_version=1.3.2
+curl -LO https://github.com/madler/zlib/releases/download/v%zlib_version%/zlib-%zlib_version%.tar.gz || exit /b 1
+tar zxf zlib-%zlib_version%.tar.gz
+
+set zstd_version=1.5.7
+curl -LO https://github.com/facebook/zstd/releases/download/v%zstd_version%/zstd-%zstd_version%.tar.gz || exit /b 1
+tar zxf zstd-%zstd_version%.tar.gz
+
REM Setting CMAKE_CL_SHOWINCLUDES_PREFIX to work around PR27226.
REM Common flags for all builds.
set common_compiler_flags=-DLIBXML_STATIC
@@ -163,6 +175,8 @@ set common_cmake_flags=^
-DCMAKE_CL_SHOWINCLUDES_PREFIX="Note: including file: " ^
-DLLVM_ENABLE_LIBXML2=FORCE_ON ^
-DCLANG_ENABLE_LIBXML2=OFF ^
+ -DLLVM_ENABLE_ZLIB=FORCE_ON ^
+ -DLLVM_ENABLE_ZSTD=FORCE_ON ^
-DCMAKE_C_FLAGS="%common_compiler_flags%" ^
-DCMAKE_CXX_FLAGS="%common_compiler_flags%" ^
-DLLVM_ENABLE_RPMALLOC=ON ^
@@ -213,6 +227,8 @@ call "%vsdevcmd%" -arch=x86 || exit /b 1
mkdir build32_stage0
cd build32_stage0
call :do_build_libxml || exit /b 1
+call :do_build_zlib || exit /b 1
+call :do_build_zstd || exit /b 1
REM Stage0 binaries directory; used in stage1.
set "stage0_bin_dir=%build_dir%/build32_stage0/bin"
@@ -221,7 +237,11 @@ set cmake_flags=^
-DLLVM_ENABLE_RPMALLOC=OFF ^
-DPython3_ROOT_DIR=%PYTHONHOME% ^
-DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
- -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib
+ -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib ^
+ -DZLIB_INCLUDE_DIR=%zlibdir%/include ^
+ -DZLIB_LIBRARY=%zlibdir%/lib/zs.lib ^
+ -Dzstd_INCLUDE_DIR=%zstddir%/include ^
+ -Dzstd_LIBRARY=%zstddir%/lib/zstd_static.lib
cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1
ninja || ninja || ninja || exit /b 1
@@ -274,6 +294,8 @@ call "%vsdevcmd%" -arch=%arch% || exit /b 1
mkdir build_%arch%_stage0
cd build_%arch%_stage0
call :do_build_libxml || exit /b 1
+call :do_build_zlib || exit /b 1
+call :do_build_zstd || exit /b 1
REM Stage0 binaries directory; used in stage1.
set "stage0_bin_dir=%build_dir%/build_%arch%_stage0/bin"
@@ -282,6 +304,10 @@ set cmake_flags=^
-DPython3_ROOT_DIR=%PYTHONHOME% ^
-DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
-DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib ^
+ -DZLIB_INCLUDE_DIR=%zlibdir%/include ^
+ -DZLIB_LIBRARY=%zlibdir%/lib/zs.lib ^
+ -Dzstd_INCLUDE_DIR=%zstddir%/include ^
+ -Dzstd_LIBRARY=%zstddir%/lib/zstd_static.lib ^
-DCLANG_DEFAULT_LINKER=lld
if "%arch%"=="arm64" (
set cmake_flags=%cmake_flags% ^
@@ -407,6 +433,38 @@ set "libxmldir=%libxmldir:\=/%"
cd ..
exit /b 0
+::==============================================================================
+:: Build zlib.
+::==============================================================================
+:do_build_zlib
+mkdir zlibbuild
+cd zlibbuild
+cmake -GNinja -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install ^
+ -DZLIB_BUILD_TESTING=OFF -DZLIB_BUILD_SHARED=OFF -DZLIB_BUILD_STATIC=ON ^
+ -DZLIB_INSTALL=ON -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded ^
+ ../../zlib-%zlib_version% || exit /b 1
+ninja install || exit /b 1
+set zlibdir=%cd%\install
+set "zlibdir=%zlibdir:\=/%"
+cd ..
+exit /b 0
+
+::==============================================================================
+:: Build zstd.
+::==============================================================================
+:do_build_zstd
+mkdir zstdbuild
+cd zstdbuild
+cmake -GNinja -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install ^
+ -DZSTD_BUILD_PROGRAMS=OFF -DZSTD_BUILD_TESTS=OFF -DZSTD_BUILD_STATIC=ON ^
+ -DZSTD_BUILD_SHARED=OFF -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded ^
+ ../../zstd-%zstd_version%/build/cmake || exit /b 1
+ninja install || exit /b 1
+set zstddir=%cd%\install
+set "zstddir=%zstddir:\=/%"
+cd ..
+exit /b 0
+
::==============================================================================
:: Generate a PGO profile.
::==============================================================================
diff --git a/mlir/docs/Dialects/TOSA.md b/mlir/docs/Dialects/TOSA.md
index 15a2b459f8b91..695d4d4210c78 100644
--- a/mlir/docs/Dialects/TOSA.md
+++ b/mlir/docs/Dialects/TOSA.md
@@ -113,3 +113,23 @@ scheme vs the other.
## Operation definitions
[include "Dialects/TosaOps.md"]
+
+### Operation purity
+Some TOSA operations may exhibit undefined behaviour. In the TOSA specification
+this is indicated by a `REQUIRE` condition in the operation pesudo-code. An
+implementation is not required to detect unpredictable behaviour, see
+[Section 4.3](https://www.mlplatform.org/tosa/tosa_spec_1_0_0.html#_operator_validation_helpers).
+
+If an operation can exhibit undefined behaviour, speculating or reordering it
+may change program behaviour. For example, `INTDIV` may exhibit undefined
+behaviour if the divisor is zero. If such an operation were speculated it could
+be executed in situations where the original program would not have executed
+it.
+
+Therefore, operations that may exhibit undefined behaviour must not declare
+the `AlwaysSpeculatable` trait and should not be treated as `Pure`.
+
+Conversely, most TOSA operations are functional tensor computations and do not
+mutate external system resources. These operations therefore typically declare
+the `NoMemoryEffect` trait. Variable operations are an example of an exception
+to this rule.
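To make the practical effect of this trait split concrete: transforms that hoist or sink operations generally consult the side-effect interfaces, so an op carrying only `NoMemoryEffect` remains movable within straight-line code but is not speculated the way a `Pure`/`AlwaysSpeculatable` op is. A rough sketch using the standard MLIR helpers (an illustration, not code from this change):
```cpp
// Sketch of how a transform might gate hoisting on speculatability, assuming
// the standard MLIR side-effect helpers from SideEffectInterfaces.h.
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"

using namespace mlir;

// Returns true if `op` can safely be hoisted out of a conditionally executed
// region: it must be free of memory effects *and* speculatable. A TOSA op
// declaring only NoMemoryEffect (e.g. tosa.intdiv) fails the second check.
static bool canHoist(Operation *op) {
  return isMemoryEffectFree(op) && isSpeculatable(op);
}
```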
diff --git a/mlir/include/mlir-c/Dialect/LLVM.h b/mlir/include/mlir-c/Dialect/LLVM.h
index 93602a286c9c4..2c167d23a7d9a 100644
--- a/mlir/include/mlir-c/Dialect/LLVM.h
+++ b/mlir/include/mlir-c/Dialect/LLVM.h
@@ -63,6 +63,12 @@ mlirLLVMFunctionTypeGet(MlirType resultType, intptr_t nArgumentTypes,
MLIR_CAPI_EXPORTED MlirStringRef mlirLLVMFunctionTypeGetName(void);
+/// Returns `true` if the type is an LLVM dialect function type.
+MLIR_CAPI_EXPORTED bool mlirTypeIsALLVMFunctionType(MlirType type);
+
+/// Returns the TypeID of an LLVM function type.
+MLIR_CAPI_EXPORTED MlirTypeID mlirLLVMFunctionTypeGetTypeID(void);
+
/// Returns the number of input types.
MLIR_CAPI_EXPORTED intptr_t mlirLLVMFunctionTypeGetNumInputs(MlirType type);
@@ -70,6 +76,9 @@ MLIR_CAPI_EXPORTED intptr_t mlirLLVMFunctionTypeGetNumInputs(MlirType type);
MLIR_CAPI_EXPORTED MlirType mlirLLVMFunctionTypeGetInput(MlirType type,
intptr_t pos);
+/// Returns `true` if the function type is variadic.
+MLIR_CAPI_EXPORTED bool mlirLLVMFunctionTypeIsVarArg(MlirType type);
+
/// Returns the return type of the function type.
MLIR_CAPI_EXPORTED MlirType mlirLLVMFunctionTypeGetReturnType(MlirType type);
@@ -190,6 +199,7 @@ enum MlirLLVMCConv {
MlirLLVMCConvAMDGPU_Gfx = 100,
MlirLLVMCConvM68k_INTR = 101,
};
+
typedef enum MlirLLVMCConv MlirLLVMCConv;
/// Creates a LLVM CConv attribute.
@@ -205,6 +215,7 @@ enum MlirLLVMComdat {
MlirLLVMComdatNoDeduplicate = 3,
MlirLLVMComdatSameSize = 4,
};
+
typedef enum MlirLLVMComdat MlirLLVMComdat;
/// Creates a LLVM Comdat attribute.
@@ -226,6 +237,7 @@ enum MlirLLVMLinkage {
MlirLLVMLinkageExternWeak = 9,
MlirLLVMLinkageCommon = 10,
};
+
typedef enum MlirLLVMLinkage MlirLLVMLinkage;
/// Creates a LLVM Linkage attribute.
@@ -274,6 +286,7 @@ enum MlirLLVMTypeEncoding {
MlirLLVMTypeEncodingLoUser = 0x80,
MlirLLVMTypeEncodingHiUser = 0xff,
};
+
typedef enum MlirLLVMTypeEncoding MlirLLVMTypeEncoding;
/// Creates a LLVM DIBasicType attribute.
@@ -337,6 +350,7 @@ enum MlirLLVMDIEmissionKind {
MlirLLVMDIEmissionKindLineTablesOnly = 2,
MlirLLVMDIEmissionKindDebugDirectivesOnly = 3,
};
+
typedef enum MlirLLVMDIEmissionKind MlirLLVMDIEmissionKind;
enum MlirLLVMDINameTableKind {
@@ -345,6 +359,7 @@ enum MlirLLVMDINameTableKind {
MlirLLVMDINameTableKindNone = 2,
MlirLLVMDINameTableKindApple = 3,
};
+
typedef enum MlirLLVMDINameTableKind MlirLLVMDINameTableKind;
/// Creates a LLVM DICompileUnit attribute.
@@ -456,6 +471,69 @@ MLIR_CAPI_EXPORTED MlirStringRef mlirLLVMDIImportedEntityAttrGetName(void);
MLIR_CAPI_EXPORTED MlirAttribute
mlirLLVMDIModuleAttrGetScope(MlirAttribute diModule);
+//===----------------------------------------------------------------------===//
+// Metadata Attributes
+//===----------------------------------------------------------------------===//
+
+/// Creates an LLVM MDStringAttr.
+MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMMDStringAttrGet(MlirContext ctx,
+ MlirStringRef value);
+
+/// Returns `true` if the attribute is an LLVM MDStringAttr.
+MLIR_CAPI_EXPORTED bool mlirLLVMAttrIsAMDStringAttr(MlirAttribute attr);
+
+/// Returns the TypeID of MDStringAttr.
+MLIR_CAPI_EXPORTED MlirTypeID mlirLLVMMDStringAttrGetTypeID(void);
+
+/// Returns the string value of an LLVM MDStringAttr.
+MLIR_CAPI_EXPORTED MlirStringRef
+mlirLLVMMDStringAttrGetValue(MlirAttribute attr);
+
+/// Creates an LLVM MDConstantAttr wrapping an attribute.
+MLIR_CAPI_EXPORTED MlirAttribute
+mlirLLVMMDConstantAttrGet(MlirContext ctx, MlirAttribute valueAttr);
+
+/// Returns `true` if the attribute is an LLVM MDConstantAttr.
+MLIR_CAPI_EXPORTED bool mlirLLVMAttrIsAMDConstantAttr(MlirAttribute attr);
+
+/// Returns the TypeID of MDConstantAttr.
+MLIR_CAPI_EXPORTED MlirTypeID mlirLLVMMDConstantAttrGetTypeID(void);
+
+/// Returns the attribute value of an LLVM MDConstantAttr.
+MLIR_CAPI_EXPORTED MlirAttribute
+mlirLLVMMDConstantAttrGetValue(MlirAttribute attr);
+
+/// Creates an LLVM MDFuncAttr referencing a function symbol.
+MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMMDFuncAttrGet(MlirContext ctx,
+ MlirAttribute name);
+
+/// Returns `true` if the attribute is an LLVM MDFuncAttr.
+MLIR_CAPI_EXPORTED bool mlirLLVMAttrIsAMDFuncAttr(MlirAttribute attr);
+
+/// Returns the TypeID of MDFuncAttr.
+MLIR_CAPI_EXPORTED MlirTypeID mlirLLVMMDFuncAttrGetTypeID(void);
+
+/// Returns the symbol name of an LLVM MDFuncAttr.
+MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMMDFuncAttrGetName(MlirAttribute attr);
+
+/// Creates an LLVM MDNodeAttr.
+MLIR_CAPI_EXPORTED MlirAttribute mlirLLVMMDNodeAttrGet(
+ MlirContext ctx, intptr_t nOperands, MlirAttribute const *operands);
+
+/// Returns `true` if the attribute is an LLVM MDNodeAttr.
+MLIR_CAPI_EXPORTED bool mlirLLVMAttrIsAMDNodeAttr(MlirAttribute attr);
+
+/// Returns the TypeID of MDNodeAttr.
+MLIR_CAPI_EXPORTED MlirTypeID mlirLLVMMDNodeAttrGetTypeID(void);
+
+/// Returns the number of operands in an LLVM MDNodeAttr.
+MLIR_CAPI_EXPORTED intptr_t
+mlirLLVMMDNodeAttrGetNumOperands(MlirAttribute attr);
+
+/// Returns the operand at the given index of an LLVM MDNodeAttr.
+MLIR_CAPI_EXPORTED MlirAttribute
+mlirLLVMMDNodeAttrGetOperand(MlirAttribute attr, intptr_t index);
+
#ifdef __cplusplus
}
#endif
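For illustration, the new metadata entry points declared above combine with the existing generic MLIR C API roughly as follows (a sketch, not a test from this change; the generic helpers such as `mlirIntegerAttrGet` and `mlirStringRefCreateFromCString` are the pre-existing C API):
```cpp
// Builds an #llvm.md_node<#llvm.md_const<0 : i32>, #llvm.md_string<"foo.buffer">>
// attribute through the C API declared above.
#include "mlir-c/BuiltinAttributes.h"
#include "mlir-c/BuiltinTypes.h"
#include "mlir-c/Dialect/LLVM.h"
#include "mlir-c/IR.h"
#include "mlir-c/Support.h"

int main() {
  MlirContext ctx = mlirContextCreate();
  mlirDialectHandleRegisterDialect(mlirGetDialectHandle__llvm__(), ctx);
  mlirDialectHandleLoadDialect(mlirGetDialectHandle__llvm__(), ctx);

  // Constant-as-metadata operand wrapping `0 : i32`.
  MlirAttribute zero = mlirLLVMMDConstantAttrGet(
      ctx, mlirIntegerAttrGet(mlirIntegerTypeGet(ctx, 32), 0));
  // Metadata string operand.
  MlirAttribute name = mlirLLVMMDStringAttrGet(
      ctx, mlirStringRefCreateFromCString("foo.buffer"));

  MlirAttribute operands[] = {zero, name};
  MlirAttribute node = mlirLLVMMDNodeAttrGet(ctx, 2, operands);
  (void)node; // mlirLLVMMDNodeAttrGetNumOperands(node) would return 2 here.

  mlirContextDestroy(ctx);
  return 0;
}
```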
diff --git a/mlir/include/mlir/Analysis/Presburger/Matrix.h b/mlir/include/mlir/Analysis/Presburger/Matrix.h
index 15069fa2a55f1..4592efda29f70 100644
--- a/mlir/include/mlir/Analysis/Presburger/Matrix.h
+++ b/mlir/include/mlir/Analysis/Presburger/Matrix.h
@@ -233,6 +233,9 @@ class Matrix {
/// The left shift operation (i.e. dstPos < srcPos) works in a similar way.
void moveColumns(unsigned srcPos, unsigned num, unsigned dstPos);
+ /// Returns the matrix right-multiplied with `other`.
+ Matrix<T> postMultiply(const Matrix<T> &other) const;
+
protected:
/// The current number of rows, columns, and reserved columns. The underlying
/// data vector is viewed as an nRows x nReservedColumns matrix, of which the
@@ -274,6 +277,15 @@ class IntMatrix : public Matrix<DynamicAPInt> {
/// pivot.
std::pair<IntMatrix, IntMatrix> computeHermiteNormalForm() const;
+ /// Given the current matrix M, returns the matrices U, D, V such that
+ /// UMV = D, where D is called the Smith Normal Form (SNF).
+ /// The matrices have the following properties:
+ /// - U, V are unimodular. In other words, det(U), det(V) are 1 or -1;
+ /// their inverses also contain integer entries.
+ /// - D is diagonal.
+ /// - For all i, the diagonal element D_{i, i} divides D_{i + 1, i + 1}.
+ std::tuple<IntMatrix, IntMatrix, IntMatrix> computeSmithNormalForm() const;
+
/// Divide the first `nCols` of the specified row by their GCD.
/// Returns the GCD of the first `nCols` of the specified row.
DynamicAPInt normalizeRow(unsigned row, unsigned nCols);
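Restating the documented contract of `computeSmithNormalForm` in formula form (no new API, just the comment above written out, with a small worked example):
```latex
U M V = D, \qquad \det U, \det V \in \{\pm 1\}, \qquad
D = \operatorname{diag}(d_1, \dots, d_r, 0, \dots, 0), \quad d_i \mid d_{i+1}.
% Worked example:
M = \begin{pmatrix} 2 & 4 \\ 6 & 8 \end{pmatrix}
\;\Rightarrow\; D = \operatorname{diag}(2, 4),
\quad d_1 = \gcd(2,4,6,8) = 2, \quad d_1 d_2 = \lvert\det M\rvert = 8.
```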
diff --git a/mlir/include/mlir/Bindings/Python/Globals.h b/mlir/include/mlir/Bindings/Python/Globals.h
index 8f7085f6024f5..8a7f30fd218dc 100644
--- a/mlir/include/mlir/Bindings/Python/Globals.h
+++ b/mlir/include/mlir/Bindings/Python/Globals.h
@@ -78,10 +78,10 @@ class MLIR_PYTHON_API_EXPORTED PyGlobals {
bool replace = false);
/// Adds a concrete implementation dialect class.
- /// Raises an exception if the mapping already exists.
+ /// Raises an exception if the mapping already exists and replace == false.
/// This is intended to be called by implementation code.
void registerDialectImpl(const std::string &dialectNamespace,
- nanobind::object pyClass);
+ nanobind::object pyClass, bool replace = false);
/// Adds a concrete implementation operation class.
/// Raises an exception if the mapping already exists and replace == false.
diff --git a/mlir/include/mlir/Dialect/Arith/Utils/Utils.h b/mlir/include/mlir/Dialect/Arith/Utils/Utils.h
index c0b286494996b..4ebb7e16239f7 100644
--- a/mlir/include/mlir/Dialect/Arith/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Arith/Utils/Utils.h
@@ -143,8 +143,9 @@ Value createProduct(OpBuilder &builder, Location loc, ArrayRef<Value> values);
Value createProduct(OpBuilder &builder, Location loc, ArrayRef<Value> values,
Type resultType);
-// Map strings to float types.
-std::optional<FloatType> parseFloatType(MLIRContext *ctx, StringRef name);
+// Map strings to float types. Returns nullptr if the name is not a known
+// floating-point type.
+FloatType parseFloatType(MLIRContext *ctx, StringRef name);
} // namespace arith
} // namespace mlir
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 6b0fd1ed9080e..b5a9e3413ddfd 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -618,6 +618,7 @@ def LaunchIndx : AnyTypeOf<[Index, I32, I64]>;
def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
GPU_AsyncOpInterface, AttrSizedOperandSegments,
+ DeclareOpInterfaceMethods<SymbolUserOpInterface>,
AllTypesMatch<["gridSizeX", "gridSizeY", "gridSizeZ", "blockSizeX",
"blockSizeY", "blockSizeZ"]>]>,
Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
index 36acf244865eb..14a4f888bd51d 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAttrDefs.td
@@ -1686,4 +1686,75 @@ def UWTableKindAttr : LLVM_Attr<"UWTableKind", "uwtableKind"> {
let assemblyFormat = "`<` $uwtableKind `>`";
}
+//===----------------------------------------------------------------------===//
+// Metadata Attributes
+//===----------------------------------------------------------------------===//
+//
+// These attributes model LLVM IR metadata nodes (llvm::Metadata and its
+// subclasses). They can be nested to form arbitrary metadata trees and are
+// translated to their LLVM IR counterparts during MLIR-to-LLVM-IR conversion.
+
+def LLVM_MDStringAttr : LLVM_Attr<"MDString", "md_string"> {
+ let summary = "LLVM metadata string";
+ let description = [{
+ Wraps a string as an LLVM metadata node, corresponding to
+ `llvm::MDString` in LLVM IR.
+
+ Example:
+ ```mlir
+ #llvm.md_string<"foo.buffer">
+ ```
+ }];
+ let parameters = (ins "StringAttr":$value);
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def LLVM_MDConstantAttr : LLVM_Attr<"MDConstant", "md_const"> {
+ let summary = "LLVM constant-as-metadata";
+ let description = [{
+ Wraps an attribute as an LLVM metadata node, corresponding to
+ `llvm::ConstantAsMetadata` wrapping a `llvm::Constant*` in LLVM IR.
+ Currently, only integers/IntegerAttrs are supported.
+
+ Example:
+ ```mlir
+ #llvm.md_const<42 : i32>
+ ```
+ }];
+ let parameters = (ins "Attribute":$value);
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def LLVM_MDFuncAttr : LLVM_Attr<"MDFunc", "md_func"> {
+ let summary = "LLVM function-as-metadata";
+ let description = [{
+ References a function (or global) symbol as LLVM metadata, corresponding
+ to `llvm::ValueAsMetadata::get(function)` in LLVM IR.
+
+ Example:
+ ```mlir
+ #llvm.md_func<@my_kernel>
+ ```
+ }];
+ let parameters = (ins "FlatSymbolRefAttr":$name);
+ let assemblyFormat = "`<` $name `>`";
+}
+
+def LLVM_MDNodeAttr : LLVM_Attr<"MDNode", "md_node"> {
+ let summary = "LLVM metadata node";
+ let description = [{
+ Represents an LLVM metadata node. The operands
+ can be any combination of metadata attributes: `#llvm.md_string`,
+ `#llvm.md_const`, `#llvm.md_func`, or nested `#llvm.md_node`.
+
+ Example:
+ ```mlir
+ #llvm.md_node<#llvm.md_const<0 : i32>, #llvm.md_string<"foo.buffer">>
+ #llvm.md_node<>
+ ```
+ }];
+ let parameters = (ins OptionalArrayRefParameter<"Attribute">:$operands);
+ let assemblyFormat = "`<` (`>`) : ($operands^ `>`)?";
+}
+
#endif // LLVMIR_ATTRDEFS
diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
index e781d4c876315..75c47f087f78e 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td
@@ -2576,4 +2576,53 @@ def LLVM_ModuleFlagsOp
let hasVerifier = 1;
}
+//===--------------------------------------------------------------------===//
+// NamedMetadataOp
+//===--------------------------------------------------------------------===//
+
+def LLVM_NamedMetadataOp
+ : LLVM_Op<"named_metadata"> {
+ let summary = "Module-level named metadata";
+ let description = [{
+ Represents an LLVM named metadata node (`llvm::NamedMDNode`). Named
+ metadata nodes are module-level metadata that associate a name string
+ with a list of metadata nodes. Each operand must be an `#llvm.md_node`.
+
+ Note: cyclic metadata graphs are not supported. Because metadata attributes
+ are represented as MLIR attributes (which form a tree), there is no way to
+ express a metadata node that directly or transitively references itself.
+ LLVM IR permits such cycles (e.g. `!0 = !{!0}`), but they cannot be
+ represented here and will not round-trip through this op.
+
+ Example:
+ ```mlir
+ llvm.named_metadata "foo.version" [
+ #llvm.md_node<#llvm.md_const<2 : i32>,
+ #llvm.md_const<9 : i32>,
+ #llvm.md_const<0 : i32>
+ >
+ ]
+ llvm.named_metadata "foo.kernel" [
+ #llvm.md_node<
+ #llvm.md_func<@my_kernel>,
+ #llvm.md_node<>,
+ #llvm.md_node<
+ #llvm.md_node<#llvm.md_const<0 : i32>,
+ #llvm.md_string<"foo.buffer">
+ >
+ >
+ >
+ ]
+ ```
+ }];
+ let arguments = (ins StrAttr:$metadata_name, ArrayAttr:$nodes);
+ let assemblyFormat = [{
+ $metadata_name $nodes attr-dict
+ }];
+
+ let llvmBuilder = [{
+ convertNamedMetadataOp($metadata_name, $nodes, builder, moduleTranslation);
+ }];
+}
+
#endif // LLVMIR_OPS
diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
index 26638b2a644c4..b873f260e7d92 100644
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -169,14 +169,15 @@ def LinalgInlineScalarOperandsPass : Pass<"linalg-inline-scalar-operands"> {
}
def LinalgFoldIntoElementwisePass : Pass<"linalg-fold-into-elementwise"> {
- let summary = "Fold transpose ops into elementwise";
+ let summary = "Fold transpose and broadcast ops into elementwise";
let dependentDialects = ["linalg::LinalgDialect"];
let description = [{
- Fold transpose ops that feed `linalg.elementwise` into the elementwise op
- by updating its indexing maps. `linalg.transpose` producers whose consumer
- indexing map is the identity are absorbed, turning the permutation into
- the elementwise map itself. Other operands remain untouched.
+ Fold transpose or broadcast ops that feed a `linalg.elementwise` into the
+ elementwise op. `linalg.transpose` and `linalg.broadcast` producers whose
+ consumer indexing map is a projected permutation can be absorbed into the
+ indexing map of the `linalg.elementwise` by composing the producer's map
+ into the elementwise op's indexing map. Other operands remain untouched.
}];
}
diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 12e228bcaeefa..cd842fb1c5392 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -310,9 +310,13 @@ def DecomposeOp : Op<Transform_Dialect, "structured.decompose",
TransformEachOpTrait,
ReportTrackingListenerFailuresOpTrait]> {
let description = [{
- Decomposes named complex operations, such as higher-dimensional
- (depthwise) convolutions, into combinations of lower-dimensional equivalents
- when possible.
+ Decomposes higher-dimensional convolution ops into lower-dimensional
+ equivalents when possible. This operates on both named ops and equivalent
+ `linalg.generic` ops that have convolution-like structure (as determined
+ by `inferConvolutionDims`).
+
+ The transformation always attempts to specialize the result back to a named
+ op when possible.
#### Return modes
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index dcb7f1f212207..486ef75b76859 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -1640,63 +1640,22 @@ decomposeWinogradOutputTransformOp(RewriterBase &rewriter,
FailureOr<linalg::GenericOp> deduplicateOperandsAndRemoveDeadResults(
RewriterBase &rewriter, linalg::GenericOp genericOp, bool removeOutputs);
+/// Rewrite convolution/pooling/depthwise ops with size-1 window dimensions
+/// into lower-dimensional ops. Uses `inferConvolutionDims` to work with any
+/// layout and handles both named ops and equivalent linalg.generic ops
+/// uniformly. The result is specialized back to a named op if the input was a
+/// named op.
+/// TODO: Support n-D to (n-1)-D downscaling. Currently it only supports 2D->1D
+/// downscaling.
+FailureOr<LinalgOp> downscaleSizeOneWindowedConvolution(RewriterBase &rewriter,
+ LinalgOp op);
+
//===----------------------------------------------------------------------===//
// Rewrite patterns wrapping transformations.
// TODO: every single such pattern should be a close to noop wrapper around a
// functional-stye API call.
//===----------------------------------------------------------------------===//
-/// Rewrites 2-D convolution ops with size-1 window dimensions into 1-D
-/// convolution ops. Works with both named ops and equivalent generic ops.
-template <typename Conv2DOp, typename Conv1DOp>
-struct DownscaleSizeOneWindowed2DConvolution final
- : public OpInterfaceRewritePattern<LinalgOp> {
- using OpInterfaceRewritePattern<LinalgOp>::OpInterfaceRewritePattern;
-
- FailureOr<Conv1DOp> returningMatchAndRewrite(LinalgOp convOp,
- PatternRewriter &rewriter) const;
-
- LogicalResult matchAndRewrite(LinalgOp convOp,
- PatternRewriter &rewriter) const override {
- return returningMatchAndRewrite(convOp, rewriter);
- }
-};
-
-extern template struct DownscaleSizeOneWindowed2DConvolution<Conv2DNhwcHwcfOp,
- Conv1DNwcWcfOp>;
-extern template struct DownscaleSizeOneWindowed2DConvolution<Conv2DNchwFchwOp,
- Conv1DNcwFcwOp>;
-
-/// Rewrites 2-D depthwise convolution ops with size-1 (w, kw) or (h, kh)
-/// dimensions into 1-D depthwise convolution ops.
-struct DownscaleDepthwiseConv2DNhwcHwcOp final
- : public OpInterfaceRewritePattern<LinalgOp> {
- DownscaleDepthwiseConv2DNhwcHwcOp(MLIRContext *context,
- PatternBenefit benefit = 1)
- : OpInterfaceRewritePattern<LinalgOp>(context, benefit) {}
-
- FailureOr<DepthwiseConv1DNwcWcOp>
- returningMatchAndRewrite(LinalgOp convOp, PatternRewriter &rewriter) const;
-
- LogicalResult matchAndRewrite(LinalgOp convOp,
- PatternRewriter &rewriter) const override {
- return returningMatchAndRewrite(convOp, rewriter);
- }
-};
-
-struct DownscaleConv2DOp final : public OpInterfaceRewritePattern<LinalgOp> {
- DownscaleConv2DOp(MLIRContext *context, PatternBenefit benefit = 1)
- : OpInterfaceRewritePattern<LinalgOp>(context, benefit) {}
-
- FailureOr<Conv1DOp> returningMatchAndRewrite(LinalgOp convOp,
- PatternRewriter &rewriter) const;
-
- LogicalResult matchAndRewrite(LinalgOp convOp,
- PatternRewriter &rewriter) const override {
- return returningMatchAndRewrite(convOp, rewriter);
- }
-};
-
///
/// Linalg generalization pattern.
///
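For reference, a pattern equivalent to the removed `DownscaleSizeOneWindowed2DConvolution`/`DownscaleConv2DOp` wrappers can be rebuilt on top of the new functional entry point; a minimal sketch assuming the signature declared above (illustrative, not part of this patch):
```cpp
// Thin pattern wrapper around the functional API declared above, mirroring
// the removed per-op rewrite patterns.
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
#include "mlir/IR/PatternMatch.h"

using namespace mlir;
using namespace mlir::linalg;

struct DownscaleSizeOneWindowedConvPattern final
    : public OpInterfaceRewritePattern<LinalgOp> {
  using OpInterfaceRewritePattern<LinalgOp>::OpInterfaceRewritePattern;

  LogicalResult matchAndRewrite(LinalgOp convOp,
                                PatternRewriter &rewriter) const override {
    // downscaleSizeOneWindowedConvolution performs the rewrite (including
    // replacing convOp) and returns the lower-dimensional op on success.
    return downscaleSizeOneWindowedConvolution(rewriter, convOp);
  }
};
```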
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
index 25e3dbd29043d..c8030b14e96c1 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
@@ -2354,7 +2354,8 @@ def OpenACC_DataOp
def OpenACC_TerminatorOp
: OpenACC_Op<"terminator", [Pure, Terminator,
DeclareOpInterfaceMethods<
- RegionBranchTerminatorOpInterface>]> {
+ RegionBranchTerminatorOpInterface,
+ ["getMutableSuccessorOperands"]>]> {
let summary = "Generic terminator for OpenACC regions";
let description = [{
diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
index a08cf3c95e6ce..6ed85c611983a 100644
--- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
@@ -54,7 +54,7 @@ class SCF_Op<string mnemonic, list<Trait> traits = []> :
def ConditionOp : SCF_Op<"condition", [
HasParent<"WhileOp">,
DeclareOpInterfaceMethods<RegionBranchTerminatorOpInterface,
- ["getSuccessorRegions"]>,
+ ["getSuccessorRegions", "getMutableSuccessorOperands"]>,
Pure,
Terminator
]> {
@@ -904,7 +904,8 @@ def ParallelOp : SCF_Op<"parallel",
def ReduceOp : SCF_Op<"reduce", [
Terminator, HasParent<"ParallelOp">, RecursiveMemoryEffects,
- DeclareOpInterfaceMethods<RegionBranchTerminatorOpInterface>]> {
+ DeclareOpInterfaceMethods<RegionBranchTerminatorOpInterface,
+ ["getMutableSuccessorOperands"]>]> {
let summary = "reduce operation for scf.parallel";
let description = [{
The `scf.reduce` operation is the terminator for `scf.parallel` operations. It can model
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
index 1498fad2f08e0..9df17ed89b818 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOpBase.td
@@ -549,7 +549,7 @@ class Tosa_ElementwiseOp<string mnemonic, list<Trait> traits = []> :
ResultsBroadcastableShape,
TosaElementwiseOperator,
SameOperandsAndResultRank,
- Pure])> {}
+ NoMemoryEffect])> {}
class Tosa_ElementwiseUnaryOp<string mnemonic, list<Trait> traits = []> :
Tosa_ElementwiseOp<mnemonic, !listconcat(traits, [
@@ -557,16 +557,16 @@ class Tosa_ElementwiseUnaryOp<string mnemonic, list<Trait> traits = []> :
SameOperandsAndResultElementType])> {}
class Tosa_InferTensorTypeOp<string mnemonic, list<Trait> traits = []>
- : Tosa_Op<mnemonic, !listconcat(traits, [InferTensorTypeAdaptor, Pure])> {}
+ : Tosa_Op<mnemonic, !listconcat(traits, [InferTensorTypeAdaptor])> {}
class Tosa_InferShapedTypeOp<string mnemonic, list<Trait> traits = []>
- : Tosa_Op<mnemonic, !listconcat(traits, [InferShapedTypeOpAdaptor, Pure])> {}
+ : Tosa_Op<mnemonic, !listconcat(traits, [InferShapedTypeOpAdaptor])> {}
// The "SameVariadicOperandSize" trait allows us to pass optional arguments
// for multiple zero points in convolution ops.
class Tosa_ConvOp<string mnemonic, list<Trait> traits = []>
: Tosa_InferShapedTypeOp<mnemonic, !listconcat(traits,
- [SameVariadicOperandSize])> {
+ [SameVariadicOperandSize, NoMemoryEffect])> {
let assemblyFormat =
"operands attr-dict `:` functional-type(operands, results)";
}
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
index 0005b6f5a0c63..cab2bccfc27b3 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td
@@ -32,13 +32,16 @@ include "mlir/Dialect/Tosa/IR/TosaOpBase.td"
//===----------------------------------------------------------------------===//
// Operator: argmax
//===----------------------------------------------------------------------===//
-def Tosa_ArgMaxOp : Tosa_InferShapedTypeOp<"argmax"> {
+def Tosa_ArgMaxOp : Tosa_InferShapedTypeOp<"argmax", [NoMemoryEffect]> {
let summary = "Perform argmax on the input.";
let description = [{
This returns the index with the largest value across the given axis of the
input tensor. If multiple locations have equal values, returns the first
match along the search axis.
+
+ This operation is not pure. Undefined behaviour may occur if the max index
+ is out of bounds for the output data type.
}];
let arguments = (ins
@@ -70,7 +73,7 @@ def Tosa_AccType : AnyTypeOf<[I<32>, I<48>, F16, F32]>;
//===----------------------------------------------------------------------===//
// Operator: avg_pool2d
//===----------------------------------------------------------------------===//
-def Tosa_AvgPool2dOp : Tosa_InferShapedTypeOp<"avg_pool2d"> {
+def Tosa_AvgPool2dOp : Tosa_InferShapedTypeOp<"avg_pool2d", [NoMemoryEffect]> {
let summary = "Performs average pooling on the input.";
let description = [{
@@ -79,6 +82,9 @@ def Tosa_AvgPool2dOp : Tosa_InferShapedTypeOp<"avg_pool2d"> {
the mean value being placed in the output tensor. When calculating the
average, only the number of valid input tensor values, but not padding, are
used to calculate the divisor.
+
+ This operation is not pure. Undefined behaviour may occur if the accumulated
+ result overflows.
}];
let arguments = (ins
@@ -125,6 +131,9 @@ def Tosa_Conv2DOp : Tosa_ConvOp<"conv2d"> {
Performs a 2D convolution over the given tensor input, using the weight
tensor. Implementations may choose to skip calculation of multiplies in
the padding area.
+
+ This operation is not pure. Undefined behaviour may occur if the accumulated
+ result overflows.
}];
let arguments = (ins
@@ -166,13 +175,16 @@ def Tosa_Conv2DOp : Tosa_ConvOp<"conv2d"> {
//===----------------------------------------------------------------------===//
// Operator: conv2d_block_scaled
//===----------------------------------------------------------------------===//
-def Tosa_Conv2DBlockScaledOp : Tosa_InferShapedTypeOp<"conv2d_block_scaled"> {
+def Tosa_Conv2DBlockScaledOp : Tosa_InferShapedTypeOp<"conv2d_block_scaled", [NoMemoryEffect]> {
let summary = "Performs two dimensional convolution using block scaled tensors.";
let description = [{
Performs a 2D convolution over the given input data and scales, using
the weight data and scales. Implementations may choose to skip calculation
of multiplies in the padding area.
+
+ This operation is not pure. Undefined behaviour may occur if the accumulated
+ result overflows.
}];
let arguments = (ins
@@ -209,6 +221,9 @@ def Tosa_Conv3DOp : Tosa_ConvOp<"conv3d"> {
let description = [{
Performs a 3D convolution over the given input tensor. Implementations
may choose to skip calculation of multiplies in the padding area.
+
+ This operation is not pure. Undefined behaviour may occur if the accumulated
+ result overflows.
}];
let arguments = (ins
@@ -255,6 +270,9 @@ def Tosa_DepthwiseConv2DOp : Tosa_ConvOp<"depthwise_conv2d"> {
Performs 2D convolutions separately over each channel of the given tensor
input, using the weight tensor. Implementations may choose to skip
calculation of multiplies in the padding area.
+
+ This operation is not pure. Undefined behaviour may occur if the accumulated
+ result overflows.
}];
let arguments = (ins
@@ -299,7 +317,8 @@ def Tosa_DepthwiseConv2DOp : Tosa_ConvOp<"depthwise_conv2d"> {
def Tosa_FFT2dOp : Tosa_InferShapedTypeOp<"fft2d", [
SameOperandsAndResultElementType,
SameOperandsAndResultShape,
- ResultsAreFloatLike]> {
+ ResultsAreFloatLike,
+ Pure]> {
let summary = "Performs FFT2D operation on the input.";
let description = [{
@@ -348,11 +367,14 @@ def Tosa_FFT2dOp : Tosa_InferShapedTypeOp<"fft2d", [
//===----------------------------------------------------------------------===//
// Operator: matmul
//===----------------------------------------------------------------------===//
-def Tosa_MatMulOp : Tosa_InferShapedTypeOp<"matmul"> {
+def Tosa_MatMulOp : Tosa_InferShapedTypeOp<"matmul", [NoMemoryEffect]> {
let summary = "Matrix multiplication operator.";
let description = [{
Performs two dimensional matrix multiplications.
+
+ This operation is not pure. Undefined behaviour may occur if the accumulated
+ result overflows.
}];
let arguments = (ins
@@ -388,7 +410,7 @@ def Tosa_MatMulOp : Tosa_InferShapedTypeOp<"matmul"> {
//===----------------------------------------------------------------------===//
// Operator: matmul_t_block_scaled
//===----------------------------------------------------------------------===//
-def Tosa_MatmulTBlockScaledOp : Tosa_InferShapedTypeOp<"matmul_t_block_scaled"> {
+def Tosa_MatmulTBlockScaledOp : Tosa_InferShapedTypeOp<"matmul_t_block_scaled", [NoMemoryEffect]> {
let summary = "Performs two dimensional matrix multiplications using block scaled tensors.";
let description = [{
@@ -396,6 +418,9 @@ def Tosa_MatmulTBlockScaledOp : Tosa_InferShapedTypeOp<"matmul_t_block_scaled">
dimension is always the last dimension of the tensor, so the result is effectively
a matrix multiply of A by the transposed B matrix. If the N dimension of input B is of
size 1, the B matrix will be broadcast.
+
+ This operation is not pure. Undefined behaviour may occur if the accumulated
+ result overflows.
}];
let arguments = (ins
@@ -422,7 +447,7 @@ def Tosa_MatmulTBlockScaledOp : Tosa_InferShapedTypeOp<"matmul_t_block_scaled">
//===----------------------------------------------------------------------===//
// Operator: max_pool2d
//===----------------------------------------------------------------------===//
-def Tosa_MaxPool2dOp : Tosa_InferShapedTypeOp<"max_pool2d"> {
+def Tosa_MaxPool2dOp : Tosa_InferShapedTypeOp<"max_pool2d", [Pure]> {
let summary = "Performs max pooling on the input.";
let description = [{
@@ -460,7 +485,8 @@ def Tosa_MaxPool2dOp : Tosa_InferShapedTypeOp<"max_pool2d"> {
//===----------------------------------------------------------------------===//
def Tosa_RFFT2dOp : Tosa_InferShapedTypeOp<"rfft2d", [
SameOperandsAndResultElementType,
- ResultsAreFloatLike]> {
+ ResultsAreFloatLike,
+ Pure]> {
let summary = "Performs RFFT2D operation on the input.";
let description = [{
@@ -515,6 +541,9 @@ def Tosa_TransposeConv2DOp : Tosa_ConvOp<"transpose_conv2d"> {
Performs a 2D transposed convolution over the given tensor input, using the
weights tensor. Implementations may choose to skip calculation of multiplies
by zero at fractional input positions.
+
+ This operation is not pure. Undefined behaviour may occur if the accumulated
+ result overflows.
}];
let arguments = (ins
@@ -557,7 +586,7 @@ def Tosa_TransposeConv2DOp : Tosa_ConvOp<"transpose_conv2d"> {
//===----------------------------------------------------------------------===//
// Operator: clamp
//===----------------------------------------------------------------------===//
-def Tosa_ClampOp : Tosa_ElementwiseUnaryOp<"clamp"> {
+def Tosa_ClampOp : Tosa_ElementwiseUnaryOp<"clamp", [Pure]> {
let summary = "Computes clamp(features, min, max).";
let description = [{
@@ -592,7 +621,7 @@ def Tosa_ClampOp : Tosa_ElementwiseUnaryOp<"clamp"> {
//===----------------------------------------------------------------------===//
// Operator: erf
//===----------------------------------------------------------------------===//
-def Tosa_ErfOp : Tosa_ElementwiseUnaryOp<"erf"> {
+def Tosa_ErfOp : Tosa_ElementwiseUnaryOp<"erf", [Pure]> {
let summary = "Computes gauss error function of input.";
let description = [{
@@ -621,7 +650,7 @@ def Tosa_ErfOp : Tosa_ElementwiseUnaryOp<"erf"> {
//===----------------------------------------------------------------------===//
// Operator: sigmoid
//===----------------------------------------------------------------------===//
-def Tosa_SigmoidOp : Tosa_ElementwiseUnaryOp<"sigmoid"> {
+def Tosa_SigmoidOp : Tosa_ElementwiseUnaryOp<"sigmoid", [Pure]> {
let summary = "Computes elementwise sigmoid of input.";
let description = [{
@@ -653,7 +682,7 @@ def Tosa_SigmoidOp : Tosa_ElementwiseUnaryOp<"sigmoid"> {
//===----------------------------------------------------------------------===//
// Operator: tanh
//===----------------------------------------------------------------------===//
-def Tosa_TanhOp : Tosa_ElementwiseUnaryOp<"tanh"> {
+def Tosa_TanhOp : Tosa_ElementwiseUnaryOp<"tanh", [Pure]> {
let summary = "Computes elementwise hyperbolic tangent of input.";
let description = [{
@@ -697,6 +726,9 @@ def Tosa_AddOp : Tosa_ElementwiseOp<"add", [
Elementwise addition of input1 and input2. Axis of size 1 will be broadcast,
as necessary. Rank of input tensors must match.
+ This operation is not pure. Undefined behaviour may occur if the calculated
+ result overflows.
+
Example:
```mlir
@@ -738,6 +770,9 @@ def Tosa_ArithmeticRightShiftOp : Tosa_ElementwiseOp<"arithmetic_right_shift",
Elementwise arithmetic right shift of input1 by the amount specified in
input2. Axis of size 1 will be broadcast, as necessary. Rank of input tensors
must match.
+
+ This operation is not pure. Undefined behaviour may occur if the specified
+ shift is out of range.
}];
let arguments = (ins
@@ -763,7 +798,8 @@ def Tosa_ArithmeticRightShiftOp : Tosa_ElementwiseOp<"arithmetic_right_shift",
//===----------------------------------------------------------------------===//
def Tosa_BitwiseAndOp : Tosa_ElementwiseOp<"bitwise_and", [
Commutative,
- SameOperandsAndResultElementType]> {
+ SameOperandsAndResultElementType,
+ Pure]> {
let summary = "Bitwise AND operator.";
let description = [{
@@ -793,7 +829,8 @@ def Tosa_BitwiseAndOp : Tosa_ElementwiseOp<"bitwise_and", [
//===----------------------------------------------------------------------===//
def Tosa_BitwiseOrOp : Tosa_ElementwiseOp<"bitwise_or", [
Commutative,
- SameOperandsAndResultElementType]> {
+ SameOperandsAndResultElementType,
+ Pure]> {
let summary = "Bitwise OR operator.";
let description = [{
@@ -823,7 +860,8 @@ def Tosa_BitwiseOrOp : Tosa_ElementwiseOp<"bitwise_or", [
//===----------------------------------------------------------------------===//
def Tosa_BitwiseXorOp : Tosa_ElementwiseOp<"bitwise_xor", [
Commutative,
- SameOperandsAndResultElementType]> {
+ SameOperandsAndResultElementType,
+ Pure]> {
let summary = "Bitwise XOR operator.";
let description = [{
@@ -860,6 +898,8 @@ def Tosa_IntDivOp : Tosa_ElementwiseOp<"intdiv", [SameOperandsAndResultElementTy
divide is truncated towards zero. Expected use is for operations on
non-scaled integers. Floating point divide should use RECIPROCAL and MUL.
Quantized integer divide should use TABLE (for 1/x) and MUL.
+
+ This operation is not pure. Undefined behaviour may occur on division by zero.
}];
let arguments = (ins
@@ -886,7 +926,8 @@ def Tosa_IntDivOp : Tosa_ElementwiseOp<"intdiv", [SameOperandsAndResultElementTy
//===----------------------------------------------------------------------===//
def Tosa_LogicalAndOp : Tosa_ElementwiseOp<"logical_and", [
Commutative,
- SameOperandsAndResultElementType]> {
+ SameOperandsAndResultElementType,
+ Pure]> {
let summary = "Returns the truth value of input1 AND input2 element-wise.";
let description = [{
@@ -922,6 +963,9 @@ def Tosa_LogicalLeftShiftOp : Tosa_ElementwiseOp<"logical_left_shift",
Elementwise logical left-shift of input1 by the amount specified in input2.
Axis of size 1 will be broadcast, as necessary.
Rank of input tensors must match.
+
+ This operation is not pure. Undefined behaviour may occur if the specified
+ shift is out of range.
}];
let arguments = (ins
@@ -952,6 +996,9 @@ def Tosa_LogicalRightShiftOp : Tosa_ElementwiseOp<"logical_right_shift",
Elementwise logical right shift of input1 by the amount specified in input2.
Axis of size 1 will be broadcast, as necessary. Rank of input tensors must
match.
+
+ This operation is not pure. Undefined behaviour may occur if the specified
+ shift is out of range.
}];
let arguments = (ins
@@ -976,7 +1023,8 @@ def Tosa_LogicalRightShiftOp : Tosa_ElementwiseOp<"logical_right_shift",
//===----------------------------------------------------------------------===//
def Tosa_LogicalOrOp : Tosa_ElementwiseOp<"logical_or", [
Commutative,
- SameOperandsAndResultElementType]> {
+ SameOperandsAndResultElementType,
+ Pure]> {
let summary = "Returns the truth value of x OR y element-wise.";
let description = [{
@@ -1006,7 +1054,8 @@ def Tosa_LogicalOrOp : Tosa_ElementwiseOp<"logical_or", [
//===----------------------------------------------------------------------===//
def Tosa_LogicalXorOp : Tosa_ElementwiseOp<"logical_xor", [
Commutative,
- SameOperandsAndResultElementType]> {
+ SameOperandsAndResultElementType,
+ Pure]> {
let summary = "Returns the truth value of input1 XOR input2 element-wise.";
let description = [{
@@ -1036,7 +1085,8 @@ def Tosa_LogicalXorOp : Tosa_ElementwiseOp<"logical_xor", [
//===----------------------------------------------------------------------===//
def Tosa_MaximumOp : Tosa_ElementwiseOp<"maximum", [
Commutative,
- SameOperandsAndResultElementType]> {
+ SameOperandsAndResultElementType,
+ Pure]> {
let summary = "Elementwise Maximum.";
let description = [{
@@ -1066,7 +1116,8 @@ def Tosa_MaximumOp : Tosa_ElementwiseOp<"maximum", [
//===----------------------------------------------------------------------===//
def Tosa_MinimumOp : Tosa_ElementwiseOp<"minimum", [
Commutative,
- SameOperandsAndResultElementType]> {
+ SameOperandsAndResultElementType,
+ Pure]> {
let summary = "Elementwise Minimum.";
let description = [{
@@ -1098,13 +1149,16 @@ def Tosa_MinimumOp : Tosa_ElementwiseOp<"minimum", [
def Tosa_MulOp : Tosa_Op<"mul", [
DeclareOpInterfaceMethods<InferShapedTypeOpInterface,
["inferReturnTypeComponents"]>,
- Pure]> {
+ NoMemoryEffect]> {
let summary = "Multiplication operator.";
let description = [{
Elementwise multiplication (Hadamard product) of input1 and input2.
Axis of size 1 will be broadcast, as necessary. Rank of input tensors must
match.
+
+ This operation is not pure. Undefined behaviour may occur if the specified
+ shift is out of range or the result overflows.
}];
let arguments = (ins
@@ -1140,6 +1194,9 @@ def Tosa_PowOp : Tosa_ElementwiseOp<"pow", [SameOperandsAndResultElementType]> {
Elementwise input1 value raised to the power of input2.
Axis of size 1 will be broadcast, as necessary. Rank of input tensors must
match.
+
+ This operation is not pure. Undefined behaviour may occur if the specified
+ exponent is negative.
}];
let arguments = (ins
@@ -1168,6 +1225,9 @@ def Tosa_SubOp : Tosa_ElementwiseOp<"sub", [SameOperandsAndResultElementType]> {
let description = [{
Elementwise subtraction of input1 and input2. Axis of size 1 will be
broadcast as necessary. Rank of input tensors must match.
+
+ This operation is not pure. Undefined behaviour may occur if the calculated
+ result underflows.
}];
let arguments = (ins
@@ -1192,7 +1252,7 @@ def Tosa_SubOp : Tosa_ElementwiseOp<"sub", [SameOperandsAndResultElementType]> {
//===----------------------------------------------------------------------===//
// Operator: table
//===----------------------------------------------------------------------===//
-def Tosa_TableOp : Tosa_InferShapedTypeOp<"table"> {
+def Tosa_TableOp : Tosa_InferShapedTypeOp<"table", [NoMemoryEffect]> {
let summary = "Table lookup operator.";
let description = [{
@@ -1210,6 +1270,9 @@ def Tosa_TableOp : Tosa_InferShapedTypeOp<"table"> {
* Use the TABLE operator to produce a fixed point 16.7 interpolated result
* Use RESCALE (in_t=int32_t, out_t=int16_t, scale=1<<14, shift=21) to
scale the output to int16_t range (or alternate scale as required)
+
+ This operation is not pure. Undefined behaviour may occur if the calculated
+ slope is out of range.
}];
let arguments = (ins
@@ -1243,12 +1306,15 @@ def Tosa_TableOp : Tosa_InferShapedTypeOp<"table"> {
//===----------------------------------------------------------------------===//
// Operator: abs
//===----------------------------------------------------------------------===//
-def Tosa_AbsOp : Tosa_ElementwiseUnaryOp<"abs"> {
+def Tosa_AbsOp : Tosa_ElementwiseUnaryOp<"abs", [NoMemoryEffect]> {
let summary = "Elementwise abs operator.";
let description = [{
Elementwise absolute value operation.
+ This operation is not pure. Undefined behaviour may occur if the
+ calculated result underflows.
+
Example:
```mlir
@@ -1277,7 +1343,7 @@ def Tosa_AbsOp : Tosa_ElementwiseUnaryOp<"abs"> {
//===----------------------------------------------------------------------===//
// Operator: bitwise_not
//===----------------------------------------------------------------------===//
-def Tosa_BitwiseNotOp : Tosa_ElementwiseUnaryOp<"bitwise_not"> {
+def Tosa_BitwiseNotOp : Tosa_ElementwiseUnaryOp<"bitwise_not", [Pure]> {
let summary = "Bitwise NOT operator.";
let description = [{
@@ -1303,7 +1369,7 @@ def Tosa_BitwiseNotOp : Tosa_ElementwiseUnaryOp<"bitwise_not"> {
//===----------------------------------------------------------------------===//
// Operator: ceil
//===----------------------------------------------------------------------===//
-def Tosa_CeilOp : Tosa_ElementwiseUnaryOp<"ceil"> {
+def Tosa_CeilOp : Tosa_ElementwiseUnaryOp<"ceil", [Pure]> {
let summary = "Elementwise ceil operator.";
let description = [{
@@ -1329,7 +1395,7 @@ def Tosa_CeilOp : Tosa_ElementwiseUnaryOp<"ceil"> {
//===----------------------------------------------------------------------===//
// Operator: clz
//===----------------------------------------------------------------------===//
-def Tosa_ClzOp : Tosa_ElementwiseUnaryOp<"clz"> {
+def Tosa_ClzOp : Tosa_ElementwiseUnaryOp<"clz", [Pure]> {
let summary = "Elementwise count leading zero operator.";
let description = [{
@@ -1355,7 +1421,7 @@ def Tosa_ClzOp : Tosa_ElementwiseUnaryOp<"clz"> {
//===----------------------------------------------------------------------===//
// Operator: cos
//===----------------------------------------------------------------------===//
-def Tosa_CosOp : Tosa_ElementwiseUnaryOp<"cos"> {
+def Tosa_CosOp : Tosa_ElementwiseUnaryOp<"cos", [Pure]> {
let summary = "Elementwise cos operator.";
let description = [{
@@ -1381,7 +1447,7 @@ def Tosa_CosOp : Tosa_ElementwiseUnaryOp<"cos"> {
//===----------------------------------------------------------------------===//
// Operator: exp
//===----------------------------------------------------------------------===//
-def Tosa_ExpOp : Tosa_ElementwiseUnaryOp<"exp"> {
+def Tosa_ExpOp : Tosa_ElementwiseUnaryOp<"exp", [Pure]> {
let summary = "Elementwise exp operator.";
let description = [{
@@ -1407,7 +1473,7 @@ def Tosa_ExpOp : Tosa_ElementwiseUnaryOp<"exp"> {
//===----------------------------------------------------------------------===//
// Operator: floor
//===----------------------------------------------------------------------===//
-def Tosa_FloorOp : Tosa_ElementwiseUnaryOp<"floor"> {
+def Tosa_FloorOp : Tosa_ElementwiseUnaryOp<"floor", [Pure]> {
let summary = "Elementwise floor operator.";
let description = [{
@@ -1433,7 +1499,7 @@ def Tosa_FloorOp : Tosa_ElementwiseUnaryOp<"floor"> {
//===----------------------------------------------------------------------===//
// Operator: log
//===----------------------------------------------------------------------===//
-def Tosa_LogOp : Tosa_ElementwiseUnaryOp<"log"> {
+def Tosa_LogOp : Tosa_ElementwiseUnaryOp<"log", [Pure]> {
let summary = "Elementwise log operator.";
let description = [{
@@ -1459,7 +1525,7 @@ def Tosa_LogOp : Tosa_ElementwiseUnaryOp<"log"> {
//===----------------------------------------------------------------------===//
// Operator: logical_not
//===----------------------------------------------------------------------===//
-def Tosa_LogicalNotOp : Tosa_ElementwiseUnaryOp<"logical_not"> {
+def Tosa_LogicalNotOp : Tosa_ElementwiseUnaryOp<"logical_not", [Pure]> {
let summary = "Returns the truth value of NOT input1 element-wise.";
let description = [{
@@ -1487,11 +1553,14 @@ def Tosa_LogicalNotOp : Tosa_ElementwiseUnaryOp<"logical_not"> {
//===----------------------------------------------------------------------===//
def Tosa_NegateOp : Tosa_InferShapedTypeOp<"negate", [
TosaElementwiseOperator,
- Pure]> {
+ NoMemoryEffect]> {
let summary = "Elementwise negate operator.";
let description = [{
Elementwise negation operation.
+
+ This operation is not pure. Undefined behaviour may occur if the calculated
+ result underflows or overflows.
}];
let arguments = (ins
@@ -1528,7 +1597,7 @@ def Tosa_NegateOp : Tosa_InferShapedTypeOp<"negate", [
//===----------------------------------------------------------------------===//
// Operator: reciprocal
//===----------------------------------------------------------------------===//
-def Tosa_ReciprocalOp : Tosa_ElementwiseUnaryOp<"reciprocal"> {
+def Tosa_ReciprocalOp : Tosa_ElementwiseUnaryOp<"reciprocal", [Pure]> {
let summary = "Elementwise reciprocal operator.";
let description = [{
@@ -1566,7 +1635,7 @@ def Tosa_ReciprocalOp : Tosa_ElementwiseUnaryOp<"reciprocal"> {
//===----------------------------------------------------------------------===//
// Operator: rsqrt
//===----------------------------------------------------------------------===//
-def Tosa_RsqrtOp : Tosa_ElementwiseUnaryOp<"rsqrt"> {
+def Tosa_RsqrtOp : Tosa_ElementwiseUnaryOp<"rsqrt", [Pure]> {
let summary = "Elementwise 1/sqrt operator.";
let description = [{
@@ -1593,7 +1662,7 @@ def Tosa_RsqrtOp : Tosa_ElementwiseUnaryOp<"rsqrt"> {
//===----------------------------------------------------------------------===//
// Operator: sin
//===----------------------------------------------------------------------===//
-def Tosa_SinOp : Tosa_ElementwiseUnaryOp<"sin"> {
+def Tosa_SinOp : Tosa_ElementwiseUnaryOp<"sin", [Pure]> {
let summary = "Elementwise sin operator.";
let description = [{
@@ -1623,7 +1692,7 @@ def Tosa_SinOp : Tosa_ElementwiseUnaryOp<"sin"> {
//===----------------------------------------------------------------------===//
// Operator: select
//===----------------------------------------------------------------------===//
-def Tosa_SelectOp : Tosa_ElementwiseOp<"select"> {
+def Tosa_SelectOp : Tosa_ElementwiseOp<"select", [Pure]> {
let summary = "Elementwise select operator.";
let description = [{
@@ -1674,7 +1743,8 @@ def Tosa_SelectOp : Tosa_ElementwiseOp<"select"> {
def Tosa_EqualOp : Tosa_ElementwiseOp<"equal", [
InferTensorType,
Commutative,
- SameOperandsElementType]> {
+ SameOperandsElementType,
+ Pure]> {
let summary = "Returns the truth value of (input1 == input2) element-wise.";
let description = [{
@@ -1709,7 +1779,8 @@ def Tosa_EqualOp : Tosa_ElementwiseOp<"equal", [
//===----------------------------------------------------------------------===//
// Operator: greater
//===----------------------------------------------------------------------===//
-def Tosa_GreaterOp : Tosa_ElementwiseOp<"greater", [SameOperandsElementType]> {
+def Tosa_GreaterOp : Tosa_ElementwiseOp<"greater", [SameOperandsElementType,
+ Pure]> {
let summary = "Returns the truth value of (input1 > input2) element-wise.";
let description = [{
@@ -1739,7 +1810,7 @@ def Tosa_GreaterOp : Tosa_ElementwiseOp<"greater", [SameOperandsElementType]> {
// Operator: greater_equal
//===----------------------------------------------------------------------===//
def Tosa_GreaterEqualOp : Tosa_ElementwiseOp<"greater_equal",
- [SameOperandsElementType]> {
+ [SameOperandsElementType, Pure]> {
let summary = "Returns the truth value of (input1 >= input2) element-wise.";
let description = [{
@@ -1772,7 +1843,7 @@ def Tosa_GreaterEqualOp : Tosa_ElementwiseOp<"greater_equal",
//===----------------------------------------------------------------------===//
// Operator: reduce_all
//===----------------------------------------------------------------------===//
-def Tosa_ReduceAllOp : Tosa_InferTensorTypeOp<"reduce_all"> {
+def Tosa_ReduceAllOp : Tosa_InferTensorTypeOp<"reduce_all", [Pure]> {
let summary = "Reduce All operator.";
let description = [{
@@ -1813,7 +1884,7 @@ def Tosa_ReduceAllOp : Tosa_InferTensorTypeOp<"reduce_all"> {
//===----------------------------------------------------------------------===//
// Operator: reduce_any
//===----------------------------------------------------------------------===//
-def Tosa_ReduceAnyOp : Tosa_InferTensorTypeOp<"reduce_any"> {
+def Tosa_ReduceAnyOp : Tosa_InferTensorTypeOp<"reduce_any", [Pure]> {
let summary = "Reduce Any operator.";
let description = [{
@@ -1854,7 +1925,7 @@ def Tosa_ReduceAnyOp : Tosa_InferTensorTypeOp<"reduce_any"> {
//===----------------------------------------------------------------------===//
// Operator: reduce_max
//===----------------------------------------------------------------------===//
-def Tosa_ReduceMaxOp : Tosa_InferTensorTypeOp<"reduce_max"> {
+def Tosa_ReduceMaxOp : Tosa_InferTensorTypeOp<"reduce_max", [Pure]> {
let summary = "Reduce Max operator.";
let description = [{
@@ -1896,7 +1967,7 @@ def Tosa_ReduceMaxOp : Tosa_InferTensorTypeOp<"reduce_max"> {
//===----------------------------------------------------------------------===//
// Operator: reduce_min
//===----------------------------------------------------------------------===//
-def Tosa_ReduceMinOp : Tosa_InferTensorTypeOp<"reduce_min"> {
+def Tosa_ReduceMinOp : Tosa_InferTensorTypeOp<"reduce_min", [Pure]> {
let summary = "Reduce Min operator.";
let description = [{
@@ -1936,13 +2007,16 @@ def Tosa_ReduceMinOp : Tosa_InferTensorTypeOp<"reduce_min"> {
}
//===----------------------------------------------------------------------===//
-// Operator: reduce_prod
+// Operator: reduce_product
//===----------------------------------------------------------------------===//
-def Tosa_ReduceProductOp : Tosa_InferTensorTypeOp<"reduce_product"> {
+def Tosa_ReduceProductOp : Tosa_InferTensorTypeOp<"reduce_product", [NoMemoryEffect]> {
let summary = "Reduce Product operator.";
let description = [{
Reduce a tensor along the given axis by computing the product of the axis.
+
+ This operation is not pure. Undefined behaviour may occur if the accumulated
+ result overflows.
}];
let arguments = (ins
@@ -1979,11 +2053,14 @@ def Tosa_ReduceProductOp : Tosa_InferTensorTypeOp<"reduce_product"> {
//===----------------------------------------------------------------------===//
// Operator: reduce_sum
//===----------------------------------------------------------------------===//
-def Tosa_ReduceSumOp : Tosa_InferTensorTypeOp<"reduce_sum"> {
+def Tosa_ReduceSumOp : Tosa_InferTensorTypeOp<"reduce_sum", [NoMemoryEffect]> {
let summary = "Reduce Sum operator.";
let description = [{
Reduce a tensor along the given axis by computing the sum of the axis.
+
+ This operation is not pure. Undefined behaviour may occur if the accumulated
+ result overflows.
}];
let arguments = (ins
@@ -2024,7 +2101,7 @@ def Tosa_ReduceSumOp : Tosa_InferTensorTypeOp<"reduce_sum"> {
//===----------------------------------------------------------------------===//
// Operator: concat
//===----------------------------------------------------------------------===//
-def Tosa_ConcatOp : Tosa_InferTensorTypeOp<"concat"> {
+def Tosa_ConcatOp : Tosa_InferTensorTypeOp<"concat", [Pure]> {
let summary = "Concatenates tensors along one dimension.";
let description = [{
@@ -2062,7 +2139,7 @@ def Tosa_ConcatOp : Tosa_InferTensorTypeOp<"concat"> {
//===----------------------------------------------------------------------===//
// Operator: pad
//===----------------------------------------------------------------------===//
-def Tosa_PadOp : Tosa_InferShapedTypeOp<"pad"> {
+def Tosa_PadOp : Tosa_InferShapedTypeOp<"pad", [Pure]> {
let summary = "Pads a tensor with value specified.";
let description = [{
@@ -2114,7 +2191,7 @@ def Tosa_PadOp : Tosa_InferShapedTypeOp<"pad"> {
//===----------------------------------------------------------------------===//
// Operator: reshape
//===----------------------------------------------------------------------===//
-def Tosa_ReshapeOp : Tosa_InferTensorTypeOp<"reshape"> {
+def Tosa_ReshapeOp : Tosa_InferTensorTypeOp<"reshape", [Pure]> {
let summary = "Reshape operator.";
let description = [{
@@ -2186,7 +2263,7 @@ def Tosa_ReverseOp: Tosa_Op<"reverse", [
//===----------------------------------------------------------------------===//
// Operator: slice
//===----------------------------------------------------------------------===//
-def Tosa_SliceOp : Tosa_InferShapedTypeOp<"slice"> {
+def Tosa_SliceOp : Tosa_InferShapedTypeOp<"slice", [Pure]> {
let summary = "Slice operator.";
let description = [{
@@ -2221,7 +2298,7 @@ def Tosa_SliceOp : Tosa_InferShapedTypeOp<"slice"> {
//===----------------------------------------------------------------------===//
// Operator: tile
//===----------------------------------------------------------------------===//
-def Tosa_TileOp : Tosa_InferShapedTypeOp<"tile"> {
+def Tosa_TileOp : Tosa_InferShapedTypeOp<"tile", [Pure]> {
let summary = "Tile operator.";
let description = [{
@@ -2258,7 +2335,8 @@ def Tosa_TileOp : Tosa_InferShapedTypeOp<"tile"> {
def Tosa_TransposeOp : Tosa_InferShapedTypeOp<"transpose",
[DeclareOpInterfaceMethods<ReifyRankedShapedTypeOpInterface ,
["reifyResultShapes"]>,
- AllElementTypesMatch<["input1", "output"]>]> {
+ AllElementTypesMatch<["input1", "output"]>,
+ Pure]> {
let summary = "Transpose operator.";
let description = [{
@@ -2296,7 +2374,7 @@ def Tosa_TransposeOp : Tosa_InferShapedTypeOp<"transpose",
//===----------------------------------------------------------------------===//
// Operator: gather
//===----------------------------------------------------------------------===//
-def Tosa_GatherOp : Tosa_InferShapedTypeOp<"gather"> {
+def Tosa_GatherOp : Tosa_InferShapedTypeOp<"gather", [NoMemoryEffect]> {
let summary = "Gather operation.";
let description = [{
@@ -2304,6 +2382,9 @@ def Tosa_GatherOp : Tosa_InferShapedTypeOp<"gather"> {
values tensor based on the indices. N is the number of batches, W the number
of indices in each batch, K the range of each index and C the number data
channels for each index.
+
+ This operation is not pure. Undefined behaviour may occur if the specified
+ indices are out of range.
}];
let arguments = (ins
@@ -2329,7 +2410,7 @@ def Tosa_GatherOp : Tosa_InferShapedTypeOp<"gather"> {
//===----------------------------------------------------------------------===//
// Operator: scatter
//===----------------------------------------------------------------------===//
-def Tosa_ScatterOp : Tosa_InferShapedTypeOp<"scatter"> {
+def Tosa_ScatterOp : Tosa_InferShapedTypeOp<"scatter", [NoMemoryEffect]> {
let summary = "Scatter operation.";
let description = [{
@@ -2341,6 +2422,9 @@ def Tosa_ScatterOp : Tosa_InferShapedTypeOp<"scatter"> {
single SCATTER operation and so each output index occurs at most once. It
follows that K >= W. In use cases that require multiple updates to the same
output position, these must be decomposed into multiple SCATTER operations.
+
+ This operation is not pure. Undefined behaviour may occur if the specified
+ indices are out of range or duplicate indices are provided.
}];
let arguments = (ins
@@ -2371,7 +2455,7 @@ def Tosa_ScatterOp : Tosa_InferShapedTypeOp<"scatter"> {
//===----------------------------------------------------------------------===//
// Operator: resize
//===----------------------------------------------------------------------===//
-def Tosa_ResizeOp : Tosa_InferShapedTypeOp<"resize"> {
+def Tosa_ResizeOp : Tosa_InferShapedTypeOp<"resize", [Pure]> {
let summary = "Resize operation, supports various resize/upsample modes.";
let description = [{
@@ -2513,7 +2597,7 @@ def Tosa_CastOp: Tosa_Op<"cast", [Pure, SameOperandsAndResultShape,
//===----------------------------------------------------------------------===//
// Operator: cast_from_block_scaled
//===----------------------------------------------------------------------===//
-def Tosa_CastFromBlockScaledOp: Tosa_InferShapedTypeOp<"cast_from_block_scaled"> {
+def Tosa_CastFromBlockScaledOp: Tosa_InferShapedTypeOp<"cast_from_block_scaled", [Pure]> {
let summary = "Apply scales from a scale tensor to the values in a value tensor";
let description = [{
@@ -2544,7 +2628,7 @@ def Tosa_CastFromBlockScaledOp: Tosa_InferShapedTypeOp<"cast_from_block_scaled">
//===----------------------------------------------------------------------===//
// Operator: cast_to_block_scaled
//===----------------------------------------------------------------------===//
-def Tosa_CastToBlockScaledOp : Tosa_InferShapedTypeOp<"cast_to_block_scaled"> {
+def Tosa_CastToBlockScaledOp : Tosa_InferShapedTypeOp<"cast_to_block_scaled", [Pure]> {
let summary = "Calculate scale tensor values per block, output to separate scale and data tensors.";
let description = [{
@@ -2576,7 +2660,7 @@ def Tosa_CastToBlockScaledOp : Tosa_InferShapedTypeOp<"cast_to_block_scaled"> {
//===----------------------------------------------------------------------===//
// Operator: rescale
//===----------------------------------------------------------------------===//
-def Tosa_RescaleOp : Tosa_InferShapedTypeOp<"rescale"> {
+def Tosa_RescaleOp : Tosa_InferShapedTypeOp<"rescale", [NoMemoryEffect]> {
let summary = "Tosa rescale operator.";
let description = [{
@@ -2592,6 +2676,9 @@ def Tosa_RescaleOp : Tosa_InferShapedTypeOp<"rescale"> {
The shift and value range are limited to allow a variety of implementations. The limit
of 62 on shift allows the shift to be decomposed as two right shifts of 31.
+ This operation is not pure. Undefined behaviour may occur if the calculated
+ result underflows or overflows.
+
Supported rescalings:
* This table is showing the supported conversions from the TOSA Specification.
* The MLIR dialect here can be used to represent other conversions.
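
[Not part of the patch] A note on how the trait split above is consumed downstream: ops that cannot trigger undefined behaviour gain `Pure`, while the ops documented as "not pure" (intdiv, the shifts, mul, table, gather, scatter, rescale, the overflow-prone reductions) carry only `NoMemoryEffect`, so they stay eligible for CSE/DCE but are no longer treated as speculatable. A minimal sketch, assuming the standard helpers from `mlir/Interfaces/SideEffectInterfaces.h`; the function name is illustrative only:

```cpp
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"

using namespace mlir;

// Hypothetical helper: decides whether a transform may hoist `op` to a point
// where it might execute more often than in the original program.
static bool canSpeculativelyHoist(Operation *op) {
  // A Pure op (e.g. tosa.logical_and after this patch) passes both checks.
  // An op carrying only NoMemoryEffect (e.g. tosa.intdiv) passes the first
  // but not the second, because division by zero is undefined behaviour.
  return isMemoryEffectFree(op) && isSpeculatable(op);
  // Equivalently: return isPure(op);
}
```
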
diff --git a/mlir/include/mlir/IR/BuiltinDialectBytecode.td b/mlir/include/mlir/IR/BuiltinDialectBytecode.td
index 53a859e32d64b..64cc8a8ff5e20 100644
--- a/mlir/include/mlir/IR/BuiltinDialectBytecode.td
+++ b/mlir/include/mlir/IR/BuiltinDialectBytecode.td
@@ -296,6 +296,9 @@ def VectorType : DialectType<(type
Type:$elementType
)> {
let printerPredicate = "!$_val.isScalable()";
+ // Use getChecked to produce a null type (and emit a diagnostic) instead of
+ // asserting when the element type does not implement VectorElementTypeInterface.
+ let cBuilder = "VectorType::getChecked([&]() { return reader.emitError(\"invalid vector type\"); }, shape, elementType)";
}
def VectorTypeWithScalableDims : DialectType<(type
@@ -305,7 +308,9 @@ def VectorTypeWithScalableDims : DialectType<(type
)> {
let printerPredicate = "$_val.isScalable()";
// Note: order of serialization does not match order of builder.
- let cBuilder = "get<$_resultType>(context, shape, elementType, scalableDims)";
+ // Use getChecked to produce a null type (and emit a diagnostic) instead of
+ // asserting when the element type does not implement VectorElementTypeInterface.
+ let cBuilder = "VectorType::getChecked([&]() { return reader.emitError(\"invalid vector type\"); }, shape, elementType, scalableDims)";
}
}
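
[Not part of the patch] For reference, a minimal sketch of the `getChecked` contract the new `cBuilder` relies on: verification failure is routed through the supplied callback and a null type is returned, instead of the assert that plain `get` would hit. The helper name below is illustrative:

```cpp
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/MLIRContext.h"

using namespace mlir;

// Hypothetical helper mirroring the bytecode reader: returns a null VectorType
// (after emitting a diagnostic) when `elementType` is not a valid vector
// element type, rather than asserting.
static VectorType buildVectorOrNull(MLIRContext *ctx, ArrayRef<int64_t> shape,
                                    Type elementType) {
  auto emitErrorFn = [&]() {
    return mlir::emitError(UnknownLoc::get(ctx), "invalid vector type");
  };
  return VectorType::getChecked(emitErrorFn, shape, elementType);
}
```
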
diff --git a/mlir/include/mlir/IR/ExtensibleDialect.h b/mlir/include/mlir/IR/ExtensibleDialect.h
index e52306f79378a..dcbf6813506d5 100644
--- a/mlir/include/mlir/IR/ExtensibleDialect.h
+++ b/mlir/include/mlir/IR/ExtensibleDialect.h
@@ -566,7 +566,7 @@ class DynamicOpDefinition : public OperationName::Impl {
Attribute getPropertiesAsAttr(Operation *op) final { return {}; }
void copyProperties(OpaqueProperties lhs, OpaqueProperties rhs) final {}
bool compareProperties(OpaqueProperties, OpaqueProperties) final {
- return false;
+ return true;
}
llvm::hash_code hashProperties(OpaqueProperties prop) final { return {}; }
diff --git a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td
index 8975b1235a7e3..06fa724e05fab 100644
--- a/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td
+++ b/mlir/include/mlir/Interfaces/ControlFlowInterfaces.td
@@ -432,7 +432,11 @@ def RegionBranchTerminatorOpInterface :
passing them to the region successor indicated by `point`.
}],
"::mlir::MutableOperandRange", "getMutableSuccessorOperands",
- (ins "::mlir::RegionSuccessor":$point)
+ (ins "::mlir::RegionSuccessor":$point),
+ [{}],
+ /*defaultImplementation=*/[{
+ return ::mlir::MutableOperandRange($_op);
+ }]
>,
InterfaceMethod<[{
Returns the potential region successors that are branched to after this
@@ -632,17 +636,6 @@ def WeightedRegionBranchOpInterface
// Op is "return-like".
def ReturnLike : TraitList<[
DeclareOpInterfaceMethods<RegionBranchTerminatorOpInterface>,
- NativeOpTrait<
- /*name=*/"ReturnLike",
- /*traits=*/[],
- /*extraOpDeclaration=*/"",
- /*extraOpDefinition=*/[{
- ::mlir::MutableOperandRange $cppClass::getMutableSuccessorOperands(
- ::mlir::RegionSuccessor successor) {
- return ::mlir::MutableOperandRange(*this);
- }
- }]
- >
-]>;
+ NativeOpTrait</*name=*/"ReturnLike">]>;
#endif // MLIR_INTERFACES_CONTROLFLOWINTERFACES
diff --git a/mlir/lib/Analysis/Presburger/Matrix.cpp b/mlir/lib/Analysis/Presburger/Matrix.cpp
index 83a2c280c3d4e..6ea04543146b7 100644
--- a/mlir/lib/Analysis/Presburger/Matrix.cpp
+++ b/mlir/lib/Analysis/Presburger/Matrix.cpp
@@ -284,6 +284,24 @@ void Matrix<T>::moveColumns(unsigned srcPos, unsigned num, unsigned dstPos) {
}
}
+template <typename T>
+Matrix<T> Matrix<T>::postMultiply(const Matrix<T> &other) const {
+ assert(getNumColumns() == other.getNumRows());
+ unsigned n = getNumRows();
+ unsigned m = other.getNumRows();
+ unsigned p = other.getNumColumns();
+ Matrix<T> result(n, p);
+
+ for (unsigned i = 0; i < n; i++) {
+ for (unsigned j = 0; j < m; j++) {
+ for (unsigned k = 0; k < p; k++) {
+ result.at(i, k) += at(i, j) * other.at(j, k);
+ }
+ }
+ }
+ return result;
+}
+
template <typename T>
void Matrix<T>::addToRow(unsigned sourceRow, unsigned targetRow,
const T &scale) {
@@ -535,6 +553,151 @@ std::pair<IntMatrix, IntMatrix> IntMatrix::computeHermiteNormalForm() const {
return {h, u};
}
+// In the submatrix `mat(from:, from:)`, the function finds the position (row,
+// col) of the element with the smallest non-zero absolute value. When all elements
+// in the submatrix are zero, returns std::nullopt.
+static std::optional<std::pair<unsigned, unsigned>>
+findNonZeroMinInSubmatrix(const IntMatrix &mat, unsigned from) {
+ unsigned numRows = mat.getNumRows();
+ unsigned numCols = mat.getNumColumns();
+ unsigned minRow = from, minCol = from;
+
+ std::optional<DynamicAPInt> minVal;
+ for (unsigned r = from; r < numRows; r++) {
+ for (unsigned c = from; c < numCols; c++) {
+ DynamicAPInt val = llvm::abs(mat(r, c));
+ if (val == 0 || (minVal && val >= *minVal))
+ continue;
+
+ minVal = val;
+ minRow = r;
+ minCol = c;
+ }
+ }
+
+ if (!minVal)
+ return std::nullopt;
+
+ return std::make_pair(minRow, minCol);
+}
+
+// Finds the first row in submatrix `mat(from:, from:)` that contains an element
+// `d` such that `d` is not a multiple of `divisor`. When there is no such row,
+// returns std::nullopt.
+static std::optional<unsigned> findNonMultipleRow(const IntMatrix &mat,
+ unsigned from,
+ const DynamicAPInt &divisor) {
+ unsigned numRows = mat.getNumRows();
+ unsigned numCols = mat.getNumColumns();
+ for (unsigned row = from; row < numRows; ++row) {
+ for (unsigned col = from; col < numCols; ++col) {
+ if (mat(row, col) % divisor != 0)
+ return row;
+ }
+ }
+ return std::nullopt;
+}
+
+std::tuple<IntMatrix, IntMatrix, IntMatrix>
+IntMatrix::computeSmithNormalForm() const {
+ IntMatrix d = *this;
+ // We put D into diagonal form by applying row and columns operations to it.
+ // The matrix U records row operations applied in the process, and V records
+ // column operations.
+ IntMatrix u = IntMatrix::identity(d.getNumRows());
+ IntMatrix v = IntMatrix::identity(d.getNumColumns());
+
+ unsigned numRows = d.getNumRows();
+ unsigned numCols = d.getNumColumns();
+ for (unsigned i = 0, e = std::min(numRows, numCols); i < e; i++) {
+ // We first put D into diagonal form, and then ensure the divisibility
+ // condition. The latter step is better illustrated with an example:
+ //
+ // [6 0 ] ---(1)--> [6 10] ---(2)--> [2 0 ]
+ // [0 10] [0 10] [0 10]
+ //
+ // (1) adds the row containing the element that violates the divisibility
+ // constraint to row i, bringing that element into row i;
+ // (2) does an elimination of the column.
+ //
+ // There can be many elements that violate the constraint, hence the loop.
+ bool changed;
+ do {
+ changed = false;
+
+ // Find the entry in the submatrix d(i:, i:) with the smallest non-zero
+ // absolute value.
+ // The element is the pivot, and we record its current row and column.
+ auto pivotPos = findNonZeroMinInSubmatrix(d, i);
+ if (!pivotPos)
+ break;
+ auto [pvtRow, pvtCol] = *pivotPos;
+
+ // The remaining submatrix is zero.
+ if (d(pvtRow, pvtCol) == 0)
+ break;
+
+ // Bring the pivot to d(i, i). Record the operations in u and v, respectively.
+ if (pvtRow != i) {
+ d.swapRows(pvtRow, i);
+ u.swapRows(pvtRow, i);
+ }
+ if (pvtCol != i) {
+ d.swapColumns(pvtCol, i);
+ v.swapColumns(pvtCol, i);
+ }
+
+ // Ensure the pivot is positive.
+ if (d(i, i) < 0) {
+ d.negateRow(i);
+ u.negateRow(i);
+ }
+
+ DynamicAPInt pivot = d(i, i);
+
+ // Clear other entries in row i and column i with Euclid's algorithm.
+ for (unsigned r = i + 1; r < numRows; ++r) {
+ while (d(r, i) != 0) {
+ DynamicAPInt quotient = d(r, i) / d(i, i);
+ d.addToRow(i, r, -quotient);
+ u.addToRow(i, r, -quotient);
+
+ if (d(r, i) != 0) {
+ d.swapRows(r, i);
+ u.swapRows(r, i);
+ changed = true;
+ }
+ }
+ }
+ // Similar to the row operations above, this time operating on columns.
+ for (unsigned c = i + 1; c < numCols; ++c) {
+ while (d(i, c) != 0) {
+ DynamicAPInt quotient = d(i, c) / d(i, i);
+ d.addToColumn(i, c, -quotient);
+ v.addToColumn(i, c, -quotient);
+
+ if (d(i, c) != 0) {
+ d.swapColumns(c, i);
+ v.swapColumns(c, i);
+ changed = true;
+ }
+ }
+ }
+
+ if (auto row = findNonMultipleRow(d, i + 1, pivot)) {
+ // Add that row to row i. This brings the element that is not a multiple
+ // of the pivot into row i, creating a new value there that will be used
+ // to reduce the pivot size.
+ d.addToRow(*row, i, 1);
+ u.addToRow(*row, i, 1);
+ changed = true;
+ }
+ } while (changed);
+ }
+
+ return {u, d, v};
+}
+
DynamicAPInt IntMatrix::normalizeRow(unsigned row, unsigned cols) {
return normalizeRange(getRow(row).slice(0, cols));
}
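
[Not part of the patch] A usage sketch of the two new `IntMatrix` entry points. The relation `d == u * A * v` is an assumption drawn from how `u` records the row operations and `v` the column operations; the patch itself only guarantees the returned tuple order `{u, d, v}`:

```cpp
#include "mlir/Analysis/Presburger/Matrix.h"
#include <cassert>

using namespace mlir::presburger;

static void smithNormalFormExample() {
  // The 2x2 example from the comment above: A = diag(6, 10).
  IntMatrix a(2, 2);
  a(0, 0) = 6;
  a(1, 1) = 10;

  auto [u, d, v] = a.computeSmithNormalForm();

  // d is diagonal and each diagonal entry divides the next one;
  // for this input the expected result is diag(2, 30).
  assert(d(0, 1) == 0 && d(1, 0) == 0);
  assert(d(1, 1) % d(0, 0) == 0);

  // Assumed relation between the factors (see above): d == u * A * v.
  auto recomposed = u.postMultiply(a).postMultiply(v);
  (void)recomposed;
}
```
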
diff --git a/mlir/lib/Bindings/Python/DialectLLVM.cpp b/mlir/lib/Bindings/Python/DialectLLVM.cpp
index 5c79f515c49eb..7e4f24b556613 100644
--- a/mlir/lib/Bindings/Python/DialectLLVM.cpp
+++ b/mlir/lib/Bindings/Python/DialectLLVM.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include <string>
+#include <vector>
#include "mlir-c/Dialect/LLVM.h"
#include "mlir-c/IR.h"
@@ -222,10 +223,175 @@ struct PointerType : PyConcreteType<PointerType> {
}
};
+//===--------------------------------------------------------------------===//
+// FunctionType
+//===--------------------------------------------------------------------===//
+
+struct FunctionType : PyConcreteType<FunctionType> {
+ static constexpr IsAFunctionTy isaFunction = mlirTypeIsALLVMFunctionType;
+ static constexpr GetTypeIDFunctionTy getTypeIdFunction =
+ mlirLLVMFunctionTypeGetTypeID;
+ static constexpr const char *pyClassName = "FunctionType";
+ static inline const MlirStringRef name = mlirLLVMFunctionTypeGetName();
+ using Base::Base;
+
+ static void bindDerived(ClassTy &c) {
+ c.def_static(
+ "get",
+ [](PyType &resultType, const std::vector<PyType> &argumentTypes,
+ bool isVarArg) {
+ std::vector<MlirType> argTypes(argumentTypes.size());
+ std::copy(argumentTypes.begin(), argumentTypes.end(),
+ argTypes.begin());
+ return FunctionType(
+ resultType.getContext(),
+ mlirLLVMFunctionTypeGet(resultType, argTypes.size(),
+ argTypes.data(), isVarArg));
+ },
+ "result_type"_a, "argument_types"_a, nb::kw_only(),
+ "is_var_arg"_a = false);
+ c.def_prop_ro("return_type", [](const FunctionType &type) {
+ return mlirLLVMFunctionTypeGetReturnType(type);
+ });
+ c.def_prop_ro("num_inputs", [](const FunctionType &type) {
+ return mlirLLVMFunctionTypeGetNumInputs(type);
+ });
+ c.def_prop_ro("inputs", [](const FunctionType &type) {
+ nb::list inputs;
+ for (intptr_t i = 0, e = mlirLLVMFunctionTypeGetNumInputs(type); i < e;
+ ++i) {
+ inputs.append(mlirLLVMFunctionTypeGetInput(type, i));
+ }
+ return inputs;
+ });
+ c.def_prop_ro("is_var_arg", [](const FunctionType &type) {
+ return mlirLLVMFunctionTypeIsVarArg(type);
+ });
+ }
+};
+
+//===--------------------------------------------------------------------===//
+// Metadata Attributes
+//===--------------------------------------------------------------------===//
+
+struct MDStringAttr : PyConcreteAttribute<MDStringAttr> {
+ static constexpr IsAFunctionTy isaFunction = mlirLLVMAttrIsAMDStringAttr;
+ static constexpr GetTypeIDFunctionTy getTypeIdFunction =
+ mlirLLVMMDStringAttrGetTypeID;
+ static constexpr const char *pyClassName = "MDStringAttr";
+ using Base::Base;
+
+ static void bindDerived(ClassTy &c) {
+ c.def_static(
+ "get",
+ [](const std::string &value, DefaultingPyMlirContext context) {
+ return MDStringAttr(
+ context->getRef(),
+ mlirLLVMMDStringAttrGet(
+ context.get()->get(),
+ mlirStringRefCreate(value.data(), value.size())));
+ },
+ "value"_a, nb::kw_only(), "context"_a = nb::none());
+ c.def_prop_ro("value", [](const MDStringAttr &self) {
+ MlirStringRef ref = mlirLLVMMDStringAttrGetValue(self);
+ return nb::str(ref.data, ref.length);
+ });
+ }
+};
+
+struct MDConstantAttr : PyConcreteAttribute<MDConstantAttr> {
+ static constexpr IsAFunctionTy isaFunction = mlirLLVMAttrIsAMDConstantAttr;
+ static constexpr GetTypeIDFunctionTy getTypeIdFunction =
+ mlirLLVMMDConstantAttrGetTypeID;
+ static constexpr const char *pyClassName = "MDConstantAttr";
+ using Base::Base;
+
+ static void bindDerived(ClassTy &c) {
+ c.def_static(
+ "get",
+ [](PyAttribute &valueAttr, DefaultingPyMlirContext context) {
+ return MDConstantAttr(
+ context->getRef(),
+ mlirLLVMMDConstantAttrGet(context.get()->get(), valueAttr));
+ },
+ "value"_a, nb::kw_only(), "context"_a = nb::none());
+ c.def_prop_ro("value", [](const MDConstantAttr &self) {
+ return mlirLLVMMDConstantAttrGetValue(self);
+ });
+ }
+};
+
+struct MDFuncAttr : PyConcreteAttribute<MDFuncAttr> {
+ static constexpr IsAFunctionTy isaFunction = mlirLLVMAttrIsAMDFuncAttr;
+ static constexpr GetTypeIDFunctionTy getTypeIdFunction =
+ mlirLLVMMDFuncAttrGetTypeID;
+ static constexpr const char *pyClassName = "MDFuncAttr";
+ using Base::Base;
+
+ static void bindDerived(ClassTy &c) {
+ c.def_static(
+ "get",
+ [](const std::string &name, DefaultingPyMlirContext context) {
+ MlirAttribute symRef = mlirFlatSymbolRefAttrGet(
+ context.get()->get(),
+ mlirStringRefCreate(name.data(), name.size()));
+ return MDFuncAttr(
+ context->getRef(),
+ mlirLLVMMDFuncAttrGet(context.get()->get(), symRef));
+ },
+ "name"_a, nb::kw_only(), "context"_a = nb::none());
+ c.def_prop_ro("name", [](const MDFuncAttr &self) {
+ MlirAttribute symRef = mlirLLVMMDFuncAttrGetName(self);
+ MlirStringRef ref = mlirFlatSymbolRefAttrGetValue(symRef);
+ return nb::str(ref.data, ref.length);
+ });
+ }
+};
+
+struct MDNodeAttr : PyConcreteAttribute<MDNodeAttr> {
+ static constexpr IsAFunctionTy isaFunction = mlirLLVMAttrIsAMDNodeAttr;
+ static constexpr GetTypeIDFunctionTy getTypeIdFunction =
+ mlirLLVMMDNodeAttrGetTypeID;
+ static constexpr const char *pyClassName = "MDNodeAttr";
+ using Base::Base;
+
+ static void bindDerived(ClassTy &c) {
+ c.def_static(
+ "get",
+ [](const std::vector<PyAttribute> &operands,
+ DefaultingPyMlirContext context) {
+ std::vector<MlirAttribute> operands_(operands.size());
+ std::copy(operands.begin(), operands.end(), operands_.begin());
+ return MDNodeAttr(context->getRef(),
+ mlirLLVMMDNodeAttrGet(context.get()->get(),
+ operands_.size(),
+ operands_.data()));
+ },
+ "operands"_a, nb::kw_only(), "context"_a = nb::none());
+ c.def_prop_ro("num_operands", [](const MDNodeAttr &self) {
+ return mlirLLVMMDNodeAttrGetNumOperands(self);
+ });
+ c.def("__getitem__", [](const MDNodeAttr &self, intptr_t index) {
+ intptr_t n = mlirLLVMMDNodeAttrGetNumOperands(self);
+ if (index < 0 || index >= n)
+ throw nb::index_error("MDNodeAttr operand index out of range");
+ return mlirLLVMMDNodeAttrGetOperand(self, index);
+ });
+ c.def("__len__", [](const MDNodeAttr &self) {
+ return mlirLLVMMDNodeAttrGetNumOperands(self);
+ });
+ }
+};
+
static void populateDialectLLVMSubmodule(nanobind::module_ &m) {
StructType::bind(m);
ArrayType::bind(m);
PointerType::bind(m);
+ FunctionType::bind(m);
+ MDStringAttr::bind(m);
+ MDConstantAttr::bind(m);
+ MDFuncAttr::bind(m);
+ MDNodeAttr::bind(m);
m.def(
"translate_module_to_llvmir",
diff --git a/mlir/lib/Bindings/Python/Globals.cpp b/mlir/lib/Bindings/Python/Globals.cpp
index 411b8a6705f1c..82195acb9f4fb 100644
--- a/mlir/lib/Bindings/Python/Globals.cpp
+++ b/mlir/lib/Bindings/Python/Globals.cpp
@@ -130,10 +130,10 @@ void PyGlobals::registerValueCaster(MlirTypeID mlirTypeID,
}
void PyGlobals::registerDialectImpl(const std::string &dialectNamespace,
- nb::object pyClass) {
+ nb::object pyClass, bool replace) {
nb::ft_lock_guard lock(mutex);
nb::object &found = dialectClassMap[dialectNamespace];
- if (found) {
+ if (found && !replace) {
throw std::runtime_error(nanobind::detail::join(
"Dialect namespace '", dialectNamespace, "' is already registered."));
}
diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp
index 7eb59d61b0d57..3d07e364b5c98 100644
--- a/mlir/lib/Bindings/Python/IRCore.cpp
+++ b/mlir/lib/Bindings/Python/IRCore.cpp
@@ -2860,7 +2860,8 @@ void populateRoot(nb::module_ &m) {
},
"dialect_namespace"_a)
.def("_register_dialect_impl", &PyGlobals::registerDialectImpl,
- "dialect_namespace"_a, "dialect_class"_a,
+ "dialect_namespace"_a, "dialect_class"_a, nb::kw_only(),
+ "replace"_a = false,
"Testing hook for directly registering a dialect")
.def("_register_operation_impl", &PyGlobals::registerOperationImpl,
"operation_name"_a, "operation_class"_a, nb::kw_only(),
diff --git a/mlir/lib/CAPI/Dialect/LLVM.cpp b/mlir/lib/CAPI/Dialect/LLVM.cpp
index eb30f169e4289..154ebbc51e961 100644
--- a/mlir/lib/CAPI/Dialect/LLVM.cpp
+++ b/mlir/lib/CAPI/Dialect/LLVM.cpp
@@ -85,6 +85,14 @@ MlirStringRef mlirLLVMFunctionTypeGetName(void) {
return wrap(LLVMFunctionType::name);
}
+bool mlirTypeIsALLVMFunctionType(MlirType type) {
+ return isa<LLVM::LLVMFunctionType>(unwrap(type));
+}
+
+MlirTypeID mlirLLVMFunctionTypeGetTypeID(void) {
+ return wrap(LLVM::LLVMFunctionType::getTypeID());
+}
+
intptr_t mlirLLVMFunctionTypeGetNumInputs(MlirType type) {
return llvm::cast<LLVM::LLVMFunctionType>(unwrap(type)).getNumParams();
}
@@ -99,6 +107,10 @@ MlirType mlirLLVMFunctionTypeGetReturnType(MlirType type) {
return wrap(llvm::cast<LLVM::LLVMFunctionType>(unwrap(type)).getReturnType());
}
+bool mlirLLVMFunctionTypeIsVarArg(MlirType type) {
+ return llvm::cast<LLVM::LLVMFunctionType>(unwrap(type)).isVarArg();
+}
+
bool mlirTypeIsALLVMStructType(MlirType type) {
return isa<LLVM::LLVMStructType>(unwrap(type));
}
@@ -523,3 +535,82 @@ MlirAttribute mlirLLVMDIAnnotationAttrGet(MlirContext ctx, MlirAttribute name,
MlirStringRef mlirLLVMDIAnnotationAttrGetName(void) {
return wrap(DIAnnotationAttr::name);
}
+
+//===----------------------------------------------------------------------===//
+// Metadata Attributes
+//===----------------------------------------------------------------------===//
+
+MlirAttribute mlirLLVMMDStringAttrGet(MlirContext ctx, MlirStringRef value) {
+ return wrap(MDStringAttr::get(unwrap(ctx),
+ StringAttr::get(unwrap(ctx), unwrap(value))));
+}
+
+bool mlirLLVMAttrIsAMDStringAttr(MlirAttribute attr) {
+ return isa<MDStringAttr>(unwrap(attr));
+}
+
+MlirTypeID mlirLLVMMDStringAttrGetTypeID(void) {
+ return wrap(MDStringAttr::getTypeID());
+}
+
+MlirStringRef mlirLLVMMDStringAttrGetValue(MlirAttribute attr) {
+ return wrap(cast<MDStringAttr>(unwrap(attr)).getValue().getValue());
+}
+
+MlirAttribute mlirLLVMMDConstantAttrGet(MlirContext ctx,
+ MlirAttribute valueAttr) {
+ return wrap(MDConstantAttr::get(unwrap(ctx), unwrap(valueAttr)));
+}
+
+bool mlirLLVMAttrIsAMDConstantAttr(MlirAttribute attr) {
+ return isa<MDConstantAttr>(unwrap(attr));
+}
+
+MlirTypeID mlirLLVMMDConstantAttrGetTypeID(void) {
+ return wrap(MDConstantAttr::getTypeID());
+}
+
+MlirAttribute mlirLLVMMDConstantAttrGetValue(MlirAttribute attr) {
+ return wrap((Attribute)cast<MDConstantAttr>(unwrap(attr)).getValue());
+}
+
+MlirAttribute mlirLLVMMDFuncAttrGet(MlirContext ctx, MlirAttribute name) {
+ return wrap(
+ MDFuncAttr::get(unwrap(ctx), cast<FlatSymbolRefAttr>(unwrap(name))));
+}
+
+bool mlirLLVMAttrIsAMDFuncAttr(MlirAttribute attr) {
+ return isa<MDFuncAttr>(unwrap(attr));
+}
+
+MlirTypeID mlirLLVMMDFuncAttrGetTypeID(void) {
+ return wrap(MDFuncAttr::getTypeID());
+}
+
+MlirAttribute mlirLLVMMDFuncAttrGetName(MlirAttribute attr) {
+ return wrap((Attribute)cast<MDFuncAttr>(unwrap(attr)).getName());
+}
+
+MlirAttribute mlirLLVMMDNodeAttrGet(MlirContext ctx, intptr_t nOperands,
+ MlirAttribute const *operands) {
+ SmallVector<Attribute> attrStorage;
+ attrStorage.reserve(nOperands);
+ return wrap(MDNodeAttr::get(unwrap(ctx),
+ unwrapList(nOperands, operands, attrStorage)));
+}
+
+bool mlirLLVMAttrIsAMDNodeAttr(MlirAttribute attr) {
+ return isa<MDNodeAttr>(unwrap(attr));
+}
+
+MlirTypeID mlirLLVMMDNodeAttrGetTypeID(void) {
+ return wrap(MDNodeAttr::getTypeID());
+}
+
+intptr_t mlirLLVMMDNodeAttrGetNumOperands(MlirAttribute attr) {
+ return cast<MDNodeAttr>(unwrap(attr)).getOperands().size();
+}
+
+MlirAttribute mlirLLVMMDNodeAttrGetOperand(MlirAttribute attr, intptr_t index) {
+ return wrap(cast<MDNodeAttr>(unwrap(attr)).getOperands()[index]);
+}
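
[Not part of the patch] A hedged sketch of how the new metadata C-API entry points might be exercised from a standalone client, assuming the matching declarations are added to `mlir-c/Dialect/LLVM.h` alongside the definitions above:

```cpp
#include <cassert>

#include "mlir-c/Dialect/LLVM.h"
#include "mlir-c/IR.h"

static void mdNodeExample() {
  MlirContext ctx = mlirContextCreate();
  MlirDialectHandle llvmHandle = mlirGetDialectHandle__llvm__();
  mlirDialectHandleRegisterDialect(llvmHandle, ctx);
  mlirDialectHandleLoadDialect(llvmHandle, ctx);

  // A metadata node with a single metadata-string operand.
  MlirAttribute str = mlirLLVMMDStringAttrGet(
      ctx, mlirStringRefCreateFromCString("llvm.loop.unroll.disable"));
  MlirAttribute node = mlirLLVMMDNodeAttrGet(ctx, /*nOperands=*/1, &str);

  assert(mlirLLVMAttrIsAMDNodeAttr(node));
  assert(mlirLLVMMDNodeAttrGetNumOperands(node) == 1);
  assert(mlirLLVMAttrIsAMDStringAttr(mlirLLVMMDNodeAttrGetOperand(node, 0)));

  mlirContextDestroy(ctx);
}
```
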
diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
index 7fdc23adc8573..d90912f9f686f 100644
--- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
+++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp
@@ -154,6 +154,9 @@ void mlir::populateOpenMPToLLVMConversionPatterns(LLVMTypeConverter &converter,
// discarded on lowering to LLVM-IR from the OpenMP dialect.
converter.addConversion(
[&](omp::MapBoundsType type) -> Type { return type; });
+ converter.addConversion(
+ [&](omp::AffinityEntryType type) -> Type { return type; });
+ converter.addConversion([&](omp::IteratedType type) -> Type { return type; });
// Add conversions for all OpenMP operations.
addOpenMPOpConversions<
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index 9d81702581131..815909169c6b8 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -290,6 +290,7 @@ class VectorGatherOpConversion
MemRefType memRefType = dyn_cast<MemRefType>(gather.getBaseType());
assert(memRefType && "The base should be bufferized");
+ // TODO: Add support for strided MemRef.
if (failed(isMemRefTypeSupported(memRefType, *this->getTypeConverter())))
return rewriter.notifyMatchFailure(gather, "memref type not supported");
@@ -348,6 +349,7 @@ class VectorScatterOpConversion
auto memRefType = dyn_cast<MemRefType>(scatter.getBaseType());
assert(memRefType && "The base should be bufferized");
+ // TODO: Add support for strided MemRef.
if (failed(isMemRefTypeSupported(memRefType, *this->getTypeConverter())))
return rewriter.notifyMatchFailure(scatter, "memref type not supported");
diff --git a/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp b/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp
index d018cddeb8dc1..b6e101952676a 100644
--- a/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/EmulateUnsupportedFloats.cpp
@@ -132,25 +132,23 @@ void EmulateUnsupportedFloatsPass::runOnOperation() {
SmallVector<Type> sourceTypes;
Type targetType;
- std::optional<FloatType> maybeTargetType =
- arith::parseFloatType(ctx, targetTypeStr);
- if (!maybeTargetType) {
+ FloatType parsedTargetType = arith::parseFloatType(ctx, targetTypeStr);
+ if (!parsedTargetType) {
emitError(UnknownLoc::get(ctx), "could not map target type '" +
targetTypeStr +
"' to a known floating-point type");
return signalPassFailure();
}
- targetType = *maybeTargetType;
+ targetType = parsedTargetType;
for (StringRef sourceTypeStr : sourceTypeStrs) {
- std::optional<FloatType> maybeSourceType =
- arith::parseFloatType(ctx, sourceTypeStr);
- if (!maybeSourceType) {
+ FloatType sourceType = arith::parseFloatType(ctx, sourceTypeStr);
+ if (!sourceType) {
emitError(UnknownLoc::get(ctx), "could not map source type '" +
sourceTypeStr +
"' to a known floating-point type");
return signalPassFailure();
}
- sourceTypes.push_back(*maybeSourceType);
+ sourceTypes.push_back(sourceType);
}
if (sourceTypes.empty())
(void)emitOptionalWarning(
diff --git a/mlir/lib/Dialect/Arith/Utils/CMakeLists.txt b/mlir/lib/Dialect/Arith/Utils/CMakeLists.txt
index 07fa58b209b5e..b4760510fc96e 100644
--- a/mlir/lib/Dialect/Arith/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/Arith/Utils/CMakeLists.txt
@@ -6,6 +6,7 @@ add_mlir_dialect_library(MLIRArithUtils
LINK_LIBS PUBLIC
MLIRArithDialect
+ MLIRAsmParser
MLIRComplexDialect
MLIRDialect
MLIRDialectUtils
diff --git a/mlir/lib/Dialect/Arith/Utils/Utils.cpp b/mlir/lib/Dialect/Arith/Utils/Utils.cpp
index 122154566a74e..106d125de14b0 100644
--- a/mlir/lib/Dialect/Arith/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Arith/Utils/Utils.cpp
@@ -11,9 +11,11 @@
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/AsmParser/AsmParser.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Complex/IR/Complex.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/IR/Diagnostics.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallVectorExtras.h"
#include <numeric>
@@ -357,27 +359,15 @@ Value createProduct(OpBuilder &builder, Location loc, ArrayRef<Value> values,
});
}
-/// Map strings to float types.
-std::optional<FloatType> parseFloatType(MLIRContext *ctx, StringRef name) {
- Builder b(ctx);
- return llvm::StringSwitch<std::optional<FloatType>>(name)
- .Case("f4E2M1FN", b.getType<Float4E2M1FNType>())
- .Case("f6E2M3FN", b.getType<Float6E2M3FNType>())
- .Case("f6E3M2FN", b.getType<Float6E3M2FNType>())
- .Case("f8E5M2", b.getType<Float8E5M2Type>())
- .Case("f8E4M3", b.getType<Float8E4M3Type>())
- .Case("f8E4M3FN", b.getType<Float8E4M3FNType>())
- .Case("f8E5M2FNUZ", b.getType<Float8E5M2FNUZType>())
- .Case("f8E4M3FNUZ", b.getType<Float8E4M3FNUZType>())
- .Case("f8E3M4", b.getType<Float8E3M4Type>())
- .Case("f8E8M0FNU", b.getType<Float8E8M0FNUType>())
- .Case("bf16", b.getType<BFloat16Type>())
- .Case("f16", b.getType<Float16Type>())
- .Case("f32", b.getType<Float32Type>())
- .Case("f64", b.getType<Float64Type>())
- .Case("f80", b.getType<Float80Type>())
- .Case("f128", b.getType<Float128Type>())
- .Default(std::nullopt);
+FloatType parseFloatType(MLIRContext *ctx, StringRef name) {
+ // Parsing non-builtin types is unsafe because the respective dialect may not
+ // have been loaded.
+ if (!name.empty() && name.front() == '!')
+ return FloatType();
+
+ // Suppress diagnostics: callers handle invalid type strings themselves.
+ ScopedDiagnosticHandler handler(ctx, [](Diagnostic &) {});
+ return dyn_cast_or_null<FloatType>(mlir::parseType(name, ctx));
}
} // namespace mlir::arith
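
[Not part of the patch] Caller-facing summary of the new contract: `parseFloatType` now returns a null `FloatType` on failure instead of `std::nullopt`, which is what the updated EmulateUnsupportedFloats pass checks for. A minimal sketch; the helper name is illustrative:

```cpp
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/IR/MLIRContext.h"

using namespace mlir;

// Hypothetical helper: resolve a user-provided type string, treating any
// parse failure (or a non-builtin '!dialect.type' string) as "no type".
static FloatType lookupFloatType(MLIRContext *ctx, StringRef name) {
  FloatType ty = arith::parseFloatType(ctx, name);
  if (!ty) {
    // Both "f42" (unparseable) and "!foo.bar" (rejected up front because the
    // dialect may not be loaded) take this path; no diagnostic is emitted.
    return FloatType();
  }
  return ty;
}
```
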
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp
index e43ab54a048b9..3aaa38272935d 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/FuncBufferizableOpInterfaceImpl.cpp
@@ -101,12 +101,14 @@ static FuncOp getCalledFunction(CallOpInterface callOp,
/// Return the FuncOp called by `callOp`.
static FuncOp getCalledFunction(CallOpInterface callOp,
const AnalysisState &state) {
- auto &oneShotAnalysisState = static_cast<const OneShotAnalysisState &>(state);
-
- if (auto *funcAnalysisState =
- oneShotAnalysisState.getExtension<FuncAnalysisState>()) {
- // Use the cached symbol tables.
- return getCalledFunction(callOp, funcAnalysisState->symbolTables);
+ if (isa<OneShotAnalysisState>(state)) {
+ auto &oneShotAnalysisState =
+ static_cast<const OneShotAnalysisState &>(state);
+ if (auto *funcAnalysisState =
+ oneShotAnalysisState.getExtension<FuncAnalysisState>()) {
+ // Use the cached symbol tables.
+ return getCalledFunction(callOp, funcAnalysisState->symbolTables);
+ }
}
SymbolTableCollection symbolTables;
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 2680c7311924f..5d409f71847c6 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -409,83 +409,7 @@ LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
return op->emitError("expected '")
<< getContainerModuleAttrName() << "' attribute to be attached to '"
<< ModuleOp::getOperationName() << '\'';
-
- auto walkResult = module.walk([&module](LaunchFuncOp launchOp) -> WalkResult {
- // Ignore launches that are nested more or less deep than functions in the
- // module we are currently checking.
- if (!launchOp->getParentOp() ||
- launchOp->getParentOp()->getParentOp() != module)
- return success();
-
- // Ignore launch ops with missing attributes here. The errors will be
- // reported by the verifiers of those ops.
- if (!launchOp->getAttrOfType<SymbolRefAttr>(
- LaunchFuncOp::getKernelAttrName(launchOp->getName())))
- return success();
-
- // Check that `launch_func` refers to a well-formed GPU kernel container.
- StringAttr kernelContainerName = launchOp.getKernelModuleName();
- Operation *kernelContainer = module.lookupSymbol(kernelContainerName);
- if (!kernelContainer)
- return launchOp.emitOpError()
- << "kernel container '" << kernelContainerName.getValue()
- << "' is undefined";
-
- // If the container is a GPU binary op return success.
- if (isa<BinaryOp>(kernelContainer))
- return success();
-
- auto kernelModule = dyn_cast<GPUModuleOp>(kernelContainer);
- if (!kernelModule)
- return launchOp.emitOpError()
- << "kernel module '" << kernelContainerName.getValue()
- << "' is undefined";
-
- // Check that `launch_func` refers to a well-formed kernel function.
- Operation *kernelFunc = module.lookupSymbol(launchOp.getKernelAttr());
- if (!kernelFunc)
- return launchOp.emitOpError("kernel function '")
- << launchOp.getKernel() << "' is undefined";
- auto kernelConvertedFunction = dyn_cast<FunctionOpInterface>(kernelFunc);
- if (!kernelConvertedFunction) {
- InFlightDiagnostic diag = launchOp.emitOpError()
- << "referenced kernel '" << launchOp.getKernel()
- << "' is not a function";
- diag.attachNote(kernelFunc->getLoc()) << "see the kernel definition here";
- return diag;
- }
-
- if (!kernelFunc->getAttrOfType<mlir::UnitAttr>(
- GPUDialect::getKernelFuncAttrName()))
- return launchOp.emitOpError("kernel function is missing the '")
- << GPUDialect::getKernelFuncAttrName() << "' attribute";
-
- // TODO: If the kernel isn't a GPU function (which happens during separate
- // compilation), do not check type correspondence as it would require the
- // verifier to be aware of the type conversion.
- auto kernelGPUFunction = dyn_cast<gpu::GPUFuncOp>(kernelFunc);
- if (!kernelGPUFunction)
- return success();
-
- unsigned actualNumArguments = launchOp.getNumKernelOperands();
- unsigned expectedNumArguments = kernelGPUFunction.getNumArguments();
- if (expectedNumArguments != actualNumArguments)
- return launchOp.emitOpError("got ")
- << actualNumArguments << " kernel operands but expected "
- << expectedNumArguments;
-
- auto functionType = kernelGPUFunction.getFunctionType();
- for (unsigned i = 0; i < expectedNumArguments; ++i) {
- if (launchOp.getKernelOperand(i).getType() != functionType.getInput(i)) {
- return launchOp.emitOpError("type of function argument ")
- << i << " does not match";
- }
- }
-
- return success();
- });
-
- return walkResult.wasInterrupted() ? failure() : success();
+ return success();
}
/// Parses an optional list of async operands with an optional leading keyword.
@@ -1397,6 +1321,90 @@ LogicalResult LaunchFuncOp::verify() {
return success();
}
+LogicalResult
+LaunchFuncOp::verifySymbolUses(SymbolTableCollection &symbolTable) {
+ LaunchFuncOp launchOp = *this;
+ Operation *table = SymbolTable::getNearestSymbolTable(launchOp);
+ // GPU modules cannot be nested within each other, escape to resolve the name.
+ if (isa<GPUModuleOp>(table))
+ table = SymbolTable::getNearestSymbolTable(table->getParentOp());
+
+ // Ignore launches that are nested more or less deep than functions in the
+ // module we are currently checking.
+ if (!launchOp->getParentOp() ||
+ launchOp->getParentOp()->getParentOp() != table)
+ return success();
+
+ // Ignore launch ops with missing attributes here. The errors will be
+ // reported by the verifiers of those ops.
+ if (!launchOp->getAttrOfType<SymbolRefAttr>(
+ LaunchFuncOp::getKernelAttrName(launchOp->getName())))
+ return success();
+
+ // Check that `launch_func` refers to a well-formed GPU kernel container.
+ StringAttr kernelContainerName = launchOp.getKernelModuleName();
+ Operation *kernelContainer =
+ symbolTable.lookupNearestSymbolFrom(table, kernelContainerName);
+ if (!kernelContainer)
+ return launchOp.emitOpError()
+ << "kernel container '" << kernelContainerName.getValue()
+ << "' is undefined";
+
+ // If the container is a GPU binary op return success.
+ if (isa<BinaryOp>(kernelContainer))
+ return success();
+
+ auto kernelModule = dyn_cast<GPUModuleOp>(kernelContainer);
+ if (!kernelModule)
+ return launchOp.emitOpError()
+ << "kernel module '" << kernelContainerName.getValue()
+ << "' is undefined";
+
+ // Check that `launch_func` refers to a well-formed kernel function.
+ Operation *kernelFunc = symbolTable.lookupNearestSymbolFrom(
+ kernelModule, launchOp.getKernelName());
+ if (!kernelFunc)
+ return launchOp.emitOpError("kernel function '")
+ << launchOp.getKernel() << "' is undefined";
+ auto kernelConvertedFunction = dyn_cast<FunctionOpInterface>(kernelFunc);
+ if (!kernelConvertedFunction) {
+ InFlightDiagnostic diag = launchOp.emitOpError()
+ << "referenced kernel '" << launchOp.getKernel()
+ << "' is not a function";
+ diag.attachNote(kernelFunc->getLoc()) << "see the kernel definition here";
+ return diag;
+ }
+
+ if (!kernelFunc->getAttrOfType<mlir::UnitAttr>(
+ GPUDialect::getKernelFuncAttrName()))
+ return launchOp.emitOpError("kernel function is missing the '")
+ << GPUDialect::getKernelFuncAttrName() << "' attribute";
+
+ // TODO: If the kernel isn't a GPU function (which happens during separate
+ // compilation), do not check type correspondence as it would require the
+ // verifier to be aware of the type conversion.
+ auto kernelGPUFunction = dyn_cast<gpu::GPUFuncOp>(kernelFunc);
+ if (!kernelGPUFunction)
+ return success();
+
+ unsigned actualNumArguments = launchOp.getNumKernelOperands();
+ unsigned expectedNumArguments = kernelGPUFunction.getNumArguments();
+ if (expectedNumArguments != actualNumArguments)
+ return launchOp.emitOpError("got ")
+ << actualNumArguments << " kernel operands but expected "
+ << expectedNumArguments;
+
+ FunctionType functionType = kernelGPUFunction.getFunctionType();
+ for (unsigned i = 0; i < expectedNumArguments; ++i) {
+ if (launchOp.getKernelOperand(i).getType() != functionType.getInput(i)) {
+ return launchOp.emitOpError("type of function argument ")
+ << i << " does not match";
+ }
+ }
+
+ return success();
+}
+
static ParseResult
parseLaunchDimType(OpAsmParser &parser, Type &dimTy,
std::optional<OpAsmParser::UnresolvedOperand> clusterValue,
diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp
index c27749c36887d..1bddc286b3dd4 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp
@@ -251,8 +251,12 @@ static bool isaElemwiseSingleUnaryOrBinaryOpInterface(linalg::GenericOp op,
if (body->getOperations().size() != 2)
return false;
+ // The payload op must have one result and at least arity-many operands
+ // (otherwise not all inputs can be used). It can have additional operands
+ // from outside of the generic op (e.g. div(1, x) for linalg.reciprocal) or
+ // use an input more than once (e.g. mul(x, x) for linalg.square).
Operation *oper = &body->front();
- if (oper->getNumOperands() != arity || oper->getNumResults() != 1)
+ if (oper->getNumOperands() < arity || oper->getNumResults() != 1)
return false;
auto yieldOp = dyn_cast<linalg::YieldOp>(body->back());
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index d84408c024e25..d751488d186ad 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -505,32 +505,12 @@ transform::DecomposeOp::applyToOne(transform::TransformRewriter &rewriter,
LinalgOp target,
transform::ApplyToEachResultList &results,
transform::TransformState &state) {
-#define DOWNSCALE(trans) \
- { \
- FailureOr<LinalgOp> res = tryApply<trans>(target); \
- if (succeeded(res)) { \
- results.push_back(*res); \
- return DiagnosedSilenceableFailure::success(); \
- } \
- }
-
-#define DOWNSCALE_CALL(a, b) DownscaleSizeOneWindowed2DConvolution<a, b>
-#define DOWNSCALE_NORMAL(a, b) DOWNSCALE(DOWNSCALE_CALL(a, b))
-
- DOWNSCALE_NORMAL(Conv2DNhwcHwcfOp, Conv1DNwcWcfOp)
- DOWNSCALE_NORMAL(Conv2DNchwFchwOp, Conv1DNcwFcwOp)
- DOWNSCALE_NORMAL(PoolingNhwcSumOp, PoolingNwcSumOp)
- DOWNSCALE_NORMAL(PoolingNchwSumOp, PoolingNcwSumOp)
- DOWNSCALE_NORMAL(PoolingNhwcMaxOp, PoolingNwcMaxOp)
- DOWNSCALE_NORMAL(PoolingNhwcMaxUnsignedOp, PoolingNwcMaxUnsignedOp)
- DOWNSCALE_NORMAL(PoolingNhwcMinOp, PoolingNwcMinOp)
- DOWNSCALE_NORMAL(PoolingNhwcMinUnsignedOp, PoolingNwcMinUnsignedOp)
- DOWNSCALE_NORMAL(PoolingNchwMaxOp, PoolingNcwMaxOp)
- DOWNSCALE(DownscaleDepthwiseConv2DNhwcHwcOp)
- DOWNSCALE(DownscaleConv2DOp)
-#undef DOWNSCALE_NORMAL
-#undef DOWNSCALE_CALL
-#undef DOWNSCALE
+ FailureOr<linalg::LinalgOp> res =
+ downscaleSizeOneWindowedConvolution(rewriter, target);
+ if (succeeded(res)) {
+ results.push_back(*res);
+ return DiagnosedSilenceableFailure::success();
+ }
return emitDefaultSilenceableFailure(target);
}
diff --git a/mlir/lib/Dialect/Linalg/Transforms/FoldIntoElementwise.cpp b/mlir/lib/Dialect/Linalg/Transforms/FoldIntoElementwise.cpp
index b1c0c3b161b20..0be128c3b5e87 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/FoldIntoElementwise.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/FoldIntoElementwise.cpp
@@ -29,7 +29,27 @@ using namespace mlir::linalg;
#define DEBUG_TYPE "linalg-fold-into-elementwise"
namespace {
-struct FoldTransposePattern : public OpRewritePattern<ElementwiseOp> {
+template <typename ProducerOpTy>
+struct ElementwiseOpFolder {
+  // Helper to fold a producer op (transpose, broadcast, etc.) into the
+  // elementwise op. The producer is the op defining an elementwise operand;
+  // the consumer is the elementwise op itself.
+ static bool fold(OpOperand *elwiseOperand, AffineMap elwiseMap,
+ SmallVector<Value> &newIns,
+ SmallVector<AffineMap> &newMaps) {
+ auto producerOp = elwiseOperand->get().getDefiningOp<ProducerOpTy>();
+ if (!producerOp || !elwiseMap.isProjectedPermutation())
+ return false;
+ newIns.push_back(producerOp.getInput());
+ // push in the new composed affine map
+ newMaps.push_back(
+ producerOp.getMatchingIndexingMap(producerOp.getDpsInputOperand(0))
+ .compose(elwiseMap));
+ return true;
+ }
+};
+
+template <typename... ProducerOps>
+struct FoldIntoElementwisePattern : public OpRewritePattern<ElementwiseOp> {
using OpRewritePattern<ElementwiseOp>::OpRewritePattern;
LogicalResult matchAndRewrite(ElementwiseOp op,
@@ -38,20 +58,17 @@ struct FoldTransposePattern : public OpRewritePattern<ElementwiseOp> {
SmallVector<Value> newIns;
SmallVector<AffineMap> newMaps;
for (OpOperand *operand : op.getDpsInputOperands()) {
- AffineMap map = op.getMatchingIndexingMap(operand);
- auto transposeOp = operand->get().getDefiningOp<TransposeOp>();
-
- if (!map.isIdentity() || !transposeOp) {
+ AffineMap consumerMap = op.getMatchingIndexingMap(operand);
+ const bool folded = (ElementwiseOpFolder<ProducerOps>::fold(
+ operand, consumerMap, newIns, newMaps) ||
+ ...);
+ if (folded) {
+ changed = true;
+ } else {
// push in original operand and its map.
newIns.push_back(operand->get());
- newMaps.push_back(map);
- continue;
+ newMaps.push_back(consumerMap);
}
- newIns.push_back(transposeOp.getInput());
- // push in transposeOp's inverse permutation map.
- newMaps.push_back(transposeOp.getMatchingIndexingMap(
- transposeOp.getDpsInputOperand(0)));
- changed = true;
}
if (!changed)
return failure();
@@ -83,5 +100,6 @@ struct LinalgFoldIntoElementwisePass
void mlir::linalg::populateLinalgFoldIntoElementwisePatterns(
RewritePatternSet &patterns) {
- patterns.add<FoldTransposePattern>(patterns.getContext());
+ patterns.add<FoldIntoElementwisePattern<TransposeOp, BroadcastOp>>(
+ patterns.getContext());
}
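The `(ElementwiseOpFolder<ProducerOps>::fold(...) || ...)` line above is a
C++17 fold expression over the pattern's producer-op pack: folders are tried
left to right and the first one that succeeds short-circuits the rest. A
minimal standalone sketch of that dispatch (folder names are illustrative):

#include <iostream>

struct TransposeFolder { static bool fold(int kind) { return kind == 1; } };
struct BroadcastFolder { static bool fold(int kind) { return kind == 2; } };

// Tries each folder in order; stops at the first success, like
// FoldIntoElementwisePattern does per operand.
template <typename... Folders>
bool tryFoldAny(int operandKind) {
  return (Folders::fold(operandKind) || ...);
}

int main() {
  std::cout << tryFoldAny<TransposeFolder, BroadcastFolder>(2) << "\n"; // 1
  std::cout << tryFoldAny<TransposeFolder, BroadcastFolder>(3) << "\n"; // 0
  return 0;
}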
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
index ee9fc77961bab..60b18fb2e8d93 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Specialize.cpp
@@ -36,9 +36,9 @@ namespace mlir {
ValueRange{genericOp.getDpsInits()[0]}))
#define REPLACE_UNARY_OP(NEWOP) \
- (rewriter.replaceOpWithNewOp<NEWOP>(genericOp, \
- ValueRange{genericOp.getDpsInputs()[0]}, \
- ValueRange{genericOp.getDpsInits()[0]}))
+ static_cast<LinalgOp>(rewriter.replaceOpWithNewOp<NEWOP>( \
+ genericOp, ValueRange{genericOp.getDpsInputs()[0]}, \
+ ValueRange{genericOp.getDpsInits()[0]}))
using namespace mlir;
using namespace mlir::linalg;
@@ -508,10 +508,37 @@ FailureOr<LinalgOp> mlir::linalg::specializeGenericOp(
// Elementwise Unary
if (isaElemwiseSingleUnaryOpInterface(genericOp)) {
Operation *op = &genericOp.getBody()->front();
- if (isa<math::ExpOp>(op)) {
- LinalgOp namedOp = REPLACE_UNARY_OP(ExpOp);
- return namedOp;
+ if (isa<math::ExpOp>(op))
+ return REPLACE_UNARY_OP(ExpOp);
+ if (isa<math::LogOp>(op))
+ return REPLACE_UNARY_OP(LogOp);
+ if (isa<math::AbsFOp>(op))
+ return REPLACE_UNARY_OP(AbsOp);
+ if (isa<math::CeilOp>(op))
+ return REPLACE_UNARY_OP(CeilOp);
+ if (isa<math::FloorOp>(op))
+ return REPLACE_UNARY_OP(FloorOp);
+ if (isa<arith::NegFOp>(op))
+ return REPLACE_UNARY_OP(NegFOp);
+ if (auto divOp = dyn_cast<arith::DivFOp>(op)) {
+ if (auto constOp = dyn_cast_if_present<arith::ConstantOp>(
+ divOp.getLhs().getDefiningOp()))
+ if (cast<FloatAttr>(constOp.getValue()).getValue().isExactlyValue(1.0))
+ return REPLACE_UNARY_OP(ReciprocalOp);
}
+ if (isa<math::RoundOp>(op))
+ return REPLACE_UNARY_OP(RoundOp);
+ if (isa<math::SqrtOp>(op))
+ return REPLACE_UNARY_OP(SqrtOp);
+ if (isa<math::RsqrtOp>(op))
+ return REPLACE_UNARY_OP(RsqrtOp);
+ if (auto mulOp = dyn_cast<arith::MulFOp>(op);
+ mulOp && mulOp.getLhs() == mulOp.getRhs())
+ return REPLACE_UNARY_OP(SquareOp);
+ if (isa<math::TanhOp>(op))
+ return REPLACE_UNARY_OP(TanhOp);
+ if (isa<math::ErfOp>(op))
+ return REPLACE_UNARY_OP(ErfOp);
}
// Elementwise Binary
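The two non-trivial matches added here are square (a mulf whose operands are
the same SSA value) and reciprocal (a divf whose numerator is the constant
1.0). A rough standalone sketch of that classification, independent of the
MLIR APIs and with hypothetical field names:

#include <cassert>
#include <optional>
#include <string>

struct Payload {                     // stand-in for the generic's body op
  std::string opName;                // e.g. "arith.mulf", "arith.divf"
  int lhsId, rhsId;                  // equal ids mean the same SSA value
  std::optional<double> lhsConstant; // set when the lhs is a float constant
};

std::optional<std::string> classifyUnary(const Payload &p) {
  if (p.opName == "arith.mulf" && p.lhsId == p.rhsId)
    return "linalg.square";
  if (p.opName == "arith.divf" && p.lhsConstant && *p.lhsConstant == 1.0)
    return "linalg.reciprocal";
  return std::nullopt;
}

int main() {
  assert(*classifyUnary({"arith.mulf", 0, 0, {}}) == "linalg.square");
  assert(*classifyUnary({"arith.divf", 1, 0, 1.0}) == "linalg.reciprocal");
  assert(!classifyUnary({"arith.mulf", 0, 1, {}}));
  return 0;
}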
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
index 2b4986aeac14f..260e36fb47f04 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -1424,289 +1424,213 @@ LogicalResult DecomposeOuterUnitDimsUnPackOpPattern::matchAndRewrite(
return success();
}
-// The following are patterns for downscaling convolution ops with size-1
-// window dimensions.
+//===----------------------------------------------------------------------===//
+// Generic DownscaleSizeOneWindowedConvolution
+//===----------------------------------------------------------------------===//
//
-// Note that we'd eventually want to write such transformations in a generic
-// way, e.g., converting to linalg.generic, removing the size-1 dimensions,
-// and then turning back to named ops. But for now it's fine to have a few
-// patterns matching special ops to get started.
-
-template <typename Conv2DOp, typename Conv1DOp>
-FailureOr<Conv1DOp> DownscaleSizeOneWindowed2DConvolution<Conv2DOp, Conv1DOp>::
- returningMatchAndRewrite(LinalgOp convOp, PatternRewriter &rewriter) const {
- // Check if this LinalgOp is of the expected Conv2DOp type (named or generic).
- std::optional<DilationsAndStrides> convParams =
- matchConvolutionOpOfType<Conv2DOp>(convOp);
- if (!convParams)
- return failure();
- SmallVector<int64_t> dilations = std::move(convParams->dilations);
- SmallVector<int64_t> strides = std::move(convParams->strides);
-
- if (convOp.hasPureBufferSemantics())
- return failure(); // To be implemented.
-
- Value input = convOp.getDpsInputs().front();
- Value kernel = convOp.getDpsInputs().back();
- Value output = convOp.getDpsInits().front();
-
- auto inputType = dyn_cast<RankedTensorType>(input.getType());
- auto kernelType = dyn_cast<RankedTensorType>(kernel.getType());
- auto outputType = dyn_cast<RankedTensorType>(output.getType());
-
- auto kernelShape = kernelType.getShape();
- auto outputShape = outputType.getShape();
-
- // Get domain indices based on Conv2DOp type. These are known at compile time.
- int64_t khIndex, kwIndex, ohIndex, owIndex;
- if constexpr (std::is_same_v<Conv2DOp, linalg::Conv2DNhwcHwcfOp> ||
- std::is_same_v<Conv2DOp, linalg::PoolingNhwcSumOp> ||
- std::is_same_v<Conv2DOp, linalg::PoolingNhwcMaxOp> ||
- std::is_same_v<Conv2DOp, linalg::PoolingNhwcMaxUnsignedOp> ||
- std::is_same_v<Conv2DOp, linalg::PoolingNhwcMinOp> ||
- std::is_same_v<Conv2DOp, linalg::PoolingNhwcMinUnsignedOp>) {
- // NHWC layout: kernel [H, W, ...], output [N, H, W, C]
- khIndex = 0;
- kwIndex = 1;
- ohIndex = 1;
- owIndex = 2;
- } else if constexpr (std::is_same_v<Conv2DOp, linalg::Conv2DNchwFchwOp>) {
- // NCHW_FCHW layout: kernel [..., H, W], output [N, C, H, W]
- khIndex = 2;
- kwIndex = 3;
- ohIndex = 2;
- owIndex = 3;
- } else if constexpr (std::is_same_v<Conv2DOp, linalg::PoolingNchwSumOp> ||
- std::is_same_v<Conv2DOp, linalg::PoolingNchwMaxOp>) {
- // NCHW pooling layout: kernel [H, W], output [N, C, H, W]
- khIndex = 0;
- kwIndex = 1;
- ohIndex = 2;
- owIndex = 3;
+/// Returns the indices of affine map results that reference any of the given
+/// dimensions.
+static SmallVector<unsigned>
+getResultIndicesReferencingDims(AffineMap map, ArrayRef<unsigned> dims) {
+ SmallVector<unsigned> resultIndices;
+ for (unsigned dim : dims) {
+ for (unsigned i = 0, e = map.getNumResults(); i < e; ++i) {
+ AffineExpr expr = map.getResult(i);
+ if (expr.isFunctionOfDim(dim)) {
+ resultIndices.push_back(i);
+ break;
+ }
+ }
}
+ return resultIndices;
+}
- // Only handle the case where at least one of the window dimensions is
- // of size 1. Other cases can rely on tiling to reduce to such cases.
- int64_t khSize = kernelShape[khIndex], kwSize = kernelShape[kwIndex];
- int64_t ohSize = outputShape[ohIndex], owSize = outputShape[owIndex];
- bool removeH = (khSize == 1 && ohSize == 1);
- bool removeW = (kwSize == 1 && owSize == 1);
- if (!removeH && !removeW)
- return failure();
+/// Helper to create a rank-reducing extract_slice that removes specific
+/// dimensions from a tensor.
+static Value createRankReducingExtractSlice(RewriterBase &rewriter,
+ Location loc, Value tensor,
+ ArrayRef<unsigned> dimsToRemove) {
+ auto tensorType = cast<RankedTensorType>(tensor.getType());
+ int64_t rank = tensorType.getRank();
+
+ // Compute new shape by removing the specified dimensions.
+ SmallVector<int64_t> newShape;
+ for (int64_t i = 0; i < rank; ++i) {
+ if (!llvm::is_contained(dimsToRemove, i))
+ newShape.push_back(tensorType.getDimSize(i));
+ }
- // Get new shapes and types for all operands by removing the size-1
- // dimension.
- using RTTBuilder = RankedTensorType::Builder;
- RankedTensorType newInputType =
- RTTBuilder(inputType).dropDim((removeH ? ohIndex : owIndex));
- RankedTensorType newKernelType =
- RTTBuilder(kernelType).dropDim((removeH ? khIndex : kwIndex));
- RankedTensorType newOutputType =
- RTTBuilder(outputType).dropDim((removeH ? ohIndex : owIndex));
-
- // Rank-reduce operands.
- Location loc = convOp.getLoc();
- Value newInput = tensor::createCanonicalRankReducingExtractSliceOp(
- rewriter, loc, input, newInputType);
- Value newKernel = tensor::createCanonicalRankReducingExtractSliceOp(
- rewriter, loc, kernel, newKernelType);
- Value newOutput = tensor::createCanonicalRankReducingExtractSliceOp(
- rewriter, loc, output, newOutputType);
-
- // Rank-reduce strides and dilations too.
- // TODO: dropDim 1-liner helper.
- strides.erase(strides.begin() + (removeH ? 0 : 1));
- auto stridesAttr = rewriter.getI64VectorAttr(strides);
-
- dilations.erase(dilations.begin() + (removeH ? 0 : 1));
- auto dilationsAttr = rewriter.getI64VectorAttr(dilations);
-
- auto conv1DOp = Conv1DOp::create(
- rewriter, loc, newOutputType, ValueRange{newInput, newKernel},
- ValueRange{newOutput}, stridesAttr, dilationsAttr);
-
- // Insert back.
- Value inserted = tensor::createCanonicalRankReducingInsertSliceOp(
- rewriter, loc, conv1DOp.getResult(0), output);
- rewriter.replaceOp(convOp, inserted);
-
- return conv1DOp;
+ auto newType = RankedTensorType::get(newShape, tensorType.getElementType());
+ return tensor::createCanonicalRankReducingExtractSliceOp(rewriter, loc,
+ tensor, newType);
}
-template struct linalg::DownscaleSizeOneWindowed2DConvolution<Conv2DNhwcHwcfOp,
- Conv1DNwcWcfOp>;
-template struct linalg::DownscaleSizeOneWindowed2DConvolution<Conv2DNchwFchwOp,
- Conv1DNcwFcwOp>;
-template struct linalg::DownscaleSizeOneWindowed2DConvolution<PoolingNhwcSumOp,
- PoolingNwcSumOp>;
-template struct linalg::DownscaleSizeOneWindowed2DConvolution<PoolingNchwSumOp,
- PoolingNcwSumOp>;
-template struct linalg::DownscaleSizeOneWindowed2DConvolution<PoolingNhwcMaxOp,
- PoolingNwcMaxOp>;
-template struct linalg::DownscaleSizeOneWindowed2DConvolution<
- PoolingNhwcMaxUnsignedOp, PoolingNwcMaxUnsignedOp>;
-template struct linalg::DownscaleSizeOneWindowed2DConvolution<PoolingNhwcMinOp,
- PoolingNwcMinOp>;
-template struct linalg::DownscaleSizeOneWindowed2DConvolution<
- PoolingNhwcMinUnsignedOp, PoolingNwcMinUnsignedOp>;
-template struct linalg::DownscaleSizeOneWindowed2DConvolution<PoolingNchwMaxOp,
- PoolingNcwMaxOp>;
-
-FailureOr<DepthwiseConv1DNwcWcOp>
-DownscaleDepthwiseConv2DNhwcHwcOp::returningMatchAndRewrite(
- LinalgOp convOp, PatternRewriter &rewriter) const {
- // Check if this LinalgOp is a DepthwiseConv2DNhwcHwcOp (named or generic).
- std::optional<DilationsAndStrides> convParams =
- matchConvolutionOpOfType<DepthwiseConv2DNhwcHwcOp>(convOp);
- if (!convParams)
+/// Drops specified dimensions from an AffineExpr and compresses remaining
+/// dimension indices. Returns std::nullopt if the expression only references
+/// the dropped dimensions.
+static std::optional<AffineExpr>
+dropDimsAndCompress(AffineExpr expr, ArrayRef<unsigned> dimsToDrop,
+ unsigned newNumDims, MLIRContext *ctx) {
+ // Check if expr only references dimensions to be dropped.
+ bool onlyReferencesDroppedDims = true;
+ for (unsigned d = 0; d < newNumDims + dimsToDrop.size(); ++d) {
+ if (expr.isFunctionOfDim(d) && !llvm::is_contained(dimsToDrop, d)) {
+ onlyReferencesDroppedDims = false;
+ break;
+ }
+ }
+ if (onlyReferencesDroppedDims && llvm::any_of(dimsToDrop, [&](unsigned d) {
+ return expr.isFunctionOfDim(d);
+ }))
+ return std::nullopt;
+
+ // Replace dimensions: compute new index for each old dimension.
+ // Dropped dimensions get mapped to constant 0, others get compressed.
+ SmallVector<AffineExpr> dimReplacements;
+ unsigned newDimIdx = 0;
+ for (unsigned d = 0; d < newNumDims + dimsToDrop.size(); ++d) {
+ if (llvm::is_contained(dimsToDrop, d)) {
+ dimReplacements.push_back(getAffineConstantExpr(0, ctx));
+ } else {
+ dimReplacements.push_back(getAffineDimExpr(newDimIdx++, ctx));
+ }
+ }
+
+ return expr.replaceDims(dimReplacements);
+}
+
+FailureOr<LinalgOp>
+linalg::downscaleSizeOneWindowedConvolution(RewriterBase &rewriter,
+ LinalgOp op) {
+ auto maybeDims = inferConvolutionDims(op);
+ if (failed(maybeDims))
return failure();
- SmallVector<int64_t> dilations = std::move(convParams->dilations);
- SmallVector<int64_t> strides = std::move(convParams->strides);
-
- if (convOp.hasPureBufferSemantics())
- return failure(); // To be implemented.
-
- Value input = convOp.getDpsInputs().front();
- Value kernel = convOp.getDpsInputs().back();
- Value output = convOp.getDpsInits().front();
-
- auto inputType = dyn_cast<RankedTensorType>(input.getType());
- auto kernelType = dyn_cast<RankedTensorType>(kernel.getType());
- auto outputType = dyn_cast<RankedTensorType>(output.getType());
-
- auto kernelShape = kernelType.getShape();
- auto outputShape = outputType.getShape();
-
- // Only handle the case where at least one of the window dimensions is
- // of size 1. Other cases can rely on tiling to reduce to such cases.
- int64_t khSize = kernelShape[0], kwSize = kernelShape[1];
- int64_t ohSize = outputShape[1], owSize = outputShape[2];
- bool removeH = (khSize == 1 && ohSize == 1);
- bool removeW = (kwSize == 1 && owSize == 1);
- if (!removeH && !removeW)
+
+ // Currently supports only 2D convolutions.
+ if (maybeDims->outputImage.size() != 2 || maybeDims->filterLoop.size() != 2)
return failure();
- // Get new shapes and types for all operands by removing the size-1
- // dimension.
- using RTTBuilder = RankedTensorType::Builder;
- RankedTensorType newInputType =
- RTTBuilder(inputType).dropDim((removeH ? 1 : 2));
- RankedTensorType newKernelType =
- RTTBuilder(kernelType).dropDim((removeH ? 0 : 1));
- RankedTensorType newOutputType =
- RTTBuilder(outputType).dropDim(removeH ? 1 : 2);
-
- // Rank-reduce operands.
- Location loc = convOp.getLoc();
- Value newInput = tensor::createCanonicalRankReducingExtractSliceOp(
- rewriter, loc, input, newInputType);
- Value newKernel = tensor::createCanonicalRankReducingExtractSliceOp(
- rewriter, loc, kernel, newKernelType);
- Value newOutput = tensor::createCanonicalRankReducingExtractSliceOp(
- rewriter, loc, output, newOutputType);
-
- // Rank-reduce strides and dilations too.
- // TODO: dropDim 1-liner helper.
- strides.erase(strides.begin() + (removeH ? 0 : 1));
- auto stridesAttr = rewriter.getI64VectorAttr(strides);
-
- dilations.erase(dilations.begin() + (removeH ? 0 : 1));
- auto dilationsAttr = rewriter.getI64VectorAttr(dilations);
-
- auto conv1DOp = DepthwiseConv1DNwcWcOp::create(
- rewriter, loc, newOutputType, ValueRange{newInput, newKernel},
- ValueRange{newOutput}, stridesAttr, dilationsAttr);
-
- // Insert back.
- Value inserted = tensor::createCanonicalRankReducingInsertSliceOp(
- rewriter, loc, conv1DOp.getResult(0), output);
- rewriter.replaceOp(convOp, inserted);
-
- return conv1DOp;
-}
+ if (op.hasPureBufferSemantics())
+ return failure();
-FailureOr<Conv1DOp>
-DownscaleConv2DOp::returningMatchAndRewrite(LinalgOp convOp,
- PatternRewriter &rewriter) const {
- // Check if this LinalgOp is a Conv2DOp (named or generic).
- std::optional<DilationsAndStrides> convParams =
- matchConvolutionOpOfType<Conv2DOp>(convOp);
- if (!convParams)
+ // Get loop domain indices for spatial dimensions.
+ unsigned outSpatial0 = maybeDims->outputImage[0];
+ unsigned outSpatial1 = maybeDims->outputImage[1];
+ unsigned filterSpatial0 = maybeDims->filterLoop[0];
+ unsigned filterSpatial1 = maybeDims->filterLoop[1];
+
+ // Get sizes from loop bounds.
+ SmallVector<int64_t, 4> loopRanges = op.getStaticLoopRanges();
+ int64_t outSize0 = loopRanges[outSpatial0];
+ int64_t outSize1 = loopRanges[outSpatial1];
+ int64_t filterSize0 = loopRanges[filterSpatial0];
+ int64_t filterSize1 = loopRanges[filterSpatial1];
+
+ // Check if we can downscale by removing a spatial dimension.
+ bool canRemoveSpatial0 = (filterSize0 == 1 && outSize0 == 1);
+ bool canRemoveSpatial1 = (filterSize1 == 1 && outSize1 == 1);
+ if (!canRemoveSpatial0 && !canRemoveSpatial1)
return failure();
- if (convOp.hasPureBufferSemantics())
- return failure(); // To be implemented.
+ // Determine which loop dims to remove (output spatial + corresponding filter)
+ // and sort for correct index compression when removing dimensions from affine
+ // maps.
+ SmallVector<unsigned> loopDimsToRemove;
+ if (canRemoveSpatial0) {
+ loopDimsToRemove.push_back(outSpatial0);
+ loopDimsToRemove.push_back(filterSpatial0);
+ } else {
+ loopDimsToRemove.push_back(outSpatial1);
+ loopDimsToRemove.push_back(filterSpatial1);
+ }
+ llvm::sort(loopDimsToRemove);
- Value input = convOp.getDpsInputs().front();
- Value kernel = convOp.getDpsInputs().back();
- Value output = convOp.getDpsInits().front();
+ // Create new indexing maps with dimensions removed.
+ SmallVector<AffineMap> newMaps;
+ MLIRContext *ctx = op.getContext();
+ unsigned numDims = op.getNumLoops();
+ unsigned newNumDims = numDims - loopDimsToRemove.size();
+ for (AffineMap map : op.getIndexingMapsArray()) {
+ SmallVector<AffineExpr> newResults;
+ for (AffineExpr expr : map.getResults()) {
+ auto newExpr =
+ dropDimsAndCompress(expr, loopDimsToRemove, newNumDims, ctx);
+ if (newExpr)
+ newResults.push_back(*newExpr);
+ }
+ newMaps.push_back(AffineMap::get(newNumDims, 0, newResults, ctx));
+ }
- auto inputType = dyn_cast<RankedTensorType>(input.getType());
- auto kernelType = dyn_cast<RankedTensorType>(kernel.getType());
- auto outputType = dyn_cast<RankedTensorType>(output.getType());
+ // Create new iterator types.
+ SmallVector<utils::IteratorType> newIterTypes;
+ auto iterTypes = op.getIteratorTypesArray();
+ for (unsigned idx = 0; idx < iterTypes.size(); ++idx) {
+ if (!llvm::is_contained(loopDimsToRemove, idx))
+ newIterTypes.push_back(iterTypes[idx]);
+ }
- auto kernelShape = kernelType.getShape();
- auto outputShape = outputType.getShape();
+ // Rank-reduce operands using extract_slice.
+ Location loc = op.getLoc();
+ SmallVector<Value> newInputs;
+ for (OpOperand *input : op.getDpsInputOperands()) {
+ AffineMap map = op.getMatchingIndexingMap(input);
+ SmallVector<unsigned> tensorDimsToRemove =
+ getResultIndicesReferencingDims(map, loopDimsToRemove);
+ Value reduced = createRankReducingExtractSlice(rewriter, loc, input->get(),
+ tensorDimsToRemove);
+ newInputs.push_back(reduced);
+ }
- // Only handle the case where at least one of the window dimensions is
- // of size 1. Other cases can rely on tiling to reduce to such cases.
- int64_t khSize = kernelShape[0], kwSize = kernelShape[1];
- int64_t ohSize = outputShape[0], owSize = outputShape[1];
- bool removeH = (khSize == 1 && ohSize == 1);
- bool removeW = (kwSize == 1 && owSize == 1);
- if (!removeH && !removeW)
- return failure();
+ OpOperand &output = *op.getDpsInitsMutable().begin();
+ AffineMap outputMap = op.getMatchingIndexingMap(&output);
+ SmallVector<unsigned> outputDimsToRemove =
+ getResultIndicesReferencingDims(outputMap, loopDimsToRemove);
+ Value newOutput = createRankReducingExtractSlice(rewriter, loc, output.get(),
+ outputDimsToRemove);
+
+ // Create new linalg.generic with reduced dimensions.
+ auto newOp =
+ linalg::GenericOp::create(rewriter, loc, TypeRange{newOutput.getType()},
+ newInputs, newOutput, newMaps, newIterTypes);
+ rewriter.inlineRegionBefore(op->getRegion(0), newOp.getRegion(),
+ newOp.getRegion().begin());
+
+ // Try to specialize the generic back to a named op only if the input was
+ // already a specialized (named) op.
+ LinalgOp resultOp = newOp;
+ if (!isa<GenericOp>(op)) {
+ FailureOr<LinalgOp> specializedOp = specializeGenericOp(rewriter, newOp);
+ if (succeeded(specializedOp))
+ resultOp = *specializedOp;
+ }
- // Get new shapes and types for all operands by removing the size-1
- // dimension.
- using RTTBuilder = RankedTensorType::Builder;
- RankedTensorType newInputType =
- RTTBuilder(inputType).dropDim((removeH ? 0 : 1));
- RankedTensorType newKernelType =
- RTTBuilder(kernelType).dropDim((removeH ? 0 : 1));
- RankedTensorType newOutputType =
- RTTBuilder(outputType).dropDim(removeH ? 0 : 1);
-
- // Rank-reduce operands.
- Location loc = convOp.getLoc();
- Value newInput = tensor::createCanonicalRankReducingExtractSliceOp(
- rewriter, loc, input, newInputType);
- Value newKernel = tensor::createCanonicalRankReducingExtractSliceOp(
- rewriter, loc, kernel, newKernelType);
- Value newOutput = tensor::createCanonicalRankReducingExtractSliceOp(
- rewriter, loc, output, newOutputType);
-
- auto conv1DOp =
- Conv1DOp::create(rewriter, loc, newOutputType,
- ValueRange{newInput, newKernel}, ValueRange{newOutput});
-
- // Insert back.
- Value inserted = tensor::createCanonicalRankReducingInsertSliceOp(
- rewriter, loc, conv1DOp.getResult(0), output);
- rewriter.replaceOp(convOp, inserted);
-
- return conv1DOp;
+ // Insert result back into original shape.
+ Value result = tensor::createCanonicalRankReducingInsertSliceOp(
+ rewriter, loc, resultOp->getResult(0), output.get());
+
+ rewriter.replaceOp(op, result);
+ return resultOp;
}
+namespace {
+/// Pattern wrapper around `downscaleSizeOneWindowedConvolution`.
+struct DownscaleSizeOneWindowedConvolution final
+ : public OpInterfaceRewritePattern<LinalgOp> {
+ DownscaleSizeOneWindowedConvolution(MLIRContext *context,
+ PatternBenefit benefit = 1)
+ : OpInterfaceRewritePattern<LinalgOp>(context, benefit) {}
+
+ LogicalResult matchAndRewrite(LinalgOp op,
+ PatternRewriter &rewriter) const override {
+ return linalg::downscaleSizeOneWindowedConvolution(rewriter, op);
+ }
+};
+} // namespace
+
void linalg::populateDecomposeConvolutionPatterns(RewritePatternSet &patterns,
PatternBenefit benefit) {
- patterns.add<DownscaleSizeOneWindowed2DConvolution<linalg::Conv2DNhwcHwcfOp,
- Conv1DNwcWcfOp>,
- DownscaleSizeOneWindowed2DConvolution<linalg::Conv2DNchwFchwOp,
- Conv1DNcwFcwOp>,
- DownscaleDepthwiseConv2DNhwcHwcOp, DownscaleConv2DOp>(
- patterns.getContext(), benefit);
- patterns.add<
- DownscaleSizeOneWindowed2DConvolution<PoolingNhwcSumOp, PoolingNwcSumOp>,
- DownscaleSizeOneWindowed2DConvolution<PoolingNchwSumOp, PoolingNcwSumOp>,
- DownscaleSizeOneWindowed2DConvolution<PoolingNhwcMaxOp, PoolingNwcMaxOp>,
- DownscaleSizeOneWindowed2DConvolution<PoolingNhwcMaxUnsignedOp,
- PoolingNwcMaxUnsignedOp>,
- DownscaleSizeOneWindowed2DConvolution<PoolingNhwcMinOp, PoolingNwcMinOp>,
- DownscaleSizeOneWindowed2DConvolution<PoolingNhwcMinUnsignedOp,
- PoolingNwcMinUnsignedOp>,
- DownscaleSizeOneWindowed2DConvolution<PoolingNchwMaxOp, PoolingNcwMaxOp>>(
- patterns.getContext(), benefit);
+ patterns.add<DownscaleSizeOneWindowedConvolution>(patterns.getContext(),
+ benefit);
}
void linalg::populateDecomposePackUnpackPatterns(RewritePatternSet &patterns) {
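The indexing-map rewrite above depends on renumbering the surviving loop
dimensions densely after the dropped ones are removed. A small standalone
sketch of that compression step, with plain integers standing in for
AffineDimExprs:

#include <algorithm>
#include <cassert>
#include <vector>

// Builds the old-index -> new-index table used when rewriting each result
// expression; dropped dims map to -1 (they become the constant 0 in the real
// code). `dropped` must be sorted.
std::vector<int> compressDims(unsigned numDims,
                              const std::vector<unsigned> &dropped) {
  std::vector<int> remap(numDims);
  int next = 0;
  for (unsigned d = 0; d < numDims; ++d)
    remap[d] =
        std::binary_search(dropped.begin(), dropped.end(), d) ? -1 : next++;
  return remap;
}

int main() {
  // 7 loops, dropping dims {2, 5}: the remaining dims are renumbered 0..4.
  assert((compressDims(7, {2, 5}) == std::vector<int>{0, 1, -1, 2, 3, -1, 4}));
  return 0;
}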
diff --git a/mlir/lib/Dialect/Math/Transforms/ExtendToSupportedTypes.cpp b/mlir/lib/Dialect/Math/Transforms/ExtendToSupportedTypes.cpp
index 9d6ad613fc945..bc262f84b26ac 100644
--- a/mlir/lib/Dialect/Math/Transforms/ExtendToSupportedTypes.cpp
+++ b/mlir/lib/Dialect/Math/Transforms/ExtendToSupportedTypes.cpp
@@ -124,28 +124,25 @@ void ExtendToSupportedTypesPass::runOnOperation() {
MLIRContext *ctx = &getContext();
// Parse target type
- std::optional<Type> maybeTargetType =
- arith::parseFloatType(ctx, targetTypeStr);
- if (!maybeTargetType.has_value()) {
+ FloatType targetType = arith::parseFloatType(ctx, targetTypeStr);
+ if (!targetType) {
emitError(UnknownLoc::get(ctx), "could not map target type '" +
targetTypeStr +
"' to a known floating-point type");
return signalPassFailure();
}
- Type targetType = maybeTargetType.value();
// Parse source types
llvm::SetVector<Type> sourceTypes;
for (const auto &extraTypeStr : extraTypeStrs) {
- std::optional<FloatType> maybeExtraType =
- arith::parseFloatType(ctx, extraTypeStr);
- if (!maybeExtraType.has_value()) {
+ FloatType extraType = arith::parseFloatType(ctx, extraTypeStr);
+ if (!extraType) {
emitError(UnknownLoc::get(ctx), "could not map source type '" +
extraTypeStr +
"' to a known floating-point type");
return signalPassFailure();
}
- sourceTypes.insert(maybeExtraType.value());
+ sourceTypes.insert(extraType);
}
// f64 and f32 are implicitly supported
Builder b(ctx);
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index e0559e850faf6..7cab929d583ca 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -18,6 +18,7 @@
#include "mlir/IR/Attributes.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/DialectImplementation.h"
+#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/OperationSupport.h"
#include "mlir/IR/SymbolTable.h"
@@ -4796,6 +4797,30 @@ LogicalResult IteratorOp::verify() {
if (!iteratedTy)
return emitOpError() << "result must be omp.iterated<entry_ty>";
+ for (auto [lb, ub, step] : llvm::zip_equal(
+ getLoopLowerBounds(), getLoopUpperBounds(), getLoopSteps())) {
+ if (matchPattern(step, m_Zero()))
+ return emitOpError() << "loop step must not be zero";
+
+ IntegerAttr lbAttr;
+ IntegerAttr ubAttr;
+ IntegerAttr stepAttr;
+ if (!matchPattern(lb, m_Constant(&lbAttr)) ||
+ !matchPattern(ub, m_Constant(&ubAttr)) ||
+ !matchPattern(step, m_Constant(&stepAttr)))
+ continue;
+
+ const APInt &lbVal = lbAttr.getValue();
+ const APInt &ubVal = ubAttr.getValue();
+ const APInt &stepVal = stepAttr.getValue();
+ if (stepVal.isStrictlyPositive() && lbVal.sgt(ubVal))
+ return emitOpError() << "positive loop step requires lower bound to be "
+ "less than or equal to upper bound";
+ if (stepVal.isNegative() && lbVal.slt(ubVal))
+ return emitOpError() << "negative loop step requires lower bound to be "
+ "greater than or equal to upper bound";
+ }
+
Block &b = getRegion().front();
auto yield = llvm::dyn_cast<omp::YieldOp>(b.getTerminator());
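The new checks reduce to a sign-consistency rule: the step must not be a
constant zero, and when all three values are constants the bounds must be
ordered consistently with the step's sign. A standalone sketch over plain
64-bit integers:

#include <cassert>
#include <cstdint>
#include <optional>
#include <string>

std::optional<std::string> checkBounds(int64_t lb, int64_t ub, int64_t step) {
  if (step == 0)
    return "loop step must not be zero";
  if (step > 0 && lb > ub)
    return "positive loop step requires lower bound <= upper bound";
  if (step < 0 && lb < ub)
    return "negative loop step requires lower bound >= upper bound";
  return std::nullopt; // valid
}

int main() {
  assert(!checkBounds(0, 10, 1));
  assert(!checkBounds(10, 0, -2));
  assert(checkBounds(5, 1, 1)); // rejected: positive step with lb > ub
  assert(checkBounds(0, 0, 0)); // rejected: zero step
  return 0;
}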
diff --git a/mlir/lib/IR/BuiltinDialectBytecode.cpp b/mlir/lib/IR/BuiltinDialectBytecode.cpp
index f7430784dd222..14dc665184099 100644
--- a/mlir/lib/IR/BuiltinDialectBytecode.cpp
+++ b/mlir/lib/IR/BuiltinDialectBytecode.cpp
@@ -33,23 +33,27 @@ namespace {
// TODO: Move these to separate file.
-// Returns the bitwidth if known, else return 0.
-static unsigned getIntegerBitWidth(DialectBytecodeReader &reader, Type type) {
- if (auto intType = dyn_cast<IntegerType>(type)) {
+// Returns the bitwidth if known, otherwise returns std::nullopt.
+static std::optional<unsigned> getIntegerBitWidth(DialectBytecodeReader &reader,
+ Type type) {
+ if (auto intType = dyn_cast<IntegerType>(type))
return intType.getWidth();
- }
- if (llvm::isa<IndexType>(type)) {
+ if (llvm::isa<IndexType>(type))
return IndexType::kInternalStorageBitWidth;
- }
reader.emitError()
<< "expected integer or index type for IntegerAttr, but got: " << type;
- return 0;
+ return std::nullopt;
}
static LogicalResult readAPIntWithKnownWidth(DialectBytecodeReader &reader,
Type type, FailureOr<APInt> &val) {
- unsigned bitWidth = getIntegerBitWidth(reader, type);
- val = reader.readAPIntWithKnownWidth(bitWidth);
+ std::optional<unsigned> bitWidth = getIntegerBitWidth(reader, type);
+ // getIntegerBitWidth returns std::nullopt and emits an error for unsupported
+ // types. Bail out early to avoid creating a zero-width APInt with a non-zero
+ // value.
+ if (!bitWidth)
+ return failure();
+ val = reader.readAPIntWithKnownWidth(*bitWidth);
return val;
}
diff --git a/mlir/lib/IR/Diagnostics.cpp b/mlir/lib/IR/Diagnostics.cpp
index f4c9242ed3479..5caf826c84bdd 100644
--- a/mlir/lib/IR/Diagnostics.cpp
+++ b/mlir/lib/IR/Diagnostics.cpp
@@ -600,9 +600,17 @@ struct ExpectedDiag {
/// Emit an error at the location referenced by this diagnostic.
LogicalResult emitError(raw_ostream &os, llvm::SourceMgr &mgr,
const Twine &msg) {
- SMRange range(fileLoc, SMLoc::getFromPointer(fileLoc.getPointer() +
- substring.size()));
- mgr.PrintMessage(os, fileLoc, llvm::SourceMgr::DK_Error, msg, range);
+ // fileLoc may be invalid when the expected diagnostic used an unknown
+ // location specifier (e.g. `// expected-error @unknown {{...}}`). In that
+ // case, skip the source range to avoid a null-pointer dereference and an
+ // assertion in SMRange that both endpoints must have the same validity.
+ if (fileLoc.isValid()) {
+ SMRange range(fileLoc, SMLoc::getFromPointer(fileLoc.getPointer() +
+ substring.size()));
+ mgr.PrintMessage(os, fileLoc, llvm::SourceMgr::DK_Error, msg, range);
+ } else {
+ mgr.PrintMessage(os, fileLoc, llvm::SourceMgr::DK_Error, msg);
+ }
return failure();
}
diff --git a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp
index 2f95531455b2b..8464b633a2625 100644
--- a/mlir/lib/Interfaces/ControlFlowInterfaces.cpp
+++ b/mlir/lib/Interfaces/ControlFlowInterfaces.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include <map>
#include <utility>
#include "mlir/IR/BuiltinTypes.h"
@@ -930,18 +931,12 @@ struct RemoveDeadRegionBranchOpSuccessorInputs : public RewritePattern {
}
};
-/// Return "true" if the two values are owned by the same operation or block.
-static bool haveSameOwner(Value a, Value b) {
- void *aOwner, *bOwner;
- if (auto arg = dyn_cast<BlockArgument>(a))
- aOwner = arg.getOwner();
- else
- aOwner = a.getDefiningOp();
- if (auto arg = dyn_cast<BlockArgument>(b))
- bOwner = arg.getOwner();
- else
- bOwner = b.getDefiningOp();
- return aOwner == bOwner;
+/// Return the "owner" of a value: the parent block for block arguments, the
+/// defining op for op results.
+static void *getOwnerOfValue(Value value) {
+ if (auto arg = dyn_cast<BlockArgument>(value))
+ return arg.getOwner();
+ return value.getDefiningOp();
}
/// Get the block argument or op result number of the given value.
@@ -1006,39 +1001,58 @@ struct RemoveDuplicateSuccessorInputUses : public RewritePattern {
return getArgOrResultNumber(a) < getArgOrResultNumber(b);
});
- // Check every distinct pair of successor inputs for duplicates. Replace
- // `input2` with `input1` if they are duplicates.
+ // Group inputs by their operand "signature" to find duplicates. Two
+ // successor inputs are duplicates if each predecessor (region branch point)
+ // forwards the same value for both. Let n = number of successor inputs and
+ // k = number of predecessors per input. Instead of comparing every pair of
+ // inputs (O(n² * k)), we build a signature for each input and group them
+ // via a std::map.
+ //
+ // A signature is a sorted list of (predecessor, forwarded value) pairs.
+ // Within each group, all but the first (canonical) input are replaced with
+ // the canonical one.
+ using SigEntry = std::pair<Operation *, Value>;
+ using Signature = SmallVector<SigEntry>;
+ auto sigEntryLess = [](const SigEntry &a, const SigEntry &b) {
+ if (a.first != b.first)
+ return a.first < b.first;
+ return a.second.getAsOpaquePointer() < b.second.getAsOpaquePointer();
+ };
+ // The map key is (signature, owner). Two inputs are duplicates only if they
+  // have the same signature AND the same owner (block or defining op). This
+  // ensures we track one canonical input per (signature, owner) group.
+ using MapKey = std::pair<Signature, void *>;
+ auto mapKeyLess = [&](const MapKey &a, const MapKey &b) {
+ if (a.second != b.second)
+ return a.second < b.second;
+ return std::lexicographical_compare(a.first.begin(), a.first.end(),
+ b.first.begin(), b.first.end(),
+ sigEntryLess);
+ };
+ std::map<MapKey, Value, decltype(mapKeyLess)> signatureToCanonical(
+ mapKeyLess);
bool changed = false;
- unsigned numInputs = inputs.size();
- for (auto i : llvm::seq<unsigned>(0, numInputs)) {
- Value input1 = inputs[i];
- for (auto j : llvm::seq<unsigned>(i + 1, numInputs)) {
- Value input2 = inputs[j];
- // Nothing to do if input2 is already dead.
- if (input2.use_empty())
+ // Total complexity: O(n * k * max(log k, log n)). For each input, sorting
+ // the signature costs O(k log k) and the std::map lookup costs O(k log n).
+ for (Value input : inputs) {
+ // Gather the predecessor value for each predecessor (region branch
+ // point) and sort them to form this input's signature.
+ Signature sig;
+ for (OpOperand *operand : inputsToOperands[input])
+ sig.emplace_back(operand->getOwner(), operand->get());
+ llvm::sort(sig, sigEntryLess);
+
+ void *owner = getOwnerOfValue(input);
+
+ auto [it, inserted] = signatureToCanonical.try_emplace(
+ MapKey{std::move(sig), owner}, input);
+ if (!inserted) {
+ Value canonical = it->second;
+ // Nothing to do if input is already dead.
+ if (input.use_empty())
continue;
- // Replace only values that belong to the same block / operation.
- // This implies that the two values are either both block arguments or
- // both op results.
- if (!haveSameOwner(input1, input2))
- continue;
-
- // Gather the predecessor value for each predecessor (region branch
- // point). The two inputs are duplicates if each predecessor forwards
- // the same value.
- llvm::SmallDenseMap<Operation *, Value> operands1, operands2;
- for (OpOperand *operand : inputsToOperands[input1]) {
- assert(!operands1.contains(operand->getOwner()));
- operands1[operand->getOwner()] = operand->get();
- }
- for (OpOperand *operand : inputsToOperands[input2]) {
- assert(!operands2.contains(operand->getOwner()));
- operands2[operand->getOwner()] = operand->get();
- }
- if (operands1 == operands2) {
- rewriter.replaceAllUsesWith(input2, input1);
- changed = true;
- }
+ rewriter.replaceAllUsesWith(input, canonical);
+ changed = true;
}
}
return success(changed);
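The grouping logic can be seen in isolation: inputs with identical
(predecessor, forwarded value) signatures collapse onto the first one seen. A
standalone sketch with integers standing in for predecessors and values (the
owner component of the real map key is omitted here):

#include <cassert>
#include <map>
#include <utility>
#include <vector>

// Each input is described by its sorted (predecessor, forwarded value) pairs.
using Signature = std::vector<std::pair<int, int>>;

// Returns, for each input, the index of its canonical duplicate (itself if it
// is the first input seen with that signature).
std::vector<int> canonicalize(const std::vector<Signature> &sigs) {
  std::map<Signature, int> firstSeen;
  std::vector<int> canonical(sigs.size());
  for (int i = 0, e = (int)sigs.size(); i < e; ++i)
    canonical[i] = firstSeen.try_emplace(sigs[i], i).first->second;
  return canonical;
}

int main() {
  // Inputs 0 and 2 receive the same value from the same two predecessors.
  std::vector<Signature> sigs = {
      {{1, 7}, {2, 9}}, {{1, 8}, {2, 9}}, {{1, 7}, {2, 9}}};
  assert((canonicalize(sigs) == std::vector<int>{0, 1, 0}));
  return 0;
}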
diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
index 21f7954fd338a..e60a682b42ea1 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+#include "mlir/Dialect/LLVMIR/LLVMAttrs.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/CallInterfaces.h"
@@ -215,6 +216,54 @@ convertCallLLVMIntrinsicOp(CallIntrinsicOp op, llvm::IRBuilderBase &builder,
return success();
}
+/// Recursively converts an MLIR metadata attribute to an LLVM metadata node.
+static llvm::Metadata *
+convertMetadataAttr(Attribute attr, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) {
+ return llvm::TypeSwitch<Attribute, llvm::Metadata *>(attr)
+ .Case<LLVM::MDStringAttr>([&](auto a) -> llvm::Metadata * {
+ return llvm::MDString::get(builder.getContext(),
+ a.getValue().getValue());
+ })
+ .Case<LLVM::MDConstantAttr>([&](auto a) -> llvm::Metadata * {
+ IntegerAttr intAttr = llvm::dyn_cast<IntegerAttr>(a.getValue());
+ if (!intAttr)
+ return nullptr;
+ return llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+ llvm::Type::getIntNTy(builder.getContext(),
+ intAttr.getType().getIntOrFloatBitWidth()),
+ intAttr.getValue()));
+ })
+ .Case<LLVM::MDFuncAttr>([&](auto a) -> llvm::Metadata * {
+ if (llvm::Function *fn =
+ moduleTranslation.lookupFunction(a.getName().getValue()))
+ return llvm::ValueAsMetadata::get(fn);
+ return nullptr;
+ })
+ .Case<LLVM::MDNodeAttr>([&](auto a) -> llvm::Metadata * {
+ SmallVector<llvm::Metadata *> operands;
+ for (Attribute op : a.getOperands())
+ operands.push_back(
+ convertMetadataAttr(op, builder, moduleTranslation));
+ return llvm::MDNode::get(builder.getContext(), operands);
+ })
+ .Default([](auto) -> llvm::Metadata * { return nullptr; });
+}
+
+static void convertNamedMetadataOp(StringRef metadataName, ArrayAttr nodes,
+ llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) {
+ llvm::Module *llvmModule = moduleTranslation.getLLVMModule();
+ llvm::NamedMDNode *namedMD =
+ llvmModule->getOrInsertNamedMetadata(metadataName);
+ for (Attribute nodeAttr : nodes) {
+ llvm::Metadata *md =
+ convertMetadataAttr(nodeAttr, builder, moduleTranslation);
+ if (auto *mdNode = llvm::dyn_cast_or_null<llvm::MDNode>(md))
+ namedMD->addOperand(mdNode);
+ }
+}
+
static void convertLinkerOptionsOp(ArrayAttr options,
llvm::IRBuilderBase &builder,
LLVM::ModuleTranslation &moduleTranslation) {
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 9d7c0003c2336..37b1a37c2e1a5 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -321,10 +321,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
<< " operation";
};
- auto checkAffinity = [&todo](auto op, LogicalResult &result) {
- if (!op.getAffinityVars().empty())
- result = todo("affinity");
- };
auto checkAllocate = [&todo](auto op, LogicalResult &result) {
if (!op.getAllocateVars().empty() || !op.getAllocatorVars().empty())
result = todo("allocate");
@@ -408,7 +404,6 @@ static LogicalResult checkImplementationStatus(Operation &op) {
checkThreadLimit(op, result);
})
.Case([&](omp::TaskOp op) {
- checkAffinity(op, result);
checkAllocate(op, result);
checkInReduction(op, result);
})
@@ -2233,6 +2228,81 @@ class TaskContextStructManager {
/// The type of the structure
llvm::Type *structTy = nullptr;
};
+
+/// IteratorInfo extracts and prepares loop bounds information from an
+/// mlir::omp::IteratorOp for lowering to LLVM IR.
+///
+/// It computes the per-dimension trip counts and the total linearized trip
+/// count, cast to i64. These are used to build a canonical loop and to
+/// reconstruct the physical induction variables inside the loop body.
+class IteratorInfo {
+private:
+ llvm::SmallVector<llvm::Value *> lowerBounds;
+ llvm::SmallVector<llvm::Value *> upperBounds;
+ llvm::SmallVector<llvm::Value *> steps;
+ llvm::SmallVector<llvm::Value *> trips;
+ unsigned dims;
+ llvm::Value *totalTrips;
+
+ llvm::Value *lookUpAsI64(mlir::Value val, const LLVM::ModuleTranslation &mt,
+ llvm::IRBuilderBase &builder) {
+ llvm::Value *v = mt.lookupValue(val);
+ if (!v)
+ return nullptr;
+ if (v->getType()->isIntegerTy(64))
+ return v;
+ if (v->getType()->isIntegerTy())
+ return builder.CreateSExtOrTrunc(v, builder.getInt64Ty());
+ return nullptr;
+ }
+
+public:
+ IteratorInfo(mlir::omp::IteratorOp itersOp,
+ mlir::LLVM::ModuleTranslation &moduleTranslation,
+ llvm::IRBuilderBase &builder) {
+ dims = itersOp.getLoopLowerBounds().size();
+ lowerBounds.resize(dims);
+ upperBounds.resize(dims);
+ steps.resize(dims);
+ trips.resize(dims);
+
+ for (unsigned d = 0; d < dims; ++d) {
+ llvm::Value *lb = lookUpAsI64(itersOp.getLoopLowerBounds()[d],
+ moduleTranslation, builder);
+ llvm::Value *ub = lookUpAsI64(itersOp.getLoopUpperBounds()[d],
+ moduleTranslation, builder);
+ llvm::Value *st =
+ lookUpAsI64(itersOp.getLoopSteps()[d], moduleTranslation, builder);
+ assert(lb && ub && st &&
+ "Expect lowerBounds, upperBounds, and steps in IteratorOp");
+ assert((!llvm::isa<llvm::ConstantInt>(st) ||
+ !llvm::cast<llvm::ConstantInt>(st)->isZero()) &&
+ "Expect non-zero step in IteratorOp");
+
+ lowerBounds[d] = lb;
+ upperBounds[d] = ub;
+ steps[d] = st;
+
+ // trips = ((ub - lb) / step) + 1 (inclusive ub, assume positive step)
+ llvm::Value *diff = builder.CreateSub(ub, lb);
+ llvm::Value *div = builder.CreateSDiv(diff, st);
+ trips[d] = builder.CreateAdd(
+ div, llvm::ConstantInt::get(builder.getInt64Ty(), 1));
+ }
+
+ totalTrips = llvm::ConstantInt::get(builder.getInt64Ty(), 1);
+ for (unsigned d = 0; d < dims; ++d)
+ totalTrips = builder.CreateMul(totalTrips, trips[d]);
+ }
+
+ unsigned getDims() const { return dims; }
+ llvm::ArrayRef<llvm::Value *> getLowerBounds() const { return lowerBounds; }
+ llvm::ArrayRef<llvm::Value *> getUpperBounds() const { return upperBounds; }
+ llvm::ArrayRef<llvm::Value *> getSteps() const { return steps; }
+ llvm::ArrayRef<llvm::Value *> getTrips() const { return trips; }
+ llvm::Value *getTotalTrips() const { return totalTrips; }
+};
+
} // namespace
void TaskContextStructManager::generateTaskContextStruct() {
@@ -2307,6 +2377,235 @@ void TaskContextStructManager::freeStructPtr() {
builder.CreateFree(structPtr);
}
+static void storeAffinityEntry(llvm::IRBuilderBase &builder,
+ llvm::OpenMPIRBuilder &ompBuilder,
+ llvm::Value *affinityList, llvm::Value *index,
+ llvm::Value *addr, llvm::Value *len) {
+ llvm::StructType *kmpTaskAffinityInfoTy =
+ ompBuilder.getKmpTaskAffinityInfoTy();
+ llvm::Value *entry = builder.CreateInBoundsGEP(
+ kmpTaskAffinityInfoTy, affinityList, index, "omp.affinity.entry");
+
+ addr = builder.CreatePtrToInt(addr, kmpTaskAffinityInfoTy->getElementType(0));
+ len = builder.CreateIntCast(len, kmpTaskAffinityInfoTy->getElementType(1),
+ /*isSigned=*/false);
+ llvm::Value *flags = builder.getInt32(0);
+
+ builder.CreateStore(addr,
+ builder.CreateStructGEP(kmpTaskAffinityInfoTy, entry, 0));
+ builder.CreateStore(len,
+ builder.CreateStructGEP(kmpTaskAffinityInfoTy, entry, 1));
+ builder.CreateStore(flags,
+ builder.CreateStructGEP(kmpTaskAffinityInfoTy, entry, 2));
+}
+
+static void fillAffinityLocators(Operation::operand_range affinityVars,
+ llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation,
+ llvm::Value *affinityList) {
+ for (auto [i, affinityVar] : llvm::enumerate(affinityVars)) {
+ auto entryOp = affinityVar.getDefiningOp<mlir::omp::AffinityEntryOp>();
+ assert(entryOp && "affinity item must be omp.affinity_entry");
+
+ llvm::Value *addr = moduleTranslation.lookupValue(entryOp.getAddr());
+ llvm::Value *len = moduleTranslation.lookupValue(entryOp.getLen());
+ assert(addr && len && "expect affinity addr and len to be non-null");
+ storeAffinityEntry(builder, *moduleTranslation.getOpenMPBuilder(),
+ affinityList, builder.getInt64(i), addr, len);
+ }
+}
+
+static mlir::LogicalResult
+convertIteratorRegion(llvm::Value *linearIV, IteratorInfo &iterInfo,
+ mlir::Block &iteratorRegionBlock,
+ llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) {
+ llvm::Value *tmp = linearIV;
+ for (int d = (int)iterInfo.getDims() - 1; d >= 0; --d) {
+ llvm::Value *trip = iterInfo.getTrips()[d];
+ // idx_d = tmp % trip_d
+ llvm::Value *idx = builder.CreateURem(tmp, trip);
+ // tmp = tmp / trip_d
+ tmp = builder.CreateUDiv(tmp, trip);
+
+ // physIV_d = lb_d + idx_d * step_d
+ llvm::Value *physIV = builder.CreateAdd(
+ iterInfo.getLowerBounds()[d],
+ builder.CreateMul(idx, iterInfo.getSteps()[d]), "omp.it.phys_iv");
+
+ moduleTranslation.mapValue(iteratorRegionBlock.getArgument(d), physIV);
+ }
+
+ // Translate the iterator region into the loop body.
+ moduleTranslation.mapBlock(&iteratorRegionBlock, builder.GetInsertBlock());
+ if (mlir::failed(moduleTranslation.convertBlock(iteratorRegionBlock,
+ /*ignoreArguments=*/true,
+ builder))) {
+ return mlir::failure();
+ }
+ return mlir::success();
+}
+
+static mlir::LogicalResult
+fillAffinityIteratorLoop(mlir::omp::IteratorOp itersOp,
+ llvm::IRBuilderBase &builder,
+ mlir::LLVM::ModuleTranslation &moduleTranslation,
+ llvm::Value *affinityList, IteratorInfo &iterInfo) {
+ mlir::Region &itersRegion = itersOp.getRegion();
+ mlir::Block &iteratorRegionBlock = itersRegion.front();
+
+ llvm::OpenMPIRBuilder::LocationDescription loc(builder);
+
+ auto bodyGen = [&](llvm::OpenMPIRBuilder::InsertPointTy bodyIP,
+ llvm::Value *linearIV) -> llvm::Error {
+ llvm::IRBuilderBase::InsertPointGuard guard(builder);
+ builder.restoreIP(bodyIP);
+
+ if (failed(convertIteratorRegion(linearIV, iterInfo, iteratorRegionBlock,
+ builder, moduleTranslation))) {
+ return llvm::make_error<llvm::StringError>(
+ "failed to convert iterator region", llvm::inconvertibleErrorCode());
+ }
+
+ // Extract affinity entry from omp.yield and store into list[linearIV].
+ auto yield =
+ mlir::dyn_cast<mlir::omp::YieldOp>(iteratorRegionBlock.getTerminator());
+ assert(yield && yield.getResults().size() == 1 &&
+ "expect omp.yield in iterator region to have one result");
+ auto entryOp =
+ yield.getResults()[0].getDefiningOp<mlir::omp::AffinityEntryOp>();
+    assert(entryOp && "expect yield to generate an affinity entry");
+
+ llvm::Value *addr = moduleTranslation.lookupValue(entryOp.getAddr());
+ llvm::Value *len = moduleTranslation.lookupValue(entryOp.getLen());
+ storeAffinityEntry(builder, *moduleTranslation.getOpenMPBuilder(),
+ affinityList, linearIV, addr, len);
+
+    // Iterator-region block/value mappings are temporary for this conversion;
+    // clear them to avoid stale entries in ModuleTranslation.
+ moduleTranslation.forgetMapping(itersRegion);
+
+ return llvm::Error::success();
+ };
+
+ llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP =
+ moduleTranslation.getOpenMPBuilder()->createIteratorLoop(
+ loc, iterInfo.getTotalTrips(), bodyGen,
+ /*Name=*/"iterator");
+ if (failed(handleError(afterIP, *itersOp)))
+ return failure();
+
+ builder.restoreIP(*afterIP);
+
+ return mlir::success();
+}
+
+static mlir::LogicalResult
+buildAffinityData(mlir::omp::TaskOp &taskOp, llvm::IRBuilderBase &builder,
+ mlir::LLVM::ModuleTranslation &moduleTranslation,
+ llvm::OpenMPIRBuilder::AffinityData &ad) {
+
+ if (taskOp.getAffinityVars().empty() && taskOp.getIterated().empty()) {
+ ad.Count = nullptr;
+ ad.Info = nullptr;
+ return mlir::success();
+ }
+
+ llvm::SmallVector<llvm::OpenMPIRBuilder::AffinityData> ads;
+ llvm::StructType *kmpTaskAffinityInfoTy =
+ moduleTranslation.getOpenMPBuilder()->getKmpTaskAffinityInfoTy();
+
+ auto allocateAffinityList = [&](llvm::Value *count) -> llvm::Value * {
+ llvm::IRBuilderBase::InsertPointGuard guard(builder);
+ if (llvm::isa<llvm::Constant>(count) || llvm::isa<llvm::Argument>(count))
+ builder.restoreIP(findAllocaInsertPoint(builder, moduleTranslation));
+ return builder.CreateAlloca(kmpTaskAffinityInfoTy, count,
+ "omp.affinity_list");
+ };
+
+ auto createAffinity =
+ [&](llvm::Value *count,
+ llvm::Value *info) -> llvm::OpenMPIRBuilder::AffinityData {
+ llvm::OpenMPIRBuilder::AffinityData ad{};
+ ad.Count = builder.CreateTrunc(count, builder.getInt32Ty());
+ ad.Info =
+ builder.CreatePointerBitCastOrAddrSpaceCast(info, builder.getPtrTy(0));
+ return ad;
+ };
+
+ if (!taskOp.getAffinityVars().empty()) {
+ llvm::Value *count = llvm::ConstantInt::get(
+ builder.getInt64Ty(), taskOp.getAffinityVars().size());
+ llvm::Value *list = allocateAffinityList(count);
+ fillAffinityLocators(taskOp.getAffinityVars(), builder, moduleTranslation,
+ list);
+ ads.emplace_back(createAffinity(count, list));
+ }
+
+ if (!taskOp.getIterated().empty()) {
+ for (auto [i, iter] : llvm::enumerate(taskOp.getIterated())) {
+ auto itersOp = iter.getDefiningOp<omp::IteratorOp>();
+ assert(itersOp && "iterated value must be defined by omp.iterator");
+ IteratorInfo iterInfo(itersOp, moduleTranslation, builder);
+ llvm::Value *affList = allocateAffinityList(iterInfo.getTotalTrips());
+ if (failed(fillAffinityIteratorLoop(itersOp, builder, moduleTranslation,
+ affList, iterInfo)))
+ return llvm::failure();
+ ads.emplace_back(createAffinity(iterInfo.getTotalTrips(), affList));
+ }
+ }
+
+ llvm::Value *totalAffinityCount = builder.getInt32(0);
+ for (const auto &affinity : ads)
+ totalAffinityCount = builder.CreateAdd(
+ totalAffinityCount,
+ builder.CreateIntCast(affinity.Count, builder.getInt32Ty(),
+ /*isSigned=*/false));
+
+ llvm::Value *affinityInfo = ads.front().Info;
+ if (ads.size() > 1) {
+ llvm::StructType *kmpTaskAffinityInfoTy =
+ moduleTranslation.getOpenMPBuilder()->getKmpTaskAffinityInfoTy();
+ llvm::Value *affinityInfoElemSize = builder.getInt64(
+ moduleTranslation.getLLVMModule()->getDataLayout().getTypeAllocSize(
+ kmpTaskAffinityInfoTy));
+
+ llvm::Value *packedAffinityInfo = allocateAffinityList(totalAffinityCount);
+ llvm::Value *packedAffinityInfoOffset = builder.getInt32(0);
+ for (const auto &affinity : ads) {
+ llvm::Value *affinityCount = builder.CreateIntCast(
+ affinity.Count, builder.getInt32Ty(), /*isSigned=*/false);
+ llvm::Value *affinityCountInt64 = builder.CreateIntCast(
+ affinityCount, builder.getInt64Ty(), /*isSigned=*/false);
+ llvm::Value *affinityInfoSize =
+ builder.CreateMul(affinityCountInt64, affinityInfoElemSize);
+
+ llvm::Value *packedAffinityInfoIndex = builder.CreateIntCast(
+ packedAffinityInfoOffset, kmpTaskAffinityInfoTy->getElementType(0),
+ /*isSigned=*/false);
+ packedAffinityInfoIndex = builder.CreateInBoundsGEP(
+ kmpTaskAffinityInfoTy, packedAffinityInfo, packedAffinityInfoIndex);
+
+ builder.CreateMemCpy(
+ packedAffinityInfoIndex, llvm::Align(1),
+ builder.CreatePointerBitCastOrAddrSpaceCast(
+ affinity.Info, builder.getPtrTy(packedAffinityInfoIndex->getType()
+ ->getPointerAddressSpace())),
+ llvm::Align(1), affinityInfoSize);
+
+ packedAffinityInfoOffset =
+ builder.CreateAdd(packedAffinityInfoOffset, affinityCount);
+ }
+
+ affinityInfo = packedAffinityInfo;
+ }
+
+ ad.Count = totalAffinityCount;
+ ad.Info = affinityInfo;
+
+ return mlir::success();
+}
+
/// Converts an OpenMP task construct into LLVM IR using OpenMPIRBuilder.
static LogicalResult
convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
@@ -2421,6 +2720,10 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
taskOp.getPrivateNeedsBarrier())))
return llvm::failure();
+ llvm::OpenMPIRBuilder::AffinityData ad;
+ if (failed(buildAffinityData(taskOp, builder, moduleTranslation, ad)))
+ return llvm::failure();
+
// Set up for call to createTask()
builder.SetInsertPoint(taskStartBlock);
@@ -2524,7 +2827,7 @@ convertOmpTaskOp(omp::TaskOp taskOp, llvm::IRBuilderBase &builder,
moduleTranslation.getOpenMPBuilder()->createTask(
ompLoc, allocaIP, bodyCB, !taskOp.getUntied(),
moduleTranslation.lookupValue(taskOp.getFinal()),
- moduleTranslation.lookupValue(taskOp.getIfExpr()), dds,
+ moduleTranslation.lookupValue(taskOp.getIfExpr()), dds, ad,
taskOp.getMergeable(),
moduleTranslation.lookupValue(taskOp.getEventHandle()),
moduleTranslation.lookupValue(taskOp.getPriority()));
@@ -7321,13 +7624,13 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation(
.Case([&](omp::LoopNestOp) {
return convertOmpLoopNest(*op, builder, moduleTranslation);
})
- .Case<omp::MapInfoOp, omp::MapBoundsOp, omp::PrivateClauseOp>(
- [&](auto op) {
- // No-op, should be handled by relevant owning operations e.g.
- // TargetOp, TargetEnterDataOp, TargetExitDataOp, TargetDataOp
- // etc. and then discarded
- return success();
- })
+ .Case<omp::MapInfoOp, omp::MapBoundsOp, omp::PrivateClauseOp,
+ omp::AffinityEntryOp, omp::IteratorOp>([&](auto op) {
+ // No-op, should be handled by relevant owning operations e.g.
+ // TargetOp, TargetEnterDataOp, TargetExitDataOp, TargetDataOp
+ // etc. and then discarded
+ return success();
+ })
.Case([&](omp::NewCliOp op) {
// Meta-operation: Doesn't do anything by itself, but used to
// identify a loop.
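IteratorInfo and convertIteratorRegion together implement a standard
linearize/delinearize scheme: trips_d = (ub_d - lb_d) / step_d + 1, the
canonical loop runs over the product of the trips, and each physical IV is
recovered as lb_d + (linear % trips_d) * step_d while the linear index is
divided down. A standalone sketch of the decomposition:

#include <cassert>
#include <cstdint>
#include <vector>

struct Dim { int64_t lb, ub, step; };

// Decomposes a linear trip index into per-dimension physical IVs, innermost
// dimension varying fastest, matching convertIteratorRegion.
std::vector<int64_t> delinearize(int64_t linear, const std::vector<Dim> &dims) {
  std::vector<int64_t> phys(dims.size());
  int64_t tmp = linear;
  for (int d = (int)dims.size() - 1; d >= 0; --d) {
    int64_t trips = (dims[d].ub - dims[d].lb) / dims[d].step + 1; // inclusive ub
    phys[d] = dims[d].lb + (tmp % trips) * dims[d].step;
    tmp /= trips;
  }
  return phys;
}

int main() {
  // Two dimensions: i in {0, 2, 4} (3 trips), j in {1, 2, 3} (3 trips).
  std::vector<Dim> dims = {{0, 4, 2}, {1, 3, 1}};
  assert((delinearize(0, dims) == std::vector<int64_t>{0, 1}));
  assert((delinearize(4, dims) == std::vector<int64_t>{2, 2}));
  assert((delinearize(8, dims) == std::vector<int64_t>{4, 3}));
  return 0;
}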
diff --git a/mlir/python/mlir/dialects/ext.py b/mlir/python/mlir/dialects/ext.py
index 15651a1c4e858..867da6ee96637 100644
--- a/mlir/python/mlir/dialects/ext.py
+++ b/mlir/python/mlir/dialects/ext.py
@@ -34,29 +34,12 @@
"Region",
"Type",
"Attribute",
- "register_dialect",
- "register_operation",
]
Operand = ir.Value
Result = ir.OpResult
Region = ir.Region
-register_dialect = _cext.register_dialect
-
-
-def register_operation(
- dialect_cls: type, *, replace: bool = False
-) -> Callable[[type], type]:
- register = _cext.register_operation(dialect_cls, replace=replace)
-
- def decorator(op_cls: type) -> type:
- register(op_cls)
- _cext.register_op_adaptor(op_cls, replace=replace)(op_cls.Adaptor)
- return op_cls
-
- return decorator
-
def construct_instance(origin, args):
# `origin.get` is to construct an instance of MLIR type or attribute.
@@ -816,11 +799,14 @@ def _emit_module(cls) -> ir.Module:
def load(
cls,
*,
- register: bool = True,
reload: bool = False,
- replace: bool = False,
) -> None:
if hasattr(cls, "_mlir_module") and not reload:
+ if cls._mlir_module.context is not ir.Context.current:
+ raise RuntimeError(
+ "This dialect was loaded in a different context. "
+ "Please set reload=True to reload the dialect in the current context."
+ )
return
cls._mlir_module = cls._emit_module()
@@ -833,17 +819,16 @@ def load(
for op in cls.operations:
op._attach_traits()
+ _cext.globals._register_dialect_impl(cls.DIALECT_NAMESPACE, cls, replace=reload)
+
for type_ in cls.types:
typeid = ir.DynamicType.lookup_typeid(type_.type_name)
- _cext.register_type_caster(typeid, replace=replace)(type_)
+ _cext.register_type_caster(typeid, replace=reload)(type_)
for attr in cls.attributes:
typeid = ir.DynamicAttr.lookup_typeid(attr.attr_name)
- _cext.register_type_caster(typeid, replace=replace)(attr)
-
- if register:
- register_dialect(cls)
+ _cext.register_type_caster(typeid, replace=reload)(attr)
- register_dialect_operation = register_operation(cls, replace=replace)
- for op in cls.operations:
- register_dialect_operation(op)
+ for op in cls.operations:
+ _cext.register_operation(cls, replace=reload)(op)
+ _cext.register_op_adaptor(op, replace=reload)(op.Adaptor)
diff --git a/mlir/python/mlir/dialects/llvm.py b/mlir/python/mlir/dialects/llvm.py
index 1fd7e64251e61..23ed23997c3ba 100644
--- a/mlir/python/mlir/dialects/llvm.py
+++ b/mlir/python/mlir/dialects/llvm.py
@@ -6,7 +6,7 @@
from ._llvm_ops_gen import _Dialect
from ._llvm_enum_gen import *
from .._mlir_libs._mlirDialectsLLVM import *
-from ..ir import Value
+from ..ir import Value, IntegerType, IntegerAttr
from ._ods_common import get_op_result_or_op_results as _get_op_result_or_op_results
@@ -14,3 +14,16 @@ def mlir_constant(value, *, loc=None, ip=None) -> Value:
return _get_op_result_or_op_results(
ConstantOp(res=value.type, value=value, loc=loc, ip=ip)
)
+
+
+def md_const(val, *, width=32, context=None):
+ if not isinstance(val, int):
+ raise NotImplementedError(
+ f"{val=} not supported; only integers currently supported."
+ )
+ i_type = IntegerType.get_signless(width, context=context)
+ return MDConstantAttr.get(IntegerAttr.get(i_type, val), context=context)
+
+
+def md_str(s, *, context=None):
+ return MDStringAttr.get(s, context=context)
diff --git a/mlir/test/Bytecode/invalid/invalid-dense-elem-type-interface.mlir b/mlir/test/Bytecode/invalid/invalid-dense-elem-type-interface.mlir
deleted file mode 100644
index f076dcb9b2f1f..0000000000000
--- a/mlir/test/Bytecode/invalid/invalid-dense-elem-type-interface.mlir
+++ /dev/null
@@ -1,15 +0,0 @@
-// RUN: not mlir-opt %s --test-bytecode-roundtrip="test-kind=2" 2>&1 | FileCheck %s
-
-// Regression test: test-kind=2 replaces i32 with !test.i32 (a type that does
-// not implement DenseElementTypeInterface). This should produce a proper error
-// instead of an assertion failure when deserializing DenseTypedElementsAttr.
-
-// CHECK: DenseTypedElementsAttr element type must implement DenseElementTypeInterface, but got: '!test.i32'
-// CHECK: failed to read bytecode
-
-module {
- func.func @test() -> tensor<10xi32> {
- %0 = arith.constant dense<42> : tensor<10xi32>
- return %0 : tensor<10xi32>
- }
-}
diff --git a/mlir/test/Bytecode/invalid/invalid-type-remapping.mlir b/mlir/test/Bytecode/invalid/invalid-type-remapping.mlir
new file mode 100644
index 0000000000000..44d0a4eb8bb4a
--- /dev/null
+++ b/mlir/test/Bytecode/invalid/invalid-type-remapping.mlir
@@ -0,0 +1,55 @@
+// RUN: not mlir-opt %s -split-input-file --test-bytecode-roundtrip="test-kind=2" 2>&1 | FileCheck %s
+
+// Tests that proper errors are emitted (rather than crashes) when the type
+// callback replaces types with ones that are incompatible with built-in types
+// and attributes (test-kind=2 replaces i32 with !test.i32).
+
+// CHECK: expected integer or index type for IntegerAttr, but got: '!test.i32'
+// CHECK: failed to read bytecode
+// IntegerAttr whose type is replaced by one that is neither IntegerType nor
+// IndexType — previously crashed with an APInt assertion.
+module {
+ func.func @integer_attr_unsupported_type() {
+ %c = arith.constant 1 : i32
+ return
+ }
+}
+
+// -----
+
+// CHECK: failed to verify 'elementType': VectorElementTypeInterface instance
+// CHECK: failed to read bytecode
+// Fixed-size VectorType whose element type is replaced by one that does not
+// implement VectorElementTypeInterface — previously crashed in VectorType::get.
+module {
+ func.func @vector_unsupported_elem_type() {
+ %cst = arith.constant dense<42> : vector<3xi32>
+ return
+ }
+}
+
+// -----
+
+// CHECK: failed to verify 'elementType': VectorElementTypeInterface instance
+// CHECK: failed to read bytecode
+// Scalable VectorType whose element type is replaced by one that does not
+// implement VectorElementTypeInterface — exercises the VectorTypeWithScalableDims
+// bytecode path.
+module {
+ func.func @scalable_vector_unsupported_elem_type(%v : vector<[3]xi32>) {
+ return
+ }
+}
+
+// -----
+
+// CHECK: DenseTypedElementsAttr element type must implement DenseElementTypeInterface, but got: '!test.i32'
+// CHECK: failed to read bytecode
+// DenseTypedElementsAttr whose element type is replaced by one that does not
+// implement DenseElementTypeInterface — previously crashed with an assertion.
+module {
+ func.func @dense_elem_unsupported_type() -> tensor<10xi32> {
+ %0 = arith.constant dense<42> : tensor<10xi32>
+ return %0 : tensor<10xi32>
+ }
+}
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir
index 49c55f5b54496..076209cbc7a4c 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm-interface.mlir
@@ -2066,6 +2066,20 @@ func.func @gather_with_alignment(%arg0: memref<?xf32>, %arg1: vector<3xi32>, %ar
// -----
+// TODO: Implement this lowering.
+func.func @negative_gather_on_strided_memref(%arg0: memref<?xf32, strided<[2], offset: ?>>, %arg1: vector<3xi32>, %arg2: vector<3xi1>, %arg3: vector<3xf32>) -> vector<3xf32> {
+ %0 = arith.constant 0: index
+ %1 = vector.gather %arg0[%0][%arg1], %arg2, %arg3
+ : memref<?xf32, strided<[2], offset: ?>>, vector<3xi32>, vector<3xi1>, vector<3xf32> into vector<3xf32>
+ return %1 : vector<3xf32>
+}
+
+// CHECK-LABEL: func @negative_gather_on_strided_memref
+// CHECK-NOT: llvm.intr.masked.gather
+// CHECK: vector.gather
+
+// -----
+
//===----------------------------------------------------------------------===//
// vector.scatter
//===----------------------------------------------------------------------===//
@@ -2152,6 +2166,19 @@ func.func @scatter_with_alignment(%arg0: memref<?xf32>, %arg1: vector<3xi32>, %a
// CHECK-LABEL: func @scatter_with_alignment
// CHECK: llvm.intr.masked.scatter %{{.*}}, %{{.*}}, %{{.*}} {alignment = 8 : i32} : vector<3xf32>, vector<3xi1> into vector<3x!llvm.ptr>
+// -----
+
+// TODO: Implement this lowering.
+func.func @negative_scatter_on_strided_memref(%arg0: memref<?xf32, strided<[2], offset: ?>>, %arg1: vector<3xi32>, %arg2: vector<3xi1>, %arg3: vector<3xf32>) {
+ %0 = arith.constant 0: index
+ vector.scatter %arg0[%0][%arg1], %arg2, %arg3
+ : memref<?xf32, strided<[2], offset: ?>>, vector<3xi32>, vector<3xi1>, vector<3xf32>
+ return
+}
+
+// CHECK-LABEL: func @negative_scatter_on_strided_memref
+// CHECK-NOT: llvm.intr.masked.scatter
+// CHECK: vector.scatter
// -----
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-call-copy-before-write.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-call-copy-before-write.mlir
new file mode 100644
index 0000000000000..7addca2c9d6a5
--- /dev/null
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-call-copy-before-write.mlir
@@ -0,0 +1,16 @@
+// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries=1 copy-before-write=1" | FileCheck %s
+
+// Regression test for https://github.com/llvm/llvm-project/issues/163052
+// copy-before-write=1 + bufferize-function-boundaries=1 with a call to a
+// private (declaration-only) function used to crash with a stack overflow due
+// to an invalid cast of AnalysisState to OneShotAnalysisState inside
+// getCalledFunction().
+
+// CHECK-LABEL: func.func private @callee(memref<64xf32
+// CHECK-LABEL: func.func @caller
+// CHECK: call @callee
+func.func private @callee(tensor<64xf32>)
+func.func @caller(%A : tensor<64xf32>) {
+ call @callee(%A) : (tensor<64xf32>) -> ()
+ return
+}
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index f8e75cec9b7cb..bf862b2c5ae3c 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -190,15 +190,31 @@ module attributes {gpu.container_module} {
// -----
+module attributes {gpu.container_module} {
+ gpu.module @kernels_container {
+ gpu.func @kernel_1(%arg1 : !llvm.ptr) kernel {
+ gpu.return
+ }
+ }
+
+ func.func @launch_func_missing_kernel_attr(%sz : index, %arg : !llvm.ptr) {
+ // expected-error@+1 {{kernel container 'kernels' is undefined}}
+ gpu.launch_func @kernels::@kernel_1 blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz) args(%arg : !llvm.ptr)
+ return
+ }
+}
+
+// -----
+
module attributes {gpu.container_module} {
module @kernels {
+ // expected-error@+1 {{'gpu.func' op expects parent op 'gpu.module'}}
gpu.func @kernel_1(%arg1 : !llvm.ptr) kernel {
gpu.return
}
}
func.func @launch_func_missing_kernel_attr(%sz : index, %arg : !llvm.ptr) {
- // expected-error@+1 {{kernel module 'kernels' is undefined}}
gpu.launch_func @kernels::@kernel_1 blocks in (%sz, %sz, %sz) threads in (%sz, %sz, %sz) args(%arg : !llvm.ptr)
return
}
diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
index 437d06841f7e4..500988ea894de 100644
--- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir
+++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir
@@ -1117,3 +1117,39 @@ llvm.func @escapedtypename() {
%1 = llvm.alloca %0 x !llvm.struct<"bucket<string, double, '\\b'>::Iterator", (ptr, i64, i64)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
llvm.return
}
+
+// Metadata attributes and llvm.named_metadata op.
+
+llvm.func @md_kernel() {
+ llvm.return
+}
+
+// CHECK: llvm.named_metadata "foo.version" [#llvm.md_node<#llvm.md_const<1 : i32>, #llvm.md_const<0 : i32>, #llvm.md_const<0 : i32>>]
+llvm.named_metadata "foo.version" [
+ #llvm.md_node<
+ #llvm.md_const<1 : i32>,
+ #llvm.md_const<0 : i32>,
+ #llvm.md_const<0 : i32>
+ >
+]
+
+// CHECK: llvm.named_metadata "foo.language" [#llvm.md_node<#llvm.md_string<"Bar">, #llvm.md_const<1 : i32>, #llvm.md_const<2 : i32>>]
+llvm.named_metadata "foo.language" [
+ #llvm.md_node<
+ #llvm.md_string<"Bar">,
+ #llvm.md_const<1 : i32>,
+ #llvm.md_const<2 : i32>
+ >
+]
+
+// CHECK: llvm.named_metadata "foo.kernel" [#llvm.md_node<#llvm.md_func<@md_kernel>, #llvm.md_node<>, #llvm.md_node<#llvm.md_const<0 : i32>, #llvm.md_string<"foo.buffer">>>]
+llvm.named_metadata "foo.kernel" [
+ #llvm.md_node<
+ #llvm.md_func<@md_kernel>,
+ #llvm.md_node<>,
+ #llvm.md_node<
+ #llvm.md_const<0 : i32>,
+ #llvm.md_string<"foo.buffer">
+ >
+ >
+]
diff --git a/mlir/test/Dialect/Linalg/elementwise/fold.mlir b/mlir/test/Dialect/Linalg/elementwise/fold.mlir
index e83c32fb6a2cf..80fd90f3d4dbe 100644
--- a/mlir/test/Dialect/Linalg/elementwise/fold.mlir
+++ b/mlir/test/Dialect/Linalg/elementwise/fold.mlir
@@ -9,11 +9,11 @@
// CHECK-SAME: ins(%[[A]] : tensor<16x8x32xf32>) outs(%[[B]] : tensor<8x16x32xf32>) -> tensor<8x16x32xf32>
// CHECK-NEXT: return %[[RES]] : tensor<8x16x32xf32>
//
-func.func @unary_transpose(%A : tensor<16x8x32xf32>, %B: tensor<8x16x32xf32>) -> tensor<8x16x32xf32> {
+func.func @unary_transpose(%A: tensor<16x8x32xf32>, %B: tensor<8x16x32xf32>) -> tensor<8x16x32xf32> {
%empty = tensor.empty() : tensor<8x16x32xf32>
- %transposed_A = linalg.transpose ins(%A : tensor<16x8x32xf32>) outs(%empty : tensor<8x16x32xf32>) permutation = [1, 0, 2]
+ %transposed_A = linalg.transpose ins(%A : tensor<16x8x32xf32>) outs(%empty : tensor<8x16x32xf32>) permutation = [1, 0, 2]
%result = linalg.elementwise kind=#linalg.elementwise_kind<exp>
- ins(%transposed_A : tensor<8x16x32xf32>) outs(%B: tensor<8x16x32xf32>) -> tensor<8x16x32xf32>
+ ins(%transposed_A : tensor<8x16x32xf32>) outs(%B : tensor<8x16x32xf32>) -> tensor<8x16x32xf32>
return %result : tensor<8x16x32xf32>
}
@@ -28,16 +28,220 @@ func.func @unary_transpose(%A : tensor<16x8x32xf32>, %B: tensor<8x16x32xf32>) ->
// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[C]] : tensor<?x?xf32>) -> tensor<?x?xf32>
// CHECK-NEXT: return %[[RES]] : tensor<?x?xf32>
//
-func.func @binary_transposed(%A : tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
+func.func @binary_transposed(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%dim0 = tensor.dim %A, %c0 : tensor<?x?xf32>
%dim1 = tensor.dim %A, %c1 : tensor<?x?xf32>
%empty = tensor.empty(%dim1, %dim0) : tensor<?x?xf32>
- %transposed_B = linalg.transpose ins(%B : tensor<?x?xf32>) outs(%empty : tensor<?x?xf32>) permutation = [1, 0]
+ %transposed_B = linalg.transpose ins(%B : tensor<?x?xf32>) outs(%empty : tensor<?x?xf32>) permutation = [1, 0]
%result = linalg.elementwise kind=#linalg.elementwise_kind<add>
- ins(%A, %transposed_B : tensor<?x?xf32>, tensor<?x?xf32>)
- outs(%C: tensor<?x?xf32>) -> tensor<?x?xf32>
+ ins(%A, %transposed_B : tensor<?x?xf32>, tensor<?x?xf32>)
+ outs(%C : tensor<?x?xf32>) -> tensor<?x?xf32>
return %result : tensor<?x?xf32>
}
+
+// -----
+
+// CHECK-DAG: #[[IDENTITY:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+// CHECK-DAG: #[[BROADCASTED:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
+//
+// CHECK: func.func @unary_broadcasted(%[[A:.+]]: tensor<8x32xf32>, %[[B:.+]]: tensor<8x16x32xf32>) -> tensor<8x16x32xf32> {
+// CHECK-NEXT: %[[RES:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<exp>
+// CHECK-SAME: indexing_maps = [#[[BROADCASTED]], #[[IDENTITY]]]
+// CHECK-SAME: ins(%[[A]] : tensor<8x32xf32>) outs(%[[B]] : tensor<8x16x32xf32>) -> tensor<8x16x32xf32>
+// CHECK-NEXT: return %[[RES]] : tensor<8x16x32xf32>
+//
+func.func @unary_broadcasted(%A: tensor<8x32xf32>, %B: tensor<8x16x32xf32>) -> tensor<8x16x32xf32> {
+ %empty = tensor.empty() : tensor<8x16x32xf32>
+ %broadcasted_A = linalg.broadcast ins(%A : tensor<8x32xf32>) outs(%empty : tensor<8x16x32xf32>) dimensions = [1]
+ %result = linalg.elementwise kind=#linalg.elementwise_kind<exp>
+ ins(%broadcasted_A : tensor<8x16x32xf32>) outs(%B : tensor<8x16x32xf32>) -> tensor<8x16x32xf32>
+ return %result : tensor<8x16x32xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[IDENTITY:.+]] = affine_map<(d0, d1) -> (d0, d1)>
+// CHECK-DAG: #[[BROADCASTED:.+]] = affine_map<(d0, d1) -> (d0)>
+//
+// CHECK: func.func @binary_broadcasted(%[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?xf32>, %[[C:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
+// CHECK-NEXT: %[[RES:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<add>
+// CHECK-SAME: indexing_maps = [#[[IDENTITY]], #[[BROADCASTED]], #[[IDENTITY]]]
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?xf32>) outs(%[[C]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK-NEXT: return %[[RES]] : tensor<?x?xf32>
+//
+func.func @binary_broadcasted(%A: tensor<?x?xf32>, %B: tensor<?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %dim0 = tensor.dim %A, %c0 : tensor<?x?xf32>
+ %dim1 = tensor.dim %A, %c1 : tensor<?x?xf32>
+
+ %empty = tensor.empty(%dim1, %dim0) : tensor<?x?xf32>
+ %broadcasted_B = linalg.broadcast ins(%B : tensor<?xf32>) outs(%empty : tensor<?x?xf32>) dimensions = [1]
+ %result = linalg.elementwise kind=#linalg.elementwise_kind<add>
+ ins(%A, %broadcasted_B : tensor<?x?xf32>, tensor<?x?xf32>)
+ outs(%C : tensor<?x?xf32>) -> tensor<?x?xf32>
+ return %result : tensor<?x?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[IDENTITY:.+]] = affine_map<(d0, d1) -> (d0, d1)>
+// CHECK-DAG: #[[COMPOSED_MAP:.+]] = affine_map<(d0, d1) -> (d0)>
+//
+// CHECK: func.func @fold_broadcast_after_transpose_fold(%[[A:.+]]: tensor<16xf32>, %[[B:.+]]: tensor<16x32xf32>) -> tensor<16x32xf32> {
+// CHECK-NEXT: %[[RES:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<exp>
+// CHECK-SAME: indexing_maps = [#[[COMPOSED_MAP]], #[[IDENTITY]]]
+// CHECK-SAME: ins(%[[A]] : tensor<16xf32>) outs(%[[B]] : tensor<16x32xf32>) -> tensor<16x32xf32>
+// CHECK-NEXT: return %[[RES]] : tensor<16x32xf32>
+//
+#identity = affine_map<(d0, d1) -> (d0, d1)>
+#transpose = affine_map<(d0, d1) -> (d1, d0)>
+
+func.func @fold_broadcast_after_transpose_fold(%A: tensor<16xf32>, %B: tensor<16x32xf32>) -> tensor<16x32xf32> {
+ %empty_b = tensor.empty() : tensor<32x16xf32>
+
+ %broadcasted_A = linalg.broadcast ins(%A : tensor<16xf32>) outs(%empty_b : tensor<32x16xf32>) dimensions = [0]
+
+ %result = linalg.elementwise kind=#linalg.elementwise_kind<exp>
+ indexing_maps = [#transpose, #identity]
+ ins(%broadcasted_A : tensor<32x16xf32>) outs(%B : tensor<16x32xf32>) -> tensor<16x32xf32>
+ return %result : tensor<16x32xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[IDENTITY:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+// CHECK-DAG: #[[COMPOSED_MAP:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+//
+// CHECK: func.func @fold_transpose_after_broadcast_fold(%[[A:.+]]: tensor<32x16xf32>, %[[B:.+]]: tensor<8x16x32xf32>) -> tensor<8x16x32xf32> {
+// CHECK-NEXT: %[[RES:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<exp>
+// CHECK-SAME: indexing_maps = [#[[COMPOSED_MAP]], #[[IDENTITY]]]
+// CHECK-SAME: ins(%[[A]] : tensor<32x16xf32>) outs(%[[B]] : tensor<8x16x32xf32>) -> tensor<8x16x32xf32>
+// CHECK-NEXT: return %[[RES]] : tensor<8x16x32xf32>
+//
+#identity = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#broadcast = affine_map<(d0, d1, d2) -> (d1, d2)>
+
+func.func @fold_transpose_after_broadcast_fold(%A: tensor<32x16xf32>, %B: tensor<8x16x32xf32>) -> tensor<8x16x32xf32> {
+ %empty_t = tensor.empty() : tensor<16x32xf32>
+ %transposed_A = linalg.transpose ins(%A : tensor<32x16xf32>) outs(%empty_t : tensor<16x32xf32>) permutation = [1, 0]
+
+ %result = linalg.elementwise kind=#linalg.elementwise_kind<exp>
+ indexing_maps = [#broadcast, #identity]
+ ins(%transposed_A : tensor<16x32xf32>) outs(%B : tensor<8x16x32xf32>) -> tensor<8x16x32xf32>
+ return %result : tensor<8x16x32xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[IDENTITY:.+]] = affine_map<(d0, d1) -> (d0, d1)>
+// CHECK-DAG: #[[COMPOSED_MAP:.+]] = affine_map<(d0, d1) -> (d0)>
+//
+// CHECK: func.func @fold_broadcast_after_transpose_fold_binary(%[[A:.+]]: tensor<?xf32>, %[[B:.+]]: tensor<?x?xf32>, %[[C:.+]]: tensor<?x?xf32>) -> tensor<?x?xf32> {
+// CHECK-NEXT: %[[RES:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<add>
+// CHECK-SAME: indexing_maps = [#[[COMPOSED_MAP]], #[[IDENTITY]], #[[IDENTITY]]]
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?xf32>, tensor<?x?xf32>) outs(%[[C]] : tensor<?x?xf32>) -> tensor<?x?xf32>
+// CHECK-NEXT: return %[[RES]] : tensor<?x?xf32>
+//
+#identity = affine_map<(d0, d1) -> (d0, d1)>
+#transpose = affine_map<(d0, d1) -> (d1, d0)>
+
+func.func @fold_broadcast_after_transpose_fold_binary(%A: tensor<?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %dim0 = tensor.dim %B, %c0 : tensor<?x?xf32>
+ %dim1 = tensor.dim %B, %c1 : tensor<?x?xf32>
+
+ %empty_b = tensor.empty(%dim1, %dim0) : tensor<?x?xf32>
+ %broadcasted_A = linalg.broadcast ins(%A : tensor<?xf32>) outs(%empty_b : tensor<?x?xf32>) dimensions = [0]
+
+ %result = linalg.elementwise kind=#linalg.elementwise_kind<add>
+ indexing_maps = [#transpose, #identity, #identity]
+ ins(%broadcasted_A, %B : tensor<?x?xf32>, tensor<?x?xf32>) outs(%C : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+ return %result : tensor<?x?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[IDENTITY:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+// CHECK-DAG: #[[COMPOSED_MAP:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+//
+// CHECK: func.func @fold_transpose_after_broadcast_fold_binary(%[[A:.+]]: tensor<?x?xf32>, %[[B:.+]]: tensor<?x?x?xf32>, %[[C:.+]]: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
+// CHECK-NEXT: %[[RES:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<add>
+// CHECK-SAME: indexing_maps = [#[[COMPOSED_MAP]], #[[IDENTITY]], #[[IDENTITY]]]
+// CHECK-SAME: ins(%[[A]], %[[B]] : tensor<?x?xf32>, tensor<?x?x?xf32>) outs(%[[C]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// CHECK-NEXT: return %[[RES]] : tensor<?x?x?xf32>
+//
+#identity = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#broadcast = affine_map<(d0, d1, d2) -> (d1, d2)>
+
+func.func @fold_transpose_after_broadcast_fold_binary(%A: tensor<?x?xf32>, %B: tensor<?x?x?xf32>, %C: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %dim0 = tensor.dim %B, %c0 : tensor<?x?x?xf32>
+ %dim1 = tensor.dim %B, %c1 : tensor<?x?x?xf32>
+ %dim2 = tensor.dim %B, %c2 : tensor<?x?x?xf32>
+
+ %empty_t = tensor.empty(%dim1, %dim2) : tensor<?x?xf32>
+ %transposed_A = linalg.transpose ins(%A : tensor<?x?xf32>) outs(%empty_t : tensor<?x?xf32>) permutation = [1, 0]
+
+ %result = linalg.elementwise kind=#linalg.elementwise_kind<add>
+ indexing_maps = [#broadcast, #identity, #identity]
+ ins(%transposed_A, %B : tensor<?x?xf32>, tensor<?x?x?xf32>) outs(%C : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+ return %result : tensor<?x?x?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[IDENTITY:.+]] = affine_map<(d0) -> (d0)>
+// CHECK-DAG: #[[DIAGONAL:.+]] = affine_map<(d0) -> (d0, d0)>
+//
+// CHECK: func.func @fold_failed_diagonal_map(%[[A:.+]]: tensor<16xf32>, %[[B:.+]]: tensor<16xf32>, %[[C:.+]]: tensor<16xf32>) -> tensor<16xf32> {
+// CHECK-NEXT: %[[EMPTY:.+]] = tensor.empty() : tensor<16x16xf32>
+// CHECK-NEXT: %[[BROADCASTED_B:.+]] = linalg.broadcast ins(%[[B]] : tensor<16xf32>) outs(%[[EMPTY]] : tensor<16x16xf32>) dimensions = [0]
+// CHECK-NEXT: %[[RES:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<add>
+// CHECK-SAME: indexing_maps = [#[[IDENTITY]], #[[DIAGONAL]], #[[IDENTITY]]]
+// CHECK-SAME: ins(%[[A]], %[[BROADCASTED_B]] : tensor<16xf32>, tensor<16x16xf32>) outs(%[[C]] : tensor<16xf32>) -> tensor<16xf32>
+// CHECK-NEXT: return %[[RES]] : tensor<16xf32>
+//
+#identity = affine_map<(d0) -> (d0)>
+#diagonal = affine_map<(d0) -> (d0, d0)>
+
+func.func @fold_failed_diagonal_map(%A: tensor<16xf32>, %B: tensor<16xf32>, %C: tensor<16xf32>) -> tensor<16xf32> {
+ %empty = tensor.empty() : tensor<16x16xf32>
+ %broadcasted_B = linalg.broadcast ins(%B : tensor<16xf32>) outs(%empty : tensor<16x16xf32>) dimensions = [0]
+ %result = linalg.elementwise kind=#linalg.elementwise_kind<add>
+ indexing_maps = [#identity, #diagonal, #identity]
+ ins(%A, %broadcasted_B : tensor<16xf32>, tensor<16x16xf32>) outs(%C : tensor<16xf32>) -> tensor<16xf32>
+ return %result : tensor<16xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[IDENTITY:.+]] = affine_map<(d0) -> (d0)>
+// CHECK-DAG: #[[CONSTANT:.+]] = affine_map<(d0) -> (0, d0)>
+//
+// CHECK: func.func @fold_failed_constant_map(%[[A:.+]]: tensor<16xf32>, %[[B:.+]]: tensor<16x32xf32>, %[[C:.+]]: tensor<16xf32>) -> tensor<16xf32> {
+// CHECK-NEXT: %[[EMPTY:.+]] = tensor.empty() : tensor<32x16xf32>
+// CHECK-NEXT: %[[TRANSPOSED_B:.+]] = linalg.transpose ins(%[[B]] : tensor<16x32xf32>) outs(%[[EMPTY]] : tensor<32x16xf32>) permutation = [1, 0]
+// CHECK-NEXT: %[[RES:.+]] = linalg.elementwise kind=#linalg.elementwise_kind<add>
+// CHECK-SAME: indexing_maps = [#[[IDENTITY]], #[[CONSTANT]], #[[IDENTITY]]]
+// CHECK-SAME: ins(%[[A]], %[[TRANSPOSED_B]] : tensor<16xf32>, tensor<32x16xf32>) outs(%[[C]] : tensor<16xf32>) -> tensor<16xf32>
+// CHECK-NEXT: return %[[RES]] : tensor<16xf32>
+//
+#identity = affine_map<(d0) -> (d0)>
+#constant = affine_map<(d0) -> (0, d0)>
+
+func.func @fold_failed_constant_map(%A: tensor<16xf32>, %B: tensor<16x32xf32>, %C: tensor<16xf32>) -> tensor<16xf32> {
+ %empty = tensor.empty() : tensor<32x16xf32>
+ %transposed_B = linalg.transpose ins(%B : tensor<16x32xf32>) outs(%empty : tensor<32x16xf32>) permutation = [1, 0]
+ %result = linalg.elementwise kind=#linalg.elementwise_kind<add>
+ indexing_maps = [#identity, #constant, #identity]
+ ins(%A, %transposed_B : tensor<16xf32>, tensor<32x16xf32>) outs(%C : tensor<16xf32>) -> tensor<16xf32>
+ return %result : tensor<16xf32>
+}
diff --git a/mlir/test/Dialect/Linalg/linalg-morph-multi-step.mlir b/mlir/test/Dialect/Linalg/linalg-morph-multi-step.mlir
index bdd29b96346e1..7bad1b7a44d92 100644
--- a/mlir/test/Dialect/Linalg/linalg-morph-multi-step.mlir
+++ b/mlir/test/Dialect/Linalg/linalg-morph-multi-step.mlir
@@ -2,13 +2,49 @@
// RUN: mlir-opt %s -linalg-morph-ops=named-to-generic | mlir-opt -linalg-morph-ops=generic-to-named | \
// RUN: FileCheck %s --check-prefix=ROUND_TRIP
-func.func @exp(%A : tensor<16x8xf32>, %B : tensor<16x8xf32>) -> tensor<16x8xf32> {
+func.func @unary_ops(%A : tensor<16x8xf32>, %B : tensor<16x8xf32>) -> tensor<16x8xf32> {
%exp = linalg.exp ins(%A : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>
- return %exp : tensor<16x8xf32>
+ %log = linalg.log ins(%exp : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>
+ %abs = linalg.abs ins(%log : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>
+ %ceil = linalg.ceil ins(%abs : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>
+ %floor = linalg.floor ins(%ceil : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>
+ %negf = linalg.negf ins(%floor : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>
+ %recip = linalg.reciprocal ins(%negf : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>
+ %round = linalg.round ins(%recip : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>
+ %sqrt = linalg.sqrt ins(%round : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>
+ %rsqrt = linalg.rsqrt ins(%sqrt : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>
+ %square = linalg.square ins(%rsqrt : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>
+ %tanh = linalg.tanh ins(%square : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>
+ %erf = linalg.erf ins(%tanh : tensor<16x8xf32>) outs(%B : tensor<16x8xf32>) -> tensor<16x8xf32>
+ return %erf : tensor<16x8xf32>
}
-// NAMED_TO_GENERIC: linalg.generic
+// NAMED_TO_GENERIC-COUNT-13: linalg.generic
// NAMED_TO_GENERIC-NOT: linalg.exp
+// NAMED_TO_GENERIC-NOT: linalg.log
+// NAMED_TO_GENERIC-NOT: linalg.abs
+// NAMED_TO_GENERIC-NOT: linalg.ceil
+// NAMED_TO_GENERIC-NOT: linalg.floor
+// NAMED_TO_GENERIC-NOT: linalg.negf
+// NAMED_TO_GENERIC-NOT: linalg.reciprocal
+// NAMED_TO_GENERIC-NOT: linalg.round
+// NAMED_TO_GENERIC-NOT: linalg.sqrt
+// NAMED_TO_GENERIC-NOT: linalg.rsqrt
+// NAMED_TO_GENERIC-NOT: linalg.square
+// NAMED_TO_GENERIC-NOT: linalg.tanh
+// NAMED_TO_GENERIC-NOT: linalg.erf
// ROUND_TRIP: linalg.exp
+// ROUND_TRIP: linalg.log
+// ROUND_TRIP: linalg.abs
+// ROUND_TRIP: linalg.ceil
+// ROUND_TRIP: linalg.floor
+// ROUND_TRIP: linalg.negf
+// ROUND_TRIP: linalg.reciprocal
+// ROUND_TRIP: linalg.round
+// ROUND_TRIP: linalg.sqrt
+// ROUND_TRIP: linalg.rsqrt
+// ROUND_TRIP: linalg.square
+// ROUND_TRIP: linalg.tanh
+// ROUND_TRIP: linalg.erf
// ROUND_TRIP-NOT: linalg.generic
diff --git a/mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-named-ops.mlir b/mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-named-ops.mlir
index 19b30ef10da84..69a1a7f650810 100644
--- a/mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/roundtrip-morphism-linalg-named-ops.mlir
@@ -5,17 +5,65 @@
// RUN: | mlir-opt -split-input-file -linalg-morph-ops=generic-to-named \
// RUN: | FileCheck %s
-func.func @unary_exp(%A: memref<7x14x21xf32>, %Out: memref<7x14x21xf32>) {
+func.func @unary_ops(%A: memref<7x14x21xf32>, %Out: memref<7x14x21xf32>) {
linalg.exp ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
+ linalg.log ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
+ linalg.abs ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
+ linalg.ceil ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
+ linalg.floor ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
+ linalg.negf ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
+ linalg.reciprocal ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
+ linalg.round ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
+ linalg.sqrt ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
+ linalg.rsqrt ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
+ linalg.square ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
+ linalg.tanh ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
+ linalg.erf ins(%A : memref<7x14x21xf32>) outs(%Out : memref<7x14x21xf32>)
return
}
-// CHECK-LABEL: unary_exp
+// CHECK-LABEL: unary_ops
// CHECK-SAME: %[[A:.+]]: memref<7x14x21xf32>, %[[OUT:.+]]: memref<7x14x21xf32>)
// CHECK-NOT: linalg.generic
// CHECK: linalg.exp
// CHECK-SAME: ins(%[[A]] : memref<7x14x21xf32>)
// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+// CHECK: linalg.log
+// CHECK-SAME: ins(%[[A]] : memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+// CHECK: linalg.abs
+// CHECK-SAME: ins(%[[A]] : memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+// CHECK: linalg.ceil
+// CHECK-SAME: ins(%[[A]] : memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+// CHECK: linalg.floor
+// CHECK-SAME: ins(%[[A]] : memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+// CHECK: linalg.negf
+// CHECK-SAME: ins(%[[A]] : memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+// CHECK: linalg.reciprocal
+// CHECK-SAME: ins(%[[A]] : memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+// CHECK: linalg.round
+// CHECK-SAME: ins(%[[A]] : memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+// CHECK: linalg.sqrt
+// CHECK-SAME: ins(%[[A]] : memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+// CHECK: linalg.rsqrt
+// CHECK-SAME: ins(%[[A]] : memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+// CHECK: linalg.square
+// CHECK-SAME: ins(%[[A]] : memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+// CHECK: linalg.tanh
+// CHECK-SAME: ins(%[[A]] : memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
+// CHECK: linalg.erf
+// CHECK-SAME: ins(%[[A]] : memref<7x14x21xf32>)
+// CHECK-SAME: outs(%[[OUT]] : memref<7x14x21xf32>)
// -----
diff --git a/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir b/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
index 029d11a4f60de..9cc24dd07ae47 100644
--- a/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
+++ b/mlir/test/Dialect/Linalg/specialize-generic-ops.mlir
@@ -5,26 +5,147 @@
// RUN: | FileCheck %s --check-prefix=CATEGORY,ALL
#umap = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-func.func @unary_op_exp(%A: tensor<?x?x?xf32>, %Out: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
+func.func @unary_ops(%A: tensor<?x?x?xf32>, %Out: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
%0 = linalg.generic
{indexing_maps = [#umap, #umap],
iterator_types = ["parallel", "parallel","parallel"]}
ins(%A : tensor<?x?x?xf32>)
outs(%Out : tensor<?x?x?xf32>) {
^bb0(%in: f32, %out: f32):
- %1 = math.exp %in : f32
- linalg.yield %1 : f32
+ %v = math.exp %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %1 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%0 : tensor<?x?x?xf32>) outs(%Out : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.log %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %2 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%1 : tensor<?x?x?xf32>) outs(%Out : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.absf %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %3 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%2 : tensor<?x?x?xf32>) outs(%Out : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.ceil %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %4 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%3 : tensor<?x?x?xf32>) outs(%Out : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.floor %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %5 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%4 : tensor<?x?x?xf32>) outs(%Out : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = arith.negf %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %cst_1 = arith.constant 1.0 : f32
+ %6 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%5 : tensor<?x?x?xf32>) outs(%Out : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = arith.divf %cst_1, %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %7 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%6 : tensor<?x?x?xf32>) outs(%Out : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.round %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %8 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%7 : tensor<?x?x?xf32>) outs(%Out : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.sqrt %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %9 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%8 : tensor<?x?x?xf32>) outs(%Out : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.rsqrt %in : f32
+ linalg.yield %v : f32
} -> tensor<?x?x?xf32>
- return %0 : tensor<?x?x?xf32>
+ %10 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%9 : tensor<?x?x?xf32>) outs(%Out : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = arith.mulf %in, %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %11 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%10 : tensor<?x?x?xf32>) outs(%Out : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.tanh %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %12 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%11 : tensor<?x?x?xf32>) outs(%Out : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.erf %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ return %12 : tensor<?x?x?xf32>
}
-// ALL-LABEL: unary_op_exp
+// ALL-LABEL: unary_ops
// ALL-SAME: %[[A:.+]]: tensor<?x?x?xf32>, %[[OUT:.+]]: tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
// NAMED-NOT: linalg.generic
-// NAMED: linalg.exp
+// NAMED: %[[RES0:.+]] = linalg.exp
// NAMED-SAME: ins(%[[A]] : tensor<?x?x?xf32>)
// NAMED-SAME: outs(%[[OUT]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// NAMED: %[[RES1:.+]] = linalg.log
+// NAMED-SAME: ins(%[[RES0]] : tensor<?x?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// NAMED: %[[RES2:.+]] = linalg.abs
+// NAMED-SAME: ins(%[[RES1]] : tensor<?x?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// NAMED: %[[RES3:.+]] = linalg.ceil
+// NAMED-SAME: ins(%[[RES2]] : tensor<?x?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// NAMED: %[[RES4:.+]] = linalg.floor
+// NAMED-SAME: ins(%[[RES3]] : tensor<?x?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// NAMED: %[[RES5:.+]] = linalg.negf
+// NAMED-SAME: ins(%[[RES4]] : tensor<?x?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// NAMED: %[[RES6:.+]] = linalg.reciprocal
+// NAMED-SAME: ins(%[[RES5]] : tensor<?x?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// NAMED: %[[RES7:.+]] = linalg.round
+// NAMED-SAME: ins(%[[RES6]] : tensor<?x?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// NAMED: %[[RES8:.+]] = linalg.sqrt
+// NAMED-SAME: ins(%[[RES7]] : tensor<?x?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// NAMED: %[[RES9:.+]] = linalg.rsqrt
+// NAMED-SAME: ins(%[[RES8]] : tensor<?x?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// NAMED: %[[RES10:.+]] = linalg.square
+// NAMED-SAME: ins(%[[RES9]] : tensor<?x?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// NAMED: %[[RES11:.+]] = linalg.tanh
+// NAMED-SAME: ins(%[[RES10]] : tensor<?x?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// NAMED: %[[RES12:.+]] = linalg.erf
+// NAMED-SAME: ins(%[[RES11]] : tensor<?x?x?xf32>)
+// NAMED-SAME: outs(%[[OUT]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
// Not supported yet.
// CATEGORY: linalg.generic
diff --git a/mlir/test/Dialect/Linalg/transform-op-decompose.mlir b/mlir/test/Dialect/Linalg/transform-op-decompose.mlir
index 6b03885069a37..3897f8502bb04 100644
--- a/mlir/test/Dialect/Linalg/transform-op-decompose.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-decompose.mlir
@@ -1,11 +1,26 @@
// RUN: mlir-opt --transform-interpreter --split-input-file %s | FileCheck %s
-// Test the same patterns on generic convolution ops by first generalizing the
-// named ops. This avoids duplicating lit tests for linalg.generic conv ops.
-// RUN: mlir-opt --linalg-generalize-named-ops --transform-interpreter --split-input-file %s | FileCheck %s
+
+// Expected indexing maps for batchless conv_1d_nwc_wcf.
+// CHECK-DAG: #[[$CONV_I:.+]] = affine_map<(d0, d1, d2, d3) -> (d0 + d2, d3)>
+// CHECK-DAG: #[[$CONV_F:.+]] = affine_map<(d0, d1, d2, d3) -> (d2, d3, d1)>
+// CHECK-DAG: #[[$CONV_O:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
+
+// Expected indexing maps for batchless depthwise_conv_1d_wc_wcf.
+// CHECK-DAG: #[[$DW_I:.+]] = affine_map<(d0, d1, d2) -> (d0 + d2, d1)>
+// CHECK-DAG: #[[$DW_F:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
+
+// Expected indexing maps for batchless pooling_cw_min.
+// CHECK-DAG: #[[$POOL_I:.+]] = affine_map<(d0, d1, d2) -> (d0, d1 + d2)>
+// CHECK-DAG: #[[$POOL_F:.+]] = affine_map<(d0, d1, d2) -> (d2)>
// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
+// Expected indexing maps for 1D conv (cross-conv after downscale from generic).
+// CHECK-DAG: #[[$CROSS_1D_I:.+]] = affine_map<(d0, d1) -> (d0 + d1)>
+// CHECK-DAG: #[[$CROSS_1D_F:.+]] = affine_map<(d0, d1) -> (d1)>
+// CHECK-DAG: #[[$CROSS_1D_O:.+]] = affine_map<(d0, d1) -> (d0)>
+
// CHECK-LABEL: @conv_2d_nhwc_hwcf
// CHECK-SAME: %[[ARG0:.+]]: tensor<?x1x?x?xf32>,
// CHECK-SAME: %[[ARG1:.+]]: tensor<1x?x?x?xf32>
@@ -42,10 +57,11 @@ func.func @conv_2d_nchw_fchw(%input: tensor<?x?x1x?xf32>, %filter: tensor<?x?x1x
return %0 : tensor<?x?x1x?xf32>
}
-// CHECK-LABEL: @depthwise_conv_2d_nhwc_hwc
+// Depthwise conv with height=1 (downscales height dimension)
+// CHECK-LABEL: @depthwise_conv_2d_nhwc_hwc_height
// CHECK-SAME: %[[ARG0:.+]]: tensor<1x1x113x96xf32>
// CHECK-SAME: %[[ARG1:.+]]: tensor<1x3x96xf32>
-func.func @depthwise_conv_2d_nhwc_hwc(%input: tensor<1x1x113x96xf32>, %filter: tensor<1x3x96xf32>) -> tensor<1x1x56x96xf32> {
+func.func @depthwise_conv_2d_nhwc_hwc_height(%input: tensor<1x1x113x96xf32>, %filter: tensor<1x3x96xf32>) -> tensor<1x1x56x96xf32> {
// CHECK: %[[RES:.+]] = tensor.empty
%init = tensor.empty() : tensor<1x1x56x96xf32>
// CHECK: %[[SLICE0:.+]] = tensor.extract_slice %[[ARG0]]
@@ -62,6 +78,27 @@ func.func @depthwise_conv_2d_nhwc_hwc(%input: tensor<1x1x113x96xf32>, %filter: t
return %0: tensor<1x1x56x96xf32>
}
+// Depthwise conv with width=1 (downscales width dimension)
+// CHECK-LABEL: @depthwise_conv_2d_nhwc_hwc_width
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x113x1x96xf32>
+// CHECK-SAME: %[[ARG1:.+]]: tensor<3x1x96xf32>
+func.func @depthwise_conv_2d_nhwc_hwc_width(%input: tensor<1x113x1x96xf32>, %filter: tensor<3x1x96xf32>) -> tensor<1x56x1x96xf32> {
+ // CHECK: %[[RES:.+]] = tensor.empty
+ %init = tensor.empty() : tensor<1x56x1x96xf32>
+ // CHECK: %[[SLICE0:.+]] = tensor.extract_slice %[[ARG0]]
+ // CHECK: %[[SLICE1:.+]] = tensor.extract_slice %[[ARG1]]
+ // CHECK: %[[SLICERES:.+]] = tensor.extract_slice %[[RES]]
+ // CHECK: %[[OPRES:.+]] = linalg.depthwise_conv_1d_nwc_wc
+ // CHECK-SAME: ins(%[[SLICE0]], %[[SLICE1]]
+ // CHECK-SAME: outs(%[[SLICERES]]
+ // CHECK: %[[INSERTED:.+]] = tensor.insert_slice %[[OPRES]] into %[[RES]]
+ %0 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>}
+ ins(%input, %filter: tensor<1x113x1x96xf32>, tensor<3x1x96xf32>)
+ outs(%init: tensor<1x56x1x96xf32>) -> tensor<1x56x1x96xf32>
+ // CHECK: %[[INSERTED]]
+ return %0: tensor<1x56x1x96xf32>
+}
+
// CHECK-LABEL: @conv_2d
// CHECK-SAME: (%[[ARG0:[0-9a-z]+]]: tensor<1x?xf32>,
// CHECK-SAME: %[[ARG1:[0-9a-z]+]]: tensor<1x?xf32>,
@@ -205,6 +242,125 @@ func.func @pooling_nchw_max(%input: tensor<?x?x1x?xf32>, %filter: tensor<1x?xf32
return %0 : tensor<?x?x1x?xf32>
}
+#map_conv_i = affine_map<(oh, ow, f, kh, kw, c) -> (oh + kh, ow + kw, c)>
+#map_conv_f = affine_map<(oh, ow, f, kh, kw, c) -> (kh, kw, c, f)>
+#map_conv_o = affine_map<(oh, ow, f, kh, kw, c) -> (oh, ow, f)>
+
+// CHECK-LABEL: @batchless_conv_2d_hwc_hwcf
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x14x8xf32>
+// CHECK-SAME: %[[ARG1:.+]]: tensor<1x3x8x16xf32>
+// CHECK-SAME: %[[ARG2:.+]]: tensor<1x12x16xf32>
+func.func @batchless_conv_2d_hwc_hwcf(%input: tensor<1x14x8xf32>, %filter: tensor<1x3x8x16xf32>, %output: tensor<1x12x16xf32>) -> tensor<1x12x16xf32> {
+ // CHECK: %[[SLICE0:.+]] = tensor.extract_slice %[[ARG0]]
+ // CHECK: %[[SLICE1:.+]] = tensor.extract_slice %[[ARG1]]
+ // CHECK: %[[SLICE2:.+]] = tensor.extract_slice %[[ARG2]]
+ // CHECK: %[[SLICERES:.+]] = linalg.generic
+ // CHECK-SAME: indexing_maps = [#[[$CONV_I]], #[[$CONV_F]], #[[$CONV_O]]]
+ // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction", "reduction"]
+ // CHECK: %[[RES:.+]] = tensor.insert_slice %[[SLICERES]] into %[[ARG2]]
+ %0 = linalg.generic {
+ indexing_maps = [#map_conv_i, #map_conv_f, #map_conv_o],
+ iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction", "reduction"]
+ } ins(%input, %filter : tensor<1x14x8xf32>, tensor<1x3x8x16xf32>)
+ outs(%output : tensor<1x12x16xf32>) {
+ ^bb0(%in: f32, %fil: f32, %out: f32):
+ %mul = arith.mulf %in, %fil : f32
+ %add = arith.addf %out, %mul : f32
+ linalg.yield %add : f32
+ } -> tensor<1x12x16xf32>
+ // CHECK: return %[[RES]]
+ return %0 : tensor<1x12x16xf32>
+}
+
+#map_dw_i = affine_map<(oh, ow, c, kh, kw) -> (oh + kh, ow + kw, c)>
+#map_dw_f = affine_map<(oh, ow, c, kh, kw) -> (kh, kw, c)>
+#map_dw_o = affine_map<(oh, ow, c, kh, kw) -> (oh, ow, c)>
+
+// CHECK-LABEL: @batchless_depthwise_conv_2d_hwc_hwc
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x14x8xf32>
+// CHECK-SAME: %[[ARG1:.+]]: tensor<1x3x8xf32>
+// CHECK-SAME: %[[ARG2:.+]]: tensor<1x12x8xf32>
+func.func @batchless_depthwise_conv_2d_hwc_hwc(%input: tensor<1x14x8xf32>, %filter: tensor<1x3x8xf32>, %output: tensor<1x12x8xf32>) -> tensor<1x12x8xf32> {
+ // CHECK: %[[SLICE0:.+]] = tensor.extract_slice %[[ARG0]]
+ // CHECK: %[[SLICE1:.+]] = tensor.extract_slice %[[ARG1]]
+ // CHECK: %[[SLICE2:.+]] = tensor.extract_slice %[[ARG2]]
+ // CHECK: %[[SLICERES:.+]] = linalg.generic
+ // CHECK-SAME: indexing_maps = [#[[$DW_I]], #[[$DW_F]], #[[$MAP1]]]
+ // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"]
+ // CHECK: %[[RES:.+]] = tensor.insert_slice %[[SLICERES]] into %[[ARG2]]
+ %0 = linalg.generic {
+ indexing_maps = [#map_dw_i, #map_dw_f, #map_dw_o],
+ iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]
+ } ins(%input, %filter : tensor<1x14x8xf32>, tensor<1x3x8xf32>)
+ outs(%output : tensor<1x12x8xf32>) {
+ ^bb0(%in: f32, %fil: f32, %out: f32):
+ %mul = arith.mulf %in, %fil : f32
+ %add = arith.addf %out, %mul : f32
+ linalg.yield %add : f32
+ } -> tensor<1x12x8xf32>
+ // CHECK: return %[[RES]]
+ return %0 : tensor<1x12x8xf32>
+}
+
+#map_pool_i = affine_map<(c, oh, ow, kh, kw) -> (c, oh + kh, ow + kw)>
+#map_pool_f = affine_map<(c, oh, ow, kh, kw) -> (kh, kw)>
+#map_pool_o = affine_map<(c, oh, ow, kh, kw) -> (c, oh, ow)>
+
+// CHECK-LABEL: @batchless_pooling_chw_min
+// CHECK-SAME: %[[ARG0:.+]]: tensor<8x1x14xf32>
+// CHECK-SAME: %[[ARG1:.+]]: tensor<1x3xf32>
+// CHECK-SAME: %[[ARG2:.+]]: tensor<8x1x12xf32>
+func.func @batchless_pooling_chw_min(%input: tensor<8x1x14xf32>, %filter: tensor<1x3xf32>, %output: tensor<8x1x12xf32>) -> tensor<8x1x12xf32> {
+ // CHECK: %[[SLICE0:.+]] = tensor.extract_slice %[[ARG0]]
+ // CHECK: %[[SLICE1:.+]] = tensor.extract_slice %[[ARG1]]
+ // CHECK: %[[SLICE2:.+]] = tensor.extract_slice %[[ARG2]]
+ // CHECK: %[[SLICERES:.+]] = linalg.generic
+ // CHECK-SAME: indexing_maps = [#[[$POOL_I]], #[[$POOL_F]], #[[$MAP1]]]
+ // CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"]
+ // CHECK: %[[RES:.+]] = tensor.insert_slice %[[SLICERES]] into %[[ARG2]]
+ %0 = linalg.generic {
+ indexing_maps = [#map_pool_i, #map_pool_f, #map_pool_o],
+ iterator_types = ["parallel", "parallel", "parallel", "reduction", "reduction"]
+ } ins(%input, %filter : tensor<8x1x14xf32>, tensor<1x3xf32>)
+ outs(%output : tensor<8x1x12xf32>) {
+ ^bb0(%in: f32, %fil: f32, %out: f32):
+ %min = arith.minimumf %out, %in : f32
+ linalg.yield %min : f32
+ } -> tensor<8x1x12xf32>
+ // CHECK: return %[[RES]]
+ return %0 : tensor<8x1x12xf32>
+}
+
+#map_cross_i = affine_map<(d0, d1, d2, d3) -> (d0 + d3, d1 + d2)>
+#map_cross_f = affine_map<(d0, d1, d2, d3) -> (d2, d3)>
+#map_cross_o = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
+
+// CHECK-LABEL: @cross_conv_nonstandard_loop_order
+// CHECK-SAME: %[[ARG0:.+]]: tensor<1x15xf32>
+// CHECK-SAME: %[[ARG1:.+]]: tensor<3x1xf32>
+// CHECK-SAME: %[[ARG2:.+]]: tensor<1x12xf32>
+func.func @cross_conv_nonstandard_loop_order(%input: tensor<1x15xf32>, %filter: tensor<3x1xf32>, %output: tensor<1x12xf32>) -> tensor<1x12xf32> {
+ // CHECK: %[[SLICE0:.+]] = tensor.extract_slice %[[ARG0]]
+ // CHECK: %[[SLICE1:.+]] = tensor.extract_slice %[[ARG1]]
+ // CHECK: %[[SLICE2:.+]] = tensor.extract_slice %[[ARG2]]
+ // CHECK: %[[SLICERES:.+]] = linalg.generic
+ // CHECK-SAME: indexing_maps = [#[[$CROSS_1D_I]], #[[$CROSS_1D_F]], #[[$CROSS_1D_O]]]
+ // CHECK-SAME: iterator_types = ["parallel", "reduction"]
+ // CHECK: %[[RES:.+]] = tensor.insert_slice %[[SLICERES]] into %[[ARG2]]
+ // CHECK: return %[[RES]]
+ %0 = linalg.generic {
+ indexing_maps = [#map_cross_i, #map_cross_f, #map_cross_o],
+ iterator_types = ["parallel", "parallel", "reduction", "reduction"]
+ } ins(%input, %filter : tensor<1x15xf32>, tensor<3x1xf32>)
+ outs(%output : tensor<1x12xf32>) {
+ ^bb0(%in: f32, %fil: f32, %out: f32):
+ %mul = arith.mulf %in, %fil : f32
+ %add = arith.addf %out, %mul : f32
+ linalg.yield %add : f32
+ } -> tensor<1x12xf32>
+ return %0 : tensor<1x12xf32>
+}
+
func.func @softmax(%arg0: tensor<2x16x32xf32>, %dst: tensor<2x16x32xf32>) -> tensor<2x16x32xf32> {
%1 = linalg.softmax dimension(2) ins(%arg0 : tensor<2x16x32xf32>) outs(%dst: tensor<2x16x32xf32>) -> tensor<2x16x32xf32>
return %1 : tensor<2x16x32xf32>
diff --git a/mlir/test/Dialect/Linalg/transform-op-specialize-elemwise-unary.mlir b/mlir/test/Dialect/Linalg/transform-op-specialize-elemwise-unary.mlir
index 89a8baa453e90..3a2c7c9965287 100644
--- a/mlir/test/Dialect/Linalg/transform-op-specialize-elemwise-unary.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-specialize-elemwise-unary.mlir
@@ -6,15 +6,112 @@ func.func @specialize_exp(%arg0: tensor<?x?x?xf32>, %arg1: tensor<?x?x?xf32>) ->
{indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
ins(%arg0 : tensor<?x?x?xf32>) outs(%arg1 : tensor<?x?x?xf32>) {
^bb0(%in: f32, %out: f32):
- %1 = math.exp %in : f32
- linalg.yield %1 : f32
+ %v = math.exp %in : f32
+ linalg.yield %v : f32
} -> tensor<?x?x?xf32>
- return %0 : tensor<?x?x?xf32>
+ %1 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%0 : tensor<?x?x?xf32>) outs(%arg1 : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.log %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %2 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%1 : tensor<?x?x?xf32>) outs(%arg1 : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.absf %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %3 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%2 : tensor<?x?x?xf32>) outs(%arg1 : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.ceil %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %4 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%3 : tensor<?x?x?xf32>) outs(%arg1 : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.floor %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %5 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%4 : tensor<?x?x?xf32>) outs(%arg1 : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = arith.negf %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %cst_1 = arith.constant 1.0 : f32
+ %6 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%5 : tensor<?x?x?xf32>) outs(%arg1 : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = arith.divf %cst_1, %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %7 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%6 : tensor<?x?x?xf32>) outs(%arg1 : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.round %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %8 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%7 : tensor<?x?x?xf32>) outs(%arg1 : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.sqrt %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %9 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%8 : tensor<?x?x?xf32>) outs(%arg1 : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.rsqrt %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %10 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%9 : tensor<?x?x?xf32>) outs(%arg1 : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = arith.mulf %in, %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %11 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%10 : tensor<?x?x?xf32>) outs(%arg1 : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.tanh %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ %12 = linalg.generic
+ {indexing_maps = [#umap, #umap], iterator_types = ["parallel", "parallel","parallel"]}
+ ins(%11 : tensor<?x?x?xf32>) outs(%arg1 : tensor<?x?x?xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %v = math.erf %in : f32
+ linalg.yield %v : f32
+ } -> tensor<?x?x?xf32>
+ return %12 : tensor<?x?x?xf32>
}
// CHECK-LABEL: specialize_exp
// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?x?xf32>, %[[ARG1:.+]]: tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
// CHECK-NOT: linalg.generic
-// CHECK: linalg.exp ins(%[[ARG0]] : tensor<?x?x?xf32>) outs(%[[ARG1]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// CHECK: %[[RES0:.+]] = linalg.exp ins(%[[ARG0]] : tensor<?x?x?xf32>) outs(%[[ARG1]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// CHECK: %[[RES1:.+]] = linalg.log ins(%[[RES0]] : tensor<?x?x?xf32>) outs(%[[ARG1]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// CHECK: %[[RES2:.+]] = linalg.abs ins(%[[RES1]] : tensor<?x?x?xf32>) outs(%[[ARG1]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// CHECK: %[[RES3:.+]] = linalg.ceil ins(%[[RES2]] : tensor<?x?x?xf32>) outs(%[[ARG1]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// CHECK: %[[RES4:.+]] = linalg.floor ins(%[[RES3]] : tensor<?x?x?xf32>) outs(%[[ARG1]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// CHECK: %[[RES5:.+]] = linalg.negf ins(%[[RES4]] : tensor<?x?x?xf32>) outs(%[[ARG1]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// CHECK: %[[RES6:.+]] = linalg.reciprocal ins(%[[RES5]] : tensor<?x?x?xf32>) outs(%[[ARG1]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// CHECK: %[[RES7:.+]] = linalg.round ins(%[[RES6]] : tensor<?x?x?xf32>) outs(%[[ARG1]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// CHECK: %[[RES8:.+]] = linalg.sqrt ins(%[[RES7]] : tensor<?x?x?xf32>) outs(%[[ARG1]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// CHECK: %[[RES9:.+]] = linalg.rsqrt ins(%[[RES8]] : tensor<?x?x?xf32>) outs(%[[ARG1]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// CHECK: %[[RES10:.+]] = linalg.square ins(%[[RES9]] : tensor<?x?x?xf32>) outs(%[[ARG1]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// CHECK: %[[RES11:.+]] = linalg.tanh ins(%[[RES10]] : tensor<?x?x?xf32>) outs(%[[ARG1]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
+// CHECK: %[[RES12:.+]] = linalg.erf ins(%[[RES11]] : tensor<?x?x?xf32>) outs(%[[ARG1]] : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
module attributes {transform.with_named_sequence} {
transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index bc508d66fbd5f..cbe18b9b882da 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -3180,6 +3180,48 @@ func.func @iterator_bad_result_type(%lb : index, %ub : index, %st : index) {
// -----
+func.func @iterator_zero_step(%s2 : !llvm.struct<(ptr, i64)>) {
+ %lb = arith.constant 1 : index
+ %ub = arith.constant 4 : index
+ %st = arith.constant 0 : index
+
+ // expected-error@+1 {{loop step must not be zero}}
+ %0 = omp.iterator(%iv: index) = (%lb to %ub step %st) {
+ omp.yield(%s2 : !llvm.struct<(ptr, i64)>)
+ } -> !omp.iterated<!llvm.struct<(ptr, i64)>>
+ return
+}
+
+// -----
+
+func.func @iterator_positive_step_wrong_direction(%s2 : !llvm.struct<(ptr, i64)>) {
+ %lb = arith.constant 1000 : index
+ %ub = arith.constant -1 : index
+ %st = arith.constant 10 : index
+
+ // expected-error@+1 {{positive loop step requires lower bound to be less than or equal to upper bound}}
+ %0 = omp.iterator(%iv: index) = (%lb to %ub step %st) {
+ omp.yield(%s2 : !llvm.struct<(ptr, i64)>)
+ } -> !omp.iterated<!llvm.struct<(ptr, i64)>>
+ return
+}
+
+// -----
+
+func.func @iterator_negative_step_wrong_direction(%s2 : !llvm.struct<(ptr, i64)>) {
+ %lb = arith.constant -1000 : index
+ %ub = arith.constant 4 : index
+ %st = arith.constant -999 : index
+
+ // expected-error@+1 {{negative loop step requires lower bound to be greater than or equal to upper bound}}
+ %0 = omp.iterator(%iv: index) = (%lb to %ub step %st) {
+ omp.yield(%s2 : !llvm.struct<(ptr, i64)>)
+ } -> !omp.iterated<!llvm.struct<(ptr, i64)>>
+ return
+}
+
+// -----
+
func.func @iterator_missing_yield(%lb : index, %ub : index, %st : index) {
// expected-error at +1 {{region must be terminated by omp.yield}}
%0 = omp.iterator(%i: index) = (%lb to %ub step %st) {
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index b908874c2010b..ba329cc67bb14 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -3601,6 +3601,24 @@ func.func @omp_iterator_2d(%s2 : !llvm.struct<(ptr, i64)>) -> () {
return
}
+// CHECK-LABEL: func.func @omp_iterator_negative_step
+func.func @omp_iterator_negative_step(%s2 : !llvm.struct<(ptr, i64)>) -> () {
+ // CHECK: %[[LB:.*]] = arith.constant 4 : index
+ // CHECK: %[[UB:.*]] = arith.constant 1 : index
+ // CHECK: %[[ST:.*]] = arith.constant -1 : index
+ // CHECK: %[[IT:.*]] = omp.iterator(%[[IV:.*]]: index) = (%[[LB]] to %[[UB]] step %[[ST]]) {
+ // CHECK: omp.yield(%{{.*}} : !llvm.struct<(ptr, i64)>)
+ // CHECK: } -> !omp.iterated<!llvm.struct<(ptr, i64)>>
+ %lb = arith.constant 4 : index
+ %ub = arith.constant 1 : index
+ %st = arith.constant -1 : index
+
+ %0 = omp.iterator(%iv: index) = (%lb to %ub step %st) {
+ omp.yield(%s2 : !llvm.struct<(ptr, i64)>)
+ } -> !omp.iterated<!llvm.struct<(ptr, i64)>>
+ return
+}
+
// CHECK-LABEL: func.func @omp_task_affinity_iterator_1d
func.func @omp_task_affinity_iterator_1d(%lb : index, %ub : index, %step : index,
%addr : !llvm.ptr, %len : i64) -> () {
diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir
index 3957455ccc76e..d8e08c8b2a850 100644
--- a/mlir/test/Dialect/Vector/invalid.mlir
+++ b/mlir/test/Dialect/Vector/invalid.mlir
@@ -2046,6 +2046,15 @@ func.func @load_non_pow_of_2_alignment(%memref: memref<4xi32>, %c0: index) {
// -----
+func.func @load_non_unit_stride(%src : memref<?xi8, strided<[2], offset: ?>>) {
+ %c0 = arith.constant 0 : index
+ // expected-error @+1 {{'vector.load' op most minor memref dim must have unit stride}}
+ %0 = vector.load %src[%c0] : memref<?xi8, strided<[2], offset: ?>>, vector<16xi8>
+ return
+}
+
+// -----
+
//===----------------------------------------------------------------------===//
// vector.store
//===----------------------------------------------------------------------===//
@@ -2073,6 +2082,13 @@ func.func @store_non_pow_of_2_alignment(%memref: memref<4xi32>, %val: vector<4xi
return
}
+// -----
+func.func @store_non_unit_stride(%src : memref<?xi8, strided<[2], offset: ?>>, %val : vector<16xi8>, %c0: index) {
+ // expected-error @below {{'vector.store' op most minor memref dim must have unit stride}}
+ vector.store %val, %src[%c0] : memref<?xi8, strided<[2], offset: ?>>, vector<16xi8>
+ return
+}
+
// -----
// Verify that vector.bitcast rejects vectors with i0 (zero-bitwidth) element type.
diff --git a/mlir/test/Target/LLVMIR/llvmir-named-metadata.mlir b/mlir/test/Target/LLVMIR/llvmir-named-metadata.mlir
new file mode 100644
index 0000000000000..493616430c822
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/llvmir-named-metadata.mlir
@@ -0,0 +1,45 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// Tests LLVM named metadata translation with deeply nested metadata trees.
+
+// CHECK: !foo.version = !{![[VERSION:[0-9]+]]}
+// CHECK: !foo.language_version = !{![[LANG:[0-9]+]]}
+// CHECK: !foo.kernel = !{![[KERNEL:[0-9]+]]}
+
+llvm.func @my_kernel() {
+ llvm.return
+}
+
+llvm.named_metadata "foo.version" [
+ #llvm.md_node<#llvm.md_const<1 : i32>,
+ #llvm.md_const<0 : i32>,
+ #llvm.md_const<0 : i32>>
+]
+// CHECK-DAG: ![[VERSION]] = !{i32 1, i32 0, i32 0}
+
+llvm.named_metadata "foo.language_version" [
+ #llvm.md_node<#llvm.md_string<"Bar">,
+ #llvm.md_const<1 : i32>,
+ #llvm.md_const<2 : i32>,
+ #llvm.md_const<3 : i32>>
+]
+// CHECK-DAG: ![[LANG]] = !{!"Bar", i32 1, i32 2, i32 3}
+
+#buf0 = #llvm.md_node<
+ #llvm.md_const<0 : i32>, #llvm.md_string<"foo.buffer">,
+ #llvm.md_string<"foo.idx">, #llvm.md_const<0 : i32>,
+ #llvm.md_const<1 : i32>, #llvm.md_string<"foo.read">,
+ #llvm.md_string<"foo.address_space">, #llvm.md_const<1 : i32>,
+ #llvm.md_string<"foo.size">, #llvm.md_const<4 : i32>,
+ #llvm.md_string<"foo.align_size">, #llvm.md_const<4 : i32>>
+// CHECK-DAG: ![[A0:[0-9]+]] = !{i32 0, !"foo.buffer", !"foo.idx", i32 0, i32 1, !"foo.read", !"foo.address_space", i32 1, !"foo.size", i32 4, !"foo.align_size", i32 4}
+
+llvm.named_metadata "foo.kernel" [
+ #llvm.md_node<
+ #llvm.md_func<@my_kernel>,
+ #llvm.md_node<>,
+ #llvm.md_node<#buf0>>
+]
+// CHECK-DAG: ![[KERNEL]] = !{ptr @my_kernel, ![[EMPTY:[0-9]+]], ![[ARGS:[0-9]+]]}
+// CHECK-DAG: ![[EMPTY]] = !{}
+// CHECK-DAG: ![[ARGS]] = !{![[A0]]}
diff --git a/mlir/test/Target/LLVMIR/openmp-iterator.mlir b/mlir/test/Target/LLVMIR/openmp-iterator.mlir
new file mode 100644
index 0000000000000..faadfbdc7202f
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-iterator.mlir
@@ -0,0 +1,295 @@
+// RUN: mlir-translate --mlir-to-llvmir %s | FileCheck %s
+
+llvm.func @task_affinity_iterator_1d(%arr: !llvm.ptr {llvm.nocapture}) {
+ %c1 = llvm.mlir.constant(1 : i64) : i64
+ %c4 = llvm.mlir.constant(4 : i64) : i64
+ %c6 = llvm.mlir.constant(6 : i64) : i64
+ %len = llvm.mlir.constant(4 : i64) : i64
+
+ omp.parallel {
+ omp.single {
+ %it = omp.iterator(%i: i64, %j: i64) =
+ (%c1 to %c4 step %c1, %c1 to %c6 step %c1) {
+ %entry = omp.affinity_entry %arr, %len
+ : (!llvm.ptr, i64) -> !omp.affinity_entry_ty<!llvm.ptr, i64>
+ omp.yield(%entry : !omp.affinity_entry_ty<!llvm.ptr, i64>)
+ } -> !omp.iterated<!omp.affinity_entry_ty<!llvm.ptr, i64>>
+
+ omp.task affinity(%it : !omp.iterated<!omp.affinity_entry_ty<!llvm.ptr, i64>>) {
+ omp.terminator
+ }
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: define internal void @task_affinity_iterator_1d
+
+// Preheader -> Header
+// CHECK: omp_iterator.preheader:
+// CHECK: br label %omp_iterator.header
+//
+// Header has the IV phi and branches to cond
+// CHECK: omp_iterator.header:
+// CHECK: [[IV:%.*]] = phi i64 [ 0, %omp_iterator.preheader ], [ [[NEXT:%.*]], %omp_iterator.inc ]
+// CHECK: br label %omp_iterator.cond
+//
+// Cond: IV < 24 and branches to body or exit
+// CHECK: omp_iterator.cond:
+// CHECK: [[CMP:%.*]] = icmp ult i64 [[IV]], 24
+// CHECK: br i1 [[CMP]], label %omp_iterator.body, label %omp_iterator.exit
+//
+// Exit -> After -> continuation
+// CHECK: omp_iterator.exit:
+// CHECK: br label %omp_iterator.after
+// CHECK: omp_iterator.after:
+// CHECK: br label %omp.it.cont
+//
+// Body: store into affinity_list[IV] then branch to inc
+// CHECK: omp_iterator.body:
+// CHECK: [[ENTRY:%.*]] = getelementptr inbounds { i64, i64, i32 }, ptr %{{.*affinity_list.*}}, i64 [[IV]]
+// CHECK: [[ADDRI64:%.*]] = ptrtoint ptr %loadgep_ to i64
+// CHECK: [[ADDRGEP:%.*]] = getelementptr inbounds nuw { i64, i64, i32 }, ptr [[ENTRY]], i32 0, i32 0
+// CHECK: store i64 [[ADDRI64]], ptr [[ADDRGEP]]
+// CHECK: [[LENGEP:%.*]] = getelementptr inbounds nuw { i64, i64, i32 }, ptr [[ENTRY]], i32 0, i32 1
+// CHECK: store i64 4, ptr [[LENGEP]]
+// CHECK: [[FLAGGEP:%.*]] = getelementptr inbounds nuw { i64, i64, i32 }, ptr [[ENTRY]], i32 0, i32 2
+// CHECK: store i32 0, ptr [[FLAGGEP]]
+// CHECK: br label %omp_iterator.inc
+//
+// CHECK: omp_iterator.inc:
+// CHECK: [[NEXT]] = add nuw i64 [[IV]], 1
+// CHECK: br label %omp_iterator.header
+
+llvm.func @task_affinity_iterator_3d(%arr: !llvm.ptr {llvm.nocapture}) {
+ %c1 = llvm.mlir.constant(1 : i64) : i64
+ %c2 = llvm.mlir.constant(2 : i64) : i64
+ %c4 = llvm.mlir.constant(4 : i64) : i64
+ %c6 = llvm.mlir.constant(6 : i64) : i64
+ %len = llvm.mlir.constant(4 : i64) : i64
+
+ omp.parallel {
+ omp.single {
+ // 3-D iterator: i=1..4, j=1..6, k=1..2 => total trips = 48
+ %it = omp.iterator(%i: i64, %j: i64, %k: i64) =
+ (%c1 to %c4 step %c1, %c1 to %c6 step %c1, %c1 to %c2 step %c1) {
+ %entry = omp.affinity_entry %arr, %len
+ : (!llvm.ptr, i64) -> !omp.affinity_entry_ty<!llvm.ptr, i64>
+ omp.yield(%entry : !omp.affinity_entry_ty<!llvm.ptr, i64>)
+ } -> !omp.iterated<!omp.affinity_entry_ty<!llvm.ptr, i64>>
+
+ omp.task affinity(%it : !omp.iterated<!omp.affinity_entry_ty<!llvm.ptr, i64>>) {
+ omp.terminator
+ }
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: define internal void @task_affinity_iterator_3d
+
+// Preheader -> Header
+// CHECK: omp_iterator.preheader:
+// CHECK: br label %omp_iterator.header
+//
+// Header has the IV phi and branches to cond
+// CHECK: omp_iterator.header:
+// CHECK: [[IV:%.*]] = phi i64 [ 0, %omp_iterator.preheader ], [ [[NEXT:%.*]], %omp_iterator.inc ]
+// CHECK: br label %omp_iterator.cond
+//
+// Cond: IV < 48 and branches to body or exit
+// CHECK: omp_iterator.cond:
+// CHECK: [[CMP:%.*]] = icmp ult i64 [[IV]], 48
+// CHECK: br i1 [[CMP]], label %omp_iterator.body, label %omp_iterator.exit
+//
+// Exit -> After -> continuation
+// CHECK: omp_iterator.exit:
+// CHECK: br label %omp_iterator.after
+// CHECK: omp_iterator.after:
+// CHECK: br label %omp.it.cont
+//
+// Body: store into affinity_list[IV] then branch to inc
+// CHECK: omp_iterator.body:
+// CHECK: [[ENTRY:%.*]] = getelementptr inbounds { i64, i64, i32 }, ptr %{{.*affinity_list.*}}, i64 [[IV]]
+// CHECK: [[ADDRI64:%.*]] = ptrtoint ptr %loadgep_ to i64
+// CHECK: [[ADDRGEP:%.*]] = getelementptr inbounds nuw { i64, i64, i32 }, ptr [[ENTRY]], i32 0, i32 0
+// CHECK: store i64 [[ADDRI64]], ptr [[ADDRGEP]]
+// CHECK: [[LENGEP:%.*]] = getelementptr inbounds nuw { i64, i64, i32 }, ptr [[ENTRY]], i32 0, i32 1
+// CHECK: store i64 4, ptr [[LENGEP]]
+// CHECK: [[FLAGGEP:%.*]] = getelementptr inbounds nuw { i64, i64, i32 }, ptr [[ENTRY]], i32 0, i32 2
+// CHECK: store i32 0, ptr [[FLAGGEP]]
+// CHECK: br label %omp_iterator.inc
+//
+// CHECK: omp_iterator.inc:
+// CHECK: [[NEXT]] = add nuw i64 [[IV]], 1
+// CHECK: br label %omp_iterator.header
+
+llvm.func @task_affinity_iterator_multiple(%arr: !llvm.ptr {llvm.nocapture}) {
+ %c1 = llvm.mlir.constant(1 : i64) : i64
+ %c3 = llvm.mlir.constant(3 : i64) : i64
+ %c4 = llvm.mlir.constant(4 : i64) : i64
+ %c6 = llvm.mlir.constant(6 : i64) : i64
+ %len = llvm.mlir.constant(4 : i64) : i64
+
+ omp.parallel {
+ omp.single {
+ // First iterator: 2-D (4 * 6 = 24)
+ %it0 = omp.iterator(%i: i64, %j: i64) =
+ (%c1 to %c4 step %c1, %c1 to %c6 step %c1) {
+ %entry0 = omp.affinity_entry %arr, %len
+ : (!llvm.ptr, i64) -> !omp.affinity_entry_ty<!llvm.ptr, i64>
+ omp.yield(%entry0 : !omp.affinity_entry_ty<!llvm.ptr, i64>)
+ } -> !omp.iterated<!omp.affinity_entry_ty<!llvm.ptr, i64>>
+
+      // Second iterator: 1-D (3)
+ %it1 = omp.iterator(%k: i64) = (%c1 to %c3 step %c1) {
+ %entry1 = omp.affinity_entry %arr, %len
+ : (!llvm.ptr, i64) -> !omp.affinity_entry_ty<!llvm.ptr, i64>
+ omp.yield(%entry1 : !omp.affinity_entry_ty<!llvm.ptr, i64>)
+ } -> !omp.iterated<!omp.affinity_entry_ty<!llvm.ptr, i64>>
+
+ // Multiple iterators in a single affinity clause.
+ omp.task affinity(%it0: !omp.iterated<!omp.affinity_entry_ty<!llvm.ptr, i64>>,
+ %it1: !omp.iterated<!omp.affinity_entry_ty<!llvm.ptr, i64>>) {
+ omp.terminator
+ }
+
+ omp.terminator
+ }
+ omp.terminator
+ }
+
+ llvm.return
+}
+
+// CHECK-LABEL: define internal void @task_affinity_iterator_multiple
+// CHECK-DAG: [[AFFLIST0:%.*]] = alloca { i64, i64, i32 }, i64 24, align 8
+// CHECK-DAG: [[AFFLIST1:%.*]] = alloca { i64, i64, i32 }, i64 3, align 8
+// CHECK-DAG: [[AFFINITY_LIST:%.*]] = alloca { i64, i64, i32 }, i32 27, align 8
+
+// First iterator header
+// CHECK: omp_iterator.preheader:
+// CHECK: br label %[[HEADER0:.+]]
+// CHECK: [[HEADER0]]:
+// CHECK: [[IV0:%.*]] = phi i64 [ 0, %omp_iterator.preheader ], [ [[NEXT0:%.*]], %[[INC0:.+]] ]
+// CHECK: br label %[[COND0:.+]]
+// CHECK: [[COND0]]:
+// CHECK: [[CMP0:%.*]] = icmp ult i64 [[IV0]], 24
+// CHECK: br i1 [[CMP0]], label %[[BODY0:.+]], label %omp_iterator.exit
+
+// Second iterator header
+// CHECK: omp_iterator.preheader{{.*}}:
+// CHECK: [[HEADER1:.+]]:
+// CHECK: [[IV1:%.*]] = phi i64 [ 0, %omp_iterator.preheader{{.*}} ], [ [[NEXT1:%.*]], %[[INC1:.+]] ]
+// CHECK: br label %omp_iterator.cond{{.*}}
+// CHECK: omp_iterator.cond{{.*}}:
+// CHECK: [[CMP1:%.*]] = icmp ult i64 [[IV1]], 3
+// CHECK: br i1 [[CMP1]], label %[[BODY1:.+]], label %omp_iterator.exit{{.*}}
+
+// CHECK: [[AFFINITY_LIST_1:%.*]] = getelementptr inbounds { i64, i64, i32 }, ptr [[AFFINITY_LIST]], i64 0
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[AFFINITY_LIST_1]], ptr align 1 [[AFFLIST0]], i64 480, i1 false)
+// CHECK: [[AFFINITY_LIST_2:%.*]] = getelementptr inbounds { i64, i64, i32 }, ptr [[AFFINITY_LIST]], i64 24
+// CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[AFFINITY_LIST_2]], ptr align 1 [[AFFLIST1]], i64 60, i1 false)
+// CHECK: codeRepl:
+// CHECK: call ptr @__kmpc_omp_task_alloc
+// CHECK: call i32 @__kmpc_omp_reg_task_with_affinity{{.*}}i32 27{{.*}}ptr [[AFFINITY_LIST]]
+// CHECK: call i32 @__kmpc_omp_task
+
+// Second iterator body
+// CHECK: [[BODY1]]:
+// CHECK: [[ENTRY1:%.*]] = getelementptr inbounds { i64, i64, i32 }, ptr [[AFFLIST1]]
+// CHECK: [[ADDR1:%.*]] = ptrtoint ptr %loadgep_ to i64
+// CHECK: [[ADDRGEP1:%.*]] = getelementptr inbounds{{.*}} { i64, i64, i32 }, ptr [[ENTRY1]], i32 0, i32 0
+// CHECK: store i64 [[ADDR1]], ptr [[ADDRGEP1]]
+// CHECK: [[LENGEP1:%.*]] = getelementptr inbounds{{.*}} { i64, i64, i32 }, ptr [[ENTRY1]], i32 0, i32 1
+// CHECK: store i64 4, ptr [[LENGEP1]]
+// CHECK: [[FLAGGEP1:%.*]] = getelementptr inbounds{{.*}} { i64, i64, i32 }, ptr [[ENTRY1]], i32 0, i32 2
+// CHECK: store i32 0, ptr [[FLAGGEP1]]
+// CHECK: br label %[[INC1]]
+// CHECK: [[INC1]]:
+// CHECK: [[NEXT1]] = add nuw i64 [[IV1]], 1
+// CHECK: br label %[[HEADER1]]
+
+// First iterator body
+// CHECK: [[BODY0]]:
+// CHECK: [[ENTRY0:%.*]] = getelementptr inbounds { i64, i64, i32 }, ptr [[AFFLIST0]], i64 [[IV0]]
+// CHECK: [[ADDR0:%.*]] = ptrtoint ptr %loadgep_ to i64
+// CHECK: [[ADDRGEP0:%.*]] = getelementptr inbounds{{.*}} { i64, i64, i32 }, ptr [[ENTRY0]], i32 0, i32 0
+// CHECK: store i64 [[ADDR0]], ptr [[ADDRGEP0]]
+// CHECK: [[LENGEP0:%.*]] = getelementptr inbounds{{.*}} { i64, i64, i32 }, ptr [[ENTRY0]], i32 0, i32 1
+// CHECK: store i64 4, ptr [[LENGEP0]]
+// CHECK: [[FLAGGEP0:%.*]] = getelementptr inbounds{{.*}} { i64, i64, i32 }, ptr [[ENTRY0]], i32 0, i32 2
+// CHECK: store i32 0, ptr [[FLAGGEP0]]
+// CHECK: br label %[[INC0]]
+// CHECK: [[INC0]]:
+// CHECK: [[NEXT0]] = add nuw i64 [[IV0]], 1
+// CHECK: br label %[[HEADER0]]
+
+// Ensure the affinity list is only allocated after the dynamic trip count is computed.
+llvm.func @task_affinity_iterator_dynamic_tripcount(
+ %arr: !llvm.ptr {llvm.nocapture}, %lb: i64, %ub: i64, %step: i64,
+ %len: i64) {
+ omp.parallel {
+ omp.single {
+ %it = omp.iterator(%i: i64) = (%lb to %ub step %step) {
+ %entry = omp.affinity_entry %arr, %len
+ : (!llvm.ptr, i64) -> !omp.affinity_entry_ty<!llvm.ptr, i64>
+ omp.yield(%entry : !omp.affinity_entry_ty<!llvm.ptr, i64>)
+ } -> !omp.iterated<!omp.affinity_entry_ty<!llvm.ptr, i64>>
+
+ omp.task affinity(%it : !omp.iterated<!omp.affinity_entry_ty<!llvm.ptr, i64>>) {
+ omp.terminator
+ }
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: define internal void @task_affinity_iterator_dynamic_tripcount
+// CHECK: [[DIFF:%.*]] = sub i64 {{.*}}, {{.*}}
+// CHECK: [[DIV:%.*]] = sdiv i64 [[DIFF]], {{.*}}
+// CHECK: [[TRIPS:%.*]] = add i64 [[DIV]], 1
+// CHECK: [[SCALED:%.*]] = mul i64 1, [[TRIPS]]
+// CHECK: [[AFFLIST:%.*]] = alloca { i64, i64, i32 }, i64 [[SCALED]]
+
+llvm.func @task_affinity_iterator_negative_step(%arr: !llvm.ptr {llvm.nocapture}) {
+ %c4 = llvm.mlir.constant(4 : i64) : i64
+ %c1 = llvm.mlir.constant(1 : i64) : i64
+ %cn1 = llvm.mlir.constant(-1 : i64) : i64
+
+ omp.parallel {
+ omp.single {
+ %it = omp.iterator(%i: i64) = (%c4 to %c1 step %cn1) {
+ %entry = omp.affinity_entry %arr, %i
+ : (!llvm.ptr, i64) -> !omp.affinity_entry_ty<!llvm.ptr, i64>
+ omp.yield(%entry : !omp.affinity_entry_ty<!llvm.ptr, i64>)
+ } -> !omp.iterated<!omp.affinity_entry_ty<!llvm.ptr, i64>>
+
+ omp.task affinity(%it : !omp.iterated<!omp.affinity_entry_ty<!llvm.ptr, i64>>) {
+ omp.terminator
+ }
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: define internal void @task_affinity_iterator_negative_step
+// CHECK: [[AFFLIST:%.*]] = alloca { i64, i64, i32 }, i64 4, align 8
+// CHECK: omp_iterator.cond:
+// CHECK: [[CMP:%.*]] = icmp ult i64 %omp_iterator.iv, 4
+// CHECK: br i1 [[CMP]], label %omp_iterator.body, label %omp_iterator.exit
+// CHECK: omp_iterator.body:
+// CHECK: [[IDX:%.*]] = urem i64 %omp_iterator.iv, 4
+// CHECK: [[STEPMUL:%.*]] = mul i64 [[IDX]], -1
+// CHECK: [[PHYSIV:%.*]] = add i64 4, [[STEPMUL]]
+// CHECK: [[ENTRY:%.*]] = getelementptr inbounds { i64, i64, i32 }, ptr [[AFFLIST]], i64 %omp_iterator.iv
+// CHECK: [[LENPTR:%.*]] = getelementptr inbounds nuw { i64, i64, i32 }, ptr [[ENTRY]], i32 0, i32 1
+// CHECK: store i64 [[PHYSIV]], ptr [[LENPTR]]
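
The dynamic-tripcount and negative-step tests above pin down the index arithmetic expected from the iterator lowering: a per-range trip count of (ub - lb) / step + 1, and a mapping from the logical induction variable back to the user-visible iterator value. A minimal C++ sketch of that arithmetic, with invented helper names (illustrative only, not code from this patch):

  // Sketch of the index math the CHECK lines above expect (illustrative only).
  #include <cassert>
  #include <cstdint>

  // Trip count of an inclusive range lb..ub with the given step; mirrors the
  // sub/sdiv/add sequence checked in the dynamic-tripcount test.
  int64_t tripCount(int64_t lb, int64_t ub, int64_t step) {
    return (ub - lb) / step + 1;
  }

  // Map the logical IV (0..trips-1) back to the user-visible iterator value;
  // mirrors the urem/mul/add sequence checked in the negative-step test.
  int64_t logicalToUserIV(int64_t lb, int64_t step, int64_t trips, int64_t iv) {
    return lb + (iv % trips) * step;
  }

  int main() {
    assert(tripCount(1, 4, 1) * tripCount(1, 6, 1) == 24); // 2-D test above
    assert(tripCount(4, 1, -1) == 4);                      // negative step
    assert(logicalToUserIV(4, -1, 4, 3) == 1);             // iv 3 -> value 1
    return 0;
  }
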
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
index fcb937dbc1867..c5cdecd091770 100644
--- a/mlir/test/Target/LLVMIR/openmp-llvm.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -3589,3 +3589,37 @@ llvm.func @nested_task_with_deps() {
// CHECK: ret void
// CHECK: }
+
+llvm.func @task_affinity_plain(%arr: !llvm.ptr {llvm.nocapture}) {
+ %len = llvm.mlir.constant(4 : i64) : i64
+
+ omp.parallel {
+ omp.single {
+ %ae = omp.affinity_entry %arr, %len
+ : (!llvm.ptr, i64) -> !omp.affinity_entry_ty<!llvm.ptr, i64>
+
+ omp.task affinity(%ae : !omp.affinity_entry_ty<!llvm.ptr, i64>) {
+ omp.terminator
+ }
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+}
+
+// CHECK-LABEL: define internal void @task_affinity_plain
+// CHECK: [[BASE:%.*]] = load ptr, ptr %gep_, align 8
+// CHECK: [[AFFLIST:%.*]] = alloca { i64, i64, i32 }, i64 1, align 8
+// CHECK: [[ENTRY:%.*]] = getelementptr inbounds { i64, i64, i32 }, ptr [[AFFLIST]], i64 0
+// addr
+// CHECK: [[ADDRI64:%.*]] = ptrtoint ptr [[BASE]] to i64
+// CHECK: [[ADDRGEP:%.*]] = getelementptr inbounds nuw { i64, i64, i32 }, ptr [[ENTRY]], i32 0, i32 0
+// CHECK: store i64 [[ADDRI64]], ptr [[ADDRGEP]]
+// len
+// CHECK: [[LENGEP:%.*]] = getelementptr inbounds nuw { i64, i64, i32 }, ptr [[ENTRY]], i32 0, i32 1
+// CHECK: store i64 4, ptr [[LENGEP]]
+// flags is always 0
+// CHECK: [[FLAGGEP:%.*]] = getelementptr inbounds nuw { i64, i64, i32 }, ptr [[ENTRY]], i32 0, i32 2
+// CHECK: store i32 0, ptr [[FLAGGEP]]
+// CHECK: call i32 @__kmpc_omp_reg_task_with_affinity{{.*}}i32 1, ptr [[AFFLIST]]
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
index 2500d546fcf4d..8fb66cb4dd0eb 100644
--- a/mlir/test/Target/LLVMIR/openmp-todo.mlir
+++ b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -462,15 +462,3 @@ llvm.func @wsloop_order(%lb : i32, %ub : i32, %step : i32) {
}
llvm.return
}
-
-// -----
-llvm.func @task_affinity(%ptr : !llvm.ptr, %len : i64) {
- // expected-error@below {{not yet implemented: omp.affinity_entry}}
- // expected-error@below {{LLVM Translation failed for operation: omp.affinity_entry}}
- %ae = omp.affinity_entry %ptr, %len
- : (!llvm.ptr, i64) -> !omp.affinity_entry_ty<!llvm.ptr, i64>
- omp.task affinity(%ae : !omp.affinity_entry_ty<!llvm.ptr, i64>) {
- omp.terminator
- }
- llvm.return
-}
diff --git a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
index bd825c858a615..ed5dc5bead78a 100644
--- a/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
+++ b/mlir/test/lib/Dialect/Test/TestOpDefs.cpp
@@ -1331,6 +1331,15 @@ TestCrashingReturnOp::getMutableSuccessorOperands(RegionSuccessor successor) {
return getArgsMutable();
}
+//===----------------------------------------------------------------------===//
+// TestReturnWithIgnoredValueOp
+//===----------------------------------------------------------------------===//
+
+MutableOperandRange TestReturnWithIgnoredValueOp::getMutableSuccessorOperands(
+ RegionSuccessor /*successor*/) {
+ return getValuesMutable();
+}
+
//===----------------------------------------------------------------------===//
// SwitchWithNoBreakOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 02bac016eeed1..4c9e6b3fe9e45 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -2244,7 +2244,8 @@ def TestReturnOp : TEST_Op<"return", [Pure, ReturnLike, Terminator]> {
// emits an error, but getMutableSuccessorOperands() calls report_fatal_error
// to expose the fact that FuncOp::verify() runs before the region is checked.
def TestCrashingReturnOp : TEST_Op<"crashing_return", [
- DeclareOpInterfaceMethods<RegionBranchTerminatorOpInterface>,
+ DeclareOpInterfaceMethods<RegionBranchTerminatorOpInterface,
+ ["getMutableSuccessorOperands"]>,
Terminator]> {
let arguments = (ins Variadic<AnyType>:$args, UnitAttr:$valid);
let assemblyFormat = "($args^ `:` type($args))? attr-dict";
@@ -2300,6 +2301,19 @@ def TestSignatureConversionNoConverterOp
let regions = (region AnyRegion);
}
+// A return-like operation that overrides the successor operand getter
+// interface method.
+def TestReturnWithIgnoredValueOp : TEST_Op<"return_with_ignored_value", [
+ Pure,
+ ReturnLike,
+ DeclareOpInterfaceMethods<RegionBranchTerminatorOpInterface, [
+ "getMutableSuccessorOperands"
+ ]>,
+ Terminator
+]> {
+ let arguments = (ins Variadic<AnyType>:$values, AnyType:$unwanted_value);
+}
+
//===----------------------------------------------------------------------===//
// Test parser.
//===----------------------------------------------------------------------===//
@@ -2707,8 +2721,9 @@ def LoopBlockOp : TEST_Op<"loop_block",
}
def LoopBlockTerminatorOp : TEST_Op<"loop_block_term",
- [DeclareOpInterfaceMethods<RegionBranchTerminatorOpInterface>, Pure,
- Terminator]> {
+ [DeclareOpInterfaceMethods<RegionBranchTerminatorOpInterface,
+ ["getMutableSuccessorOperands"]>,
+ Pure, Terminator]> {
let arguments = (ins I32:$nextIterArg, F32:$exitArg);
let assemblyFormat = [{
diff --git a/mlir/test/mlir-opt/expected-unknown-loc-unmatched.mlir b/mlir/test/mlir-opt/expected-unknown-loc-unmatched.mlir
new file mode 100644
index 0000000000000..5a06d386dabf8
--- /dev/null
+++ b/mlir/test/mlir-opt/expected-unknown-loc-unmatched.mlir
@@ -0,0 +1,9 @@
+// Test that an unmatched `expected-*` directive using the `@unknown` location
+// specifier emits a diagnostic instead of crashing.
+// See https://github.com/llvm/llvm-project/issues/163343
+
+// RUN: not mlir-opt --verify-diagnostics %s 2>&1 | FileCheck %s
+
+// CHECK: expected warning "some warning that is never produced" was not produced
+
+// expected-warning @unknown {{some warning that is never produced}}
diff --git a/mlir/test/python/dialects/llvm.py b/mlir/test/python/dialects/llvm.py
index 7a2b8e1809c47..1ed77cf3b84b3 100644
--- a/mlir/test/python/dialects/llvm.py
+++ b/mlir/test/python/dialects/llvm.py
@@ -215,3 +215,140 @@ def testTranslateToLLVMIR():
# CHECK: ret i64 %3
# CHECK: }
print(llvm.translate_module_to_llvmir(module.operation))
+
+
+# CHECK-LABEL: testMetadataAttrs
+@constructAndPrintInModule
+def testMetadataAttrs():
+ # MDStringAttr
+ md_str = llvm.MDStringAttr.get("foo.buffer")
+ # CHECK: #llvm.md_string<"foo.buffer">
+ print(md_str)
+ assert md_str.value == "foo.buffer"
+
+ # MDConstantAttr
+ i32 = IntegerType.get_signless(32)
+ md_const = llvm.MDConstantAttr.get(IntegerAttr.get(i32, 42))
+ # CHECK: #llvm.md_const<42 : i32>
+ print(md_const)
+
+ # MDFuncAttr
+ md_func = llvm.MDFuncAttr.get("my_kernel")
+ # CHECK: #llvm.md_func<@my_kernel>
+ print(md_func)
+ assert md_func.name == "my_kernel"
+
+ # MDNodeAttr - empty
+ md_empty = llvm.MDNodeAttr.get([])
+ # CHECK: #llvm.md_node<>
+ print(md_empty)
+ assert len(md_empty) == 0
+
+ # MDNodeAttr - with operands
+ md_node = llvm.MDNodeAttr.get([md_const, md_str])
+ # CHECK: #llvm.md_node<#llvm.md_const<42 : i32>, #llvm.md_string<"foo.buffer">>
+ print(md_node)
+ assert len(md_node) == 2
+
+ # MDNodeAttr - __getitem__
+ # CHECK: #llvm.md_const<42 : i32>
+ print(md_node[0])
+ # CHECK: #llvm.md_string<"foo.buffer">
+ print(md_node[1])
+ assert str(md_node[0]) == str(md_const)
+ assert str(md_node[1]) == str(md_str)
+
+ # MDNodeAttr - nested
+ md_nested = llvm.MDNodeAttr.get([md_node, md_empty])
+ # CHECK: #llvm.md_node<#llvm.md_node<#llvm.md_const<42 : i32>, #llvm.md_string<"foo.buffer">>, #llvm.md_node<>>
+ print(md_nested)
+ assert len(md_nested) == 2
+
+
+# CHECK-LABEL: testNamedMetadata
+@constructAndPrintInModule
+def testNamedMetadata():
+ void = Type.parse("!llvm.void")
+ func_ty = llvm.FunctionType.get(void, [])
+
+ llvm.LLVMFuncOp("my_kernel", TypeAttr.get(func_ty))
+ # CHECK-LABEL: llvm.func @my_kernel()
+
+ llvm.NamedMetadataOp(
+ metadata_name="foo.version",
+ nodes=ArrayAttr.get(
+ [
+ llvm.MDNodeAttr.get(
+ [llvm.md_const(1), llvm.md_const(0), llvm.md_const(0)]
+ )
+ ]
+ ),
+ )
+ # CHECK: llvm.named_metadata "foo.version" [#llvm.md_node<#llvm.md_const<1 : i32>, #llvm.md_const<0 : i32>, #llvm.md_const<0 : i32>>]
+
+ llvm.NamedMetadataOp(
+ metadata_name="foo.language_version",
+ nodes=ArrayAttr.get(
+ [
+ llvm.MDNodeAttr.get(
+ [
+ llvm.md_str("Bar"),
+ llvm.md_const(1),
+ llvm.md_const(2),
+ llvm.md_const(3),
+ ]
+ )
+ ]
+ ),
+ )
+ # CHECK: llvm.named_metadata "foo.language_version" [#llvm.md_node<#llvm.md_string<"Bar">, #llvm.md_const<1 : i32>, #llvm.md_const<2 : i32>, #llvm.md_const<3 : i32>>]
+
+ buf0 = llvm.MDNodeAttr.get(
+ [
+ llvm.md_const(0),
+ llvm.md_str("foo.buffer"),
+ llvm.md_str("foo.idx"),
+ llvm.md_const(0),
+ llvm.md_const(1),
+ llvm.md_str("foo.read"),
+ llvm.md_str("foo.address_space"),
+ llvm.md_const(1),
+ llvm.md_str("foo.size"),
+ llvm.md_const(4),
+ llvm.md_str("foo.align_size"),
+ llvm.md_const(4),
+ ]
+ )
+
+ llvm.NamedMetadataOp(
+ metadata_name="foo.kernel",
+ nodes=ArrayAttr.get(
+ [
+ llvm.MDNodeAttr.get(
+ [
+ llvm.MDFuncAttr.get("my_kernel"),
+ llvm.MDNodeAttr.get([]),
+ buf0,
+ ]
+ )
+ ]
+ ),
+ )
+ # CHECK: llvm.named_metadata "foo.kernel" [
+ # CHECK-SAME: #llvm.md_node<
+ # CHECK-SAME: #llvm.md_func<@my_kernel>,
+ # CHECK-SAME: #llvm.md_node<>,
+ # CHECK-SAME: #llvm.md_node<
+ # CHECK-SAME: #llvm.md_const<0 : i32>,
+ # CHECK-SAME: #llvm.md_string<"foo.buffer">,
+ # CHECK-SAME: #llvm.md_string<"foo.idx">,
+ # CHECK-SAME: #llvm.md_const<0 : i32>,
+ # CHECK-SAME: #llvm.md_const<1 : i32>,
+ # CHECK-SAME: #llvm.md_string<"foo.read">,
+ # CHECK-SAME: #llvm.md_string<"foo.address_space">,
+ # CHECK-SAME: #llvm.md_const<1 : i32>,
+ # CHECK-SAME: #llvm.md_string<"foo.size">,
+ # CHECK-SAME: #llvm.md_const<4 : i32>,
+ # CHECK-SAME: #llvm.md_string<"foo.align_size">,
+ # CHECK-SAME: #llvm.md_const<4 : i32>>
+ # CHECK-SAME: >]
diff --git a/mlir/test/python/dialects/transform_op_interface.py b/mlir/test/python/dialects/transform_op_interface.py
index f58e0be13befd..a6e2c6da45322 100644
--- a/mlir/test/python/dialects/transform_op_interface.py
+++ b/mlir/test/python/dialects/transform_op_interface.py
@@ -16,7 +16,6 @@
)
-@ext.register_dialect
class MyTransform(ext.Dialect, name="my_transform"):
pass
@@ -26,7 +25,7 @@ def run(emit_schedule):
with ir.Context() as ctx, ir.Location.unknown():
payload = emit_payload()
- MyTransform.load(register=False, reload=True)
+ MyTransform.load(reload=True)
GetNamedAttributeOp.attach_interface_impls(ctx)
PrintParamOp.attach_interface_impls(ctx)
@@ -86,7 +85,6 @@ def get_effects(op: ir.Operation, effects):
# Demonstration of a TransformOpInterface-implementing op that gets named attributes
# from target ops and produces them as param handles.
-@ext.register_operation(MyTransform)
class GetNamedAttributeOp(MyTransform.Operation, name="get_named_attribute"):
target: ext.Operand[transform.AnyOpType]
attr_name: ir.StringAttr
@@ -120,7 +118,6 @@ def allow_repeated_handle_operands(_op: "GetNamedAttributeOp") -> bool:
return False
-@ext.register_operation(MyTransform)
class PrintParamOp(MyTransform.Operation, name="print_param"):
target: ext.Operand[transform.AnyParamType]
name: ir.StringAttr
@@ -150,7 +147,6 @@ def allow_repeated_handle_operands(_op: "GetNamedAttributeOp") -> bool:
# Syntax for an op with one op handle operand and one op handle result.
-@ext.register_operation(MyTransform)
class OneOpInOneOpOut(MyTransform.Operation, name="one_op_in_one_op_out"):
target: ext.Operand[transform.AnyOpType]
res: ext.Result[transform.AnyOpType[()]]
@@ -273,7 +269,6 @@ def get_effects(op: ir.Operation, effects):
return schedule
-@ext.register_operation(MyTransform)
class OpValParamInParamOpValOut(
MyTransform.Operation, name="op_val_param_in_param_op_val_out"
):
@@ -378,7 +373,6 @@ def allow_repeated_handle_operands(_op: OpValParamInParamOpValOut) -> bool:
return schedule
-@ext.register_operation(MyTransform)
class OpsParamsInValuesParamOut(
MyTransform.Operation, name="ops_params_in_values_param_out"
):
diff --git a/mlir/test/python/dialects/transform_pattern_descriptor_op_interface.py b/mlir/test/python/dialects/transform_pattern_descriptor_op_interface.py
index 470c679179b03..9cd73331cfdea 100644
--- a/mlir/test/python/dialects/transform_pattern_descriptor_op_interface.py
+++ b/mlir/test/python/dialects/transform_pattern_descriptor_op_interface.py
@@ -7,7 +7,6 @@
from mlir.dialects.transform import AnyOpType, structured
-@ext.register_dialect
class MyPatternDescriptors(ext.Dialect, name="my_pattern_descriptors"):
pass
@@ -17,7 +16,7 @@ def run(emit_schedule):
with ir.Context(), ir.Location.unknown():
payload = emit_payload()
- MyPatternDescriptors.load(register=False, reload=True)
+ MyPatternDescriptors.load(reload=True)
# NB: Pattern descriptor ops have their interfaces attached
# in their respective test functions.
@@ -58,7 +57,6 @@ def schedule_boilerplate():
yield schedule, named_sequence
-@ext.register_operation(MyPatternDescriptors)
class SubiAddiRewritePatternOp(MyPatternDescriptors.Operation, name="add_pattern"):
@classmethod
def attach_interface_impls(cls, ctx=None):
diff --git a/mlir/unittests/Analysis/Presburger/MatrixTest.cpp b/mlir/unittests/Analysis/Presburger/MatrixTest.cpp
index e2c2a9bcb7d26..d40760fa8d894 100644
--- a/mlir/unittests/Analysis/Presburger/MatrixTest.cpp
+++ b/mlir/unittests/Analysis/Presburger/MatrixTest.cpp
@@ -258,6 +258,57 @@ TEST(MatrixTest, computeHermiteNormalForm) {
}
}
+static void checkSmithNormalForm(const IntMatrix &mat) {
+ auto [u, d, v] = mat.computeSmithNormalForm();
+
+ // Check u and v are unimodular.
+ EXPECT_EQ(llvm::abs(u.determinant()), 1);
+ EXPECT_EQ(llvm::abs(v.determinant()), 1);
+
+ // Check u @ mat @ v = d (@ is matrix multiplication).
+ EXPECT_EQ(u.postMultiply(mat).postMultiply(v), d);
+
+ // Check d is diagonal, i.e. non-diagonal elements are zero.
+ for (unsigned i = 0, e = d.getNumRows(); i < e; i++) {
+ for (unsigned j = 0, f = d.getNumColumns(); j < f; j++) {
+ if (i != j)
+ EXPECT_EQ(d(i, j), 0);
+ }
+ }
+
+ // Check d(i, i) divides d(i + 1, i + 1).
+ unsigned end = std::min(d.getNumRows(), d.getNumColumns()) - 1;
+ unsigned i = 0;
+ for (; i < end; i++) {
+ if (d(i, i) == 0)
+ break;
+
+ EXPECT_EQ(d(i + 1, i + 1) % d(i, i), 0);
+ }
+ for (; i < end; i++)
+ EXPECT_EQ(d(i, i), 0);
+}
+
+TEST(MatrixTest, computeSmithNormalForm) {
+ {
+ IntMatrix mat =
+ makeIntMatrix(4, 3, {{2, 5, 8}, {3, 6, 9}, {4, 7, 1}, {5, 8, 2}});
+ checkSmithNormalForm(mat);
+ }
+
+ {
+ // Smith normal form of this matrix has trailing zeroes on the diagonal.
+ IntMatrix mat = makeIntMatrix(2, 3, {{6, 4, 2}, {3, 2, 0}});
+ checkSmithNormalForm(mat);
+ }
+
+ {
+ // 1x1 edge case.
+ IntMatrix mat = makeIntMatrix(1, 1, {{9}});
+ checkSmithNormalForm(mat);
+ }
+}
+
TEST(MatrixTest, inverse) {
IntMatrix mat1 = makeIntMatrix(2, 2, {{2, 1}, {7, 0}});
EXPECT_EQ(mat1.determinant(), -7);
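
For readers unfamiliar with the property under test, checkSmithNormalForm above verifies the defining conditions of the Smith normal form of an integer matrix $A$: unimodular $U$ and $V$ such that

  $U A V = D$, $\quad |\det U| = |\det V| = 1$, $\quad D_{ij} = 0$ for $i \neq j$, $\quad D_{ii} \mid D_{i+1,i+1}$,

where the divisibility chain may end in trailing zeros, which is why the test stops checking divisibility at the first zero diagonal entry and requires the remaining diagonal entries to be zero.
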
diff --git a/offload/cmake/caches/AMDGPUBot.cmake b/offload/cmake/caches/AMDGPUBot.cmake
index 5a27a81c736b1..11b2bbb16b210 100644
--- a/offload/cmake/caches/AMDGPUBot.cmake
+++ b/offload/cmake/caches/AMDGPUBot.cmake
@@ -17,6 +17,8 @@ set(LLVM_ENABLE_ASSERTIONS ON CACHE BOOL "")
set(LLVM_TARGETS_TO_BUILD "host;AMDGPU;SPIRV" CACHE STRING "")
set(LLVM_LIT_ARGS "-v --show-unsupported --timeout 100 --show-xfail -j 16" CACHE STRING "")
+set(LIBOMPTEST_BUILD_UNITTESTS ON CACHE BOOL "")
+
set(CLANG_DEFAULT_LINKER "lld" CACHE STRING "")
set(CLANG_DEFAULT_RTLIB "compiler-rt" STRING "")
diff --git a/openmp/runtime/src/include/omp-tools.h.var b/openmp/runtime/src/include/omp-tools.h.var
index 471f46a9073ee..e727fa3ebad28 100644
--- a/openmp/runtime/src/include/omp-tools.h.var
+++ b/openmp/runtime/src/include/omp-tools.h.var
@@ -1402,6 +1402,7 @@ typedef struct ompt_record_ompt_t {
ompt_record_target_map_t target_map;
ompt_record_target_kernel_t target_kernel;
ompt_record_control_tool_t control_tool;
+ ompt_record_error_t error;
} record;
} ompt_record_ompt_t;
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 37836fb457537..ae2d617c3ea40 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -1505,6 +1505,18 @@ kmp_int32
__kmpc_omp_reg_task_with_affinity(ident_t *loc_ref, kmp_int32 gtid,
kmp_task_t *new_task, kmp_int32 naffins,
kmp_task_affinity_info_t *affin_list) {
+ if (naffins > 0)
+ KMP_DEBUG_ASSERT(affin_list != NULL);
+
+ for (kmp_int32 i = 0; i < naffins; ++i) {
+ KA_TRACE(30, ("__kmpc_omp_reg_task_with_affinity: T#%d aff[%d] "
+ "base_addr=0x%llx len=%zu flags={%d,%d,%d}\n",
+ gtid, i, (unsigned long long)affin_list[i].base_addr,
+ affin_list[i].len, (int)affin_list[i].flags.flag1,
+ (int)affin_list[i].flags.flag2,
+ (int)affin_list[i].flags.reserved));
+ }
+
return 0;
}
diff --git a/orc-rt/docs/Design.md b/orc-rt/docs/Design.md
index 45d42d53c809e..dec0e11eebf05 100644
--- a/orc-rt/docs/Design.md
+++ b/orc-rt/docs/Design.md
@@ -21,8 +21,8 @@ process.
### Session
The Session object is the root object for a JIT'd program. It owns the
-ResourceManager instances that manage resources supporting JIT'd code (e.g.
-JIT'd memory, unwind info registrations, dynamic library handles, etc.).
+Service instances that provide services and manage resources supporting JIT'd
+code (e.g. JIT'd memory, unwind info registrations, dynamic library handles, etc.).
The Session object must be constructed prior to adding any JIT'd code, and must
outlive execution of any JIT'd code.
@@ -50,16 +50,17 @@ ControllerAccess objects may be detached before the session ends, at which point
JIT'd code may continue executing, but will receive no further calls from the
controller and can make no further calls to the controller.
-### ResourceManager
+### Service
-`ResourceManager` is an interface for classes that manage resources that support
-a JIT'd program, for example memory or loaded dylib handles. It provides two
-operations: `detach` and `shutdown`. The `shutdown` operation will be called at
-`Session` destruction time. The `detach` operation may be called if the
-controller detaches: since this means that no further requests for resource
-allocation or release will occur prior to the end of the Session
-ResourceManagers may implement this operation to abandon any fine-grained
-tracking or pre-reserved resources (e.g. address space).
+`Service` is an interface for classes that provide services to the Session,
+e.g. memory managers or dynamic library loaders.
+
+The `Service` interface provides two operations: `detach` and `shutdown`. The
+`shutdown` operation will be called at `Session` destruction time. The `detach`
+operation will be called if the controller detaches. Since this means that no
+further requests for service will be made by the controller, Services may
+implement this operation to abandon any fine-grained book-keeping that is only
+needed to serve further requests from the controller.
### TaskDispatcher
diff --git a/orc-rt/include/CMakeLists.txt b/orc-rt/include/CMakeLists.txt
index 8e68e8b04ae3b..e16941b9d24f8 100644
--- a/orc-rt/include/CMakeLists.txt
+++ b/orc-rt/include/CMakeLists.txt
@@ -7,20 +7,24 @@ set(ORC_RT_HEADERS
orc-rt/AllocAction.h
orc-rt/BitmaskEnum.h
orc-rt/Compiler.h
+ orc-rt/ControllerInterface.h
orc-rt/Error.h
orc-rt/ExecutorAddress.h
orc-rt/IntervalMap.h
orc-rt/IntervalSet.h
+ orc-rt/LockedAccess.h
orc-rt/Math.h
orc-rt/MemoryFlags.h
orc-rt/QueueingTaskDispatcher.h
- orc-rt/ResourceManager.h
orc-rt/RTTI.h
orc-rt/ScopeExit.h
+ orc-rt/Service.h
orc-rt/Session.h
orc-rt/SimpleNativeMemoryMap.h
orc-rt/SimplePackedSerialization.h
orc-rt/SPSAllocAction.h
+ orc-rt/sps-ci/AllSPSCI.h
+ orc-rt/sps-ci/SimpleNativeMemoryMapSPSCI.h
orc-rt/SPSMemoryFlags.h
orc-rt/SPSWrapperFunction.h
orc-rt/SPSWrapperFunctionBuffer.h
diff --git a/orc-rt/include/orc-rt/ControllerInterface.h b/orc-rt/include/orc-rt/ControllerInterface.h
new file mode 100644
index 0000000000000..54b310bf9e6e9
--- /dev/null
+++ b/orc-rt/include/orc-rt/ControllerInterface.h
@@ -0,0 +1,81 @@
+//===--- ControllerInterface.h -- Controller Interface Symtab ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Controller interface symbol table.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ORC_RT_CONTROLLERINTERFACE_H
+#define ORC_RT_CONTROLLERINTERFACE_H
+
+#include "orc-rt/Error.h"
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <vector>
+
+#define ORC_RT_SYMTAB_PAIR(sym) {{#sym}, reinterpret_cast<const void *>(&sym)}
+
+namespace orc_rt {
+
+/// A symbol table defining the interface exposed by the ORC runtime to the
+/// controller. Symbols are added via addSymbolsUnique, which rejects
+/// duplicates with an error.
+class ControllerInterface {
+public:
+ using SymbolTable = std::unordered_map<std::string, const void *>;
+ using iterator = SymbolTable::const_iterator;
+
+ bool empty() const noexcept { return Symbols.empty(); }
+ size_t size() const noexcept { return Symbols.size(); }
+ iterator begin() const noexcept { return Symbols.begin(); }
+ iterator end() const noexcept { return Symbols.end(); }
+
+ template <typename KeyT> decltype(auto) count(KeyT &&K) const {
+ return Symbols.count(std::forward<KeyT>(K));
+ }
+
+ template <typename KeyT> decltype(auto) at(KeyT &&K) const {
+ return Symbols.at(std::forward<KeyT>(K));
+ }
+
+ /// Adds symbol/address pairs from NewSymbols, first checking that all
+ /// symbols in NewSymbols are unique (i.e. not previously defined).
+ ///
+ /// NewSymbols must not contain any internal duplicates.
+ template <typename SymbolRangeT>
+ Error addSymbolsUnique(SymbolRangeT &&NewSymbols) {
+
+ // First check for duplicates, error out if any are found.
+ {
+ std::vector<std::string_view> Dups;
+ for (auto &[Name, Addr] : NewSymbols)
+ if (Symbols.count(Name))
+ Dups.push_back(Name);
+ if (!Dups.empty())
+ return makeDuplicatesError(std::move(Dups));
+ }
+
+ // No duplicates. Add entries.
+ for (auto &P : NewSymbols) {
+ [[maybe_unused]] bool Added = Symbols.insert(P).second;
+ assert(Added && "NewSymbols contains duplicate definitions");
+ }
+
+ return Error::success();
+ }
+
+private:
+ static Error makeDuplicatesError(std::vector<std::string_view> Dups);
+
+ SymbolTable Symbols;
+};
+
+} // namespace orc_rt
+
+#endif // ORC_RT_CONTROLLERINTERFACE_H
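
As a usage sketch, addSymbolsUnique accepts any range of name/address pairs, and ORC_RT_SYMTAB_PAIR builds such a pair from a function symbol. The function and helper names below are invented for illustration and are not defined by this patch:

  // Illustrative only: registering symbols with a ControllerInterface.
  #include "orc-rt/ControllerInterface.h"

  #include <string>
  #include <utility>

  // Stand-in for a real wrapper-function symbol exposed to the controller.
  static void my_example_sps_wrapper() {}

  orc_rt::Error registerExampleSymbols(orc_rt::ControllerInterface &CI) {
    std::pair<std::string, const void *> Syms[] = {
        ORC_RT_SYMTAB_PAIR(my_example_sps_wrapper)};
    // Returns a descriptive error if any name is already registered.
    return CI.addSymbolsUnique(Syms);
  }
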
diff --git a/orc-rt/include/orc-rt/LockedAccess.h b/orc-rt/include/orc-rt/LockedAccess.h
new file mode 100644
index 0000000000000..01878b50dd3e8
--- /dev/null
+++ b/orc-rt/include/orc-rt/LockedAccess.h
@@ -0,0 +1,86 @@
+//===---------- LockedAccess.h - Locked access wrapper ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Convenience wrapper for simple locked access to a value.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ORC_RT_LOCKEDACCESS_H
+#define ORC_RT_LOCKEDACCESS_H
+
+#include <mutex>
+#include <utility>
+
+namespace orc_rt {
+
+/// A convenience wrapper for simple locked access to a value.
+///
+/// LockedAccess acquires a lock on construction and releases it on
+/// destruction, providing pointer-like access to the value in between.
+/// All accessors are rvalue-ref-qualified, so LockedAccess can only be used
+/// as a temporary — it cannot be stored in a variable or member.
+///
+/// This is intended for simple, short critical sections where a class wants
+/// to return locked access to an internal value. For more complex locking
+/// patterns (e.g. lock/unlock/relock, condition variables, multiple locks)
+/// use std::unique_lock or std::scoped_lock directly.
+template <typename T, typename LockT,
+ typename MutexT = typename LockT::mutex_type>
+class LockedAccess {
+public:
+ /// Construct a LockedAccess that references \p R and locks \p M.
+ LockedAccess(T &R, MutexT &M) : Lock(M), R(R) {}
+
+ // LockedAccess is not copyable or movable.
+ LockedAccess(const LockedAccess &) = delete;
+ LockedAccess &operator=(const LockedAccess &) = delete;
+ LockedAccess(LockedAccess &&) = delete;
+ LockedAccess &operator=(LockedAccess &&) = delete;
+
+ /// Returns a reference to the locked value. The returned reference must not
+ /// be used after this LockedAccess temporary is destroyed, as the lock will
+ /// no longer be held.
+ T &operator*() && noexcept { return R; }
+ const T &operator*() const && noexcept { return R; }
+
+ /// Returns a pointer to the locked value for member access. The pointer must
+ /// not be used after this LockedAccess temporary is destroyed, as the lock
+ /// will no longer be held.
+ T *operator->() && noexcept { return &R; }
+ const T *operator->() const && noexcept { return &R; }
+
+ /// Calls \p Op with a mutable reference to the locked value, returning
+ /// whatever \p Op returns. The lock is held for the duration of the call.
+ /// Use this for multi-statement critical sections.
+ template <typename OpT>
+ decltype(auto)
+ with_ref(OpT &&Op) && noexcept(noexcept(std::forward<OpT>(Op)(R))) {
+ return std::forward<OpT>(Op)(R);
+ }
+
+ /// Calls \p Op with a const reference to the locked value, returning
+ /// whatever \p Op returns. The lock is held for the duration of the call.
+ template <typename OpT>
+ decltype(auto) with_ref(OpT &&Op) const && noexcept(
+ noexcept(std::forward<OpT>(Op)(std::as_const(R)))) {
+ return std::forward<OpT>(Op)(std::as_const(R));
+ }
+
+private:
+ LockT Lock;
+ T &R;
+};
+
+/// Deduction guide: defaults LockT to std::scoped_lock<MutexT>.
+template <typename T, typename MutexT>
+LockedAccess(T &, MutexT &)
+ -> LockedAccess<T, std::scoped_lock<MutexT>, MutexT>;
+
+} // namespace orc_rt
+
+#endif // ORC_RT_LOCKEDACCESS_H
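
A short usage sketch for LockedAccess: the accessor returns the wrapper as a temporary, and the lock is held only for the enclosing full expression. Counter and bumpAndRead are invented names for illustration:

  // Illustrative only: returning LockedAccess from an accessor.
  #include "orc-rt/LockedAccess.h"

  #include <mutex>

  class Counter {
  public:
    // The deduction guide picks LockedAccess<int, std::scoped_lock<std::mutex>>.
    auto value() { return orc_rt::LockedAccess(N, M); }

  private:
    std::mutex M;
    int N = 0;
  };

  int bumpAndRead(Counter &C) {
    // Single-expression critical section: the mutex is released as soon as
    // the temporary LockedAccess is destroyed.
    return C.value().with_ref([](int &N) { return ++N; });
  }
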
diff --git a/orc-rt/include/orc-rt/ResourceManager.h b/orc-rt/include/orc-rt/ResourceManager.h
deleted file mode 100644
index 6ae3b50f25f10..0000000000000
--- a/orc-rt/include/orc-rt/ResourceManager.h
+++ /dev/null
@@ -1,44 +0,0 @@
-//===- ResourceManager.h -- Interface for JIT resource managers -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// ResourceManager class and related APIs.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ORC_RT_RESOURCEMANAGER_H
-#define ORC_RT_RESOURCEMANAGER_H
-
-#include "orc-rt/Error.h"
-#include "orc-rt/move_only_function.h"
-
-namespace orc_rt {
-
-/// A ResourceManager manages resources (e.g. JIT'd memory) to support a JIT
-/// session.
-class ResourceManager {
-public:
- using OnCompleteFn = move_only_function<void(Error)>;
-
- virtual ~ResourceManager();
-
- /// The onDetach method will be called if the controller disconnects from the
- /// session without shutting the session down.
- ///
- /// Since no further requests for allocation will be made, the ResourceManager
- /// may discard any book-keeping data-structures used to support allocation.
- /// E.g. a JIT memory manager may discard its free-list, since no further
- /// JIT'd allocations will happen.
- virtual void onDetach(OnCompleteFn OnComplete) = 0;
-
- /// The onShutdown operation will be called at the end of the session.
- /// The ResourceManager should release all held resources.
- virtual void onShutdown(OnCompleteFn OnComplete) = 0;
-};
-} // namespace orc_rt
-
-#endif // ORC_RT_RESOURCEMANAGER_H
diff --git a/orc-rt/include/orc-rt/SPSWrapperFunction.h b/orc-rt/include/orc-rt/SPSWrapperFunction.h
index d61792db94351..4f32179545e58 100644
--- a/orc-rt/include/orc-rt/SPSWrapperFunction.h
+++ b/orc-rt/include/orc-rt/SPSWrapperFunction.h
@@ -18,7 +18,12 @@
#include "orc-rt/SimplePackedSerialization.h"
#include "orc-rt/WrapperFunction.h"
-#define ORC_RT_SPS_INTERFACE ORC_RT_INTERFACE
+#define ORC_RT_SPS_WRAPPER(Name, SPSSig, Handle) \
+ static void Name(orc_rt_SessionRef S, uint64_t CallId, \
+ orc_rt_WrapperFunctionReturn Return, \
+ orc_rt_WrapperFunctionBuffer ArgBytes) { \
+ SPSWrapperFunction<SPSSig>::handle(S, CallId, Return, ArgBytes, Handle); \
+ }
namespace orc_rt {
namespace detail {
diff --git a/orc-rt/include/orc-rt/Service.h b/orc-rt/include/orc-rt/Service.h
new file mode 100644
index 0000000000000..23c3f189ae6a4
--- /dev/null
+++ b/orc-rt/include/orc-rt/Service.h
@@ -0,0 +1,47 @@
+//===-------- Service.h - Interface for Session Services --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Service class and related APIs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ORC_RT_SERVICE_H
+#define ORC_RT_SERVICE_H
+
+#include "orc-rt/Error.h"
+#include "orc-rt/move_only_function.h"
+
+namespace orc_rt {
+
+/// A Service typically manages resources or performs actions on behalf of a
+/// Session, e.g. a memory manager service.
+/// Services are owned by the Session and are notified when the controller
+/// detaches and when the Session shuts down.
+class Service {
+public:
+ using OnCompleteFn = move_only_function<void()>;
+
+ virtual ~Service();
+
+ /// The onDetach method will be called if the controller disconnects from the
+ /// session without shutting the session down.
+ ///
+ /// Since no further requests to the Service will be made, the Service may
+ /// discard any book-keeping data-structures that are only needed to serve
+ /// ongoing requests. E.g. a JIT memory manager may discard its free-list,
+ /// since no further JIT'd allocations will happen.
+ virtual void onDetach(OnCompleteFn OnComplete) = 0;
+
+ /// The onShutdown operation will be called at the end of the session.
+ ///
+ /// The Service should release any held resources.
+ virtual void onShutdown(OnCompleteFn OnComplete) = 0;
+};
+} // namespace orc_rt
+
+#endif // ORC_RT_SERVICE_H
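
A minimal sketch of a conforming Service implementation; CachingService and its cache member are invented for illustration and are not part of this patch:

  // Illustrative only: a Service that drops its bookkeeping on detach and
  // releases everything on shutdown.
  #include "orc-rt/Service.h"

  #include <map>
  #include <string>

  class CachingService : public orc_rt::Service {
  public:
    void onDetach(OnCompleteFn OnComplete) override {
      // No further controller requests will arrive, so state kept only to
      // answer them can be discarded.
      Cache.clear();
      OnComplete();
    }

    void onShutdown(OnCompleteFn OnComplete) override {
      // End of session: release all held resources.
      Cache.clear();
      OnComplete();
    }

  private:
    std::map<std::string, std::string> Cache;
  };
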
diff --git a/orc-rt/include/orc-rt/Session.h b/orc-rt/include/orc-rt/Session.h
index 3494107306e67..2b726b817f8bc 100644
--- a/orc-rt/include/orc-rt/Session.h
+++ b/orc-rt/include/orc-rt/Session.h
@@ -13,8 +13,10 @@
#ifndef ORC_RT_SESSION_H
#define ORC_RT_SESSION_H
+#include "orc-rt/ControllerInterface.h"
#include "orc-rt/Error.h"
-#include "orc-rt/ResourceManager.h"
+#include "orc-rt/LockedAccess.h"
+#include "orc-rt/Service.h"
#include "orc-rt/TaskDispatcher.h"
#include "orc-rt/WrapperFunction.h"
#include "orc-rt/move_only_function.h"
@@ -116,9 +118,7 @@ class Session {
/// Note that entry into the reporter is not synchronized: it may be
/// called from multiple threads concurrently.
Session(std::unique_ptr<TaskDispatcher> Dispatcher,
- ErrorReporterFn ReportError)
- : Dispatcher(std::move(Dispatcher)), ReportError(std::move(ReportError)) {
- }
+ ErrorReporterFn ReportError);
// Sessions are not copyable or moveable.
Session(const Session &) = delete;
@@ -134,6 +134,10 @@ class Session {
/// Report an error via the ErrorReporter function.
void reportError(Error Err) { ReportError(std::move(Err)); }
+ /// Controller interface symbols map.
+ auto controllerInterface() { return LockedAccess(CI, M); }
+ auto controllerInterface() const { return LockedAccess(CI, M); }
+
/// Initiate session shutdown.
///
/// Runs shutdown on registered resources in reverse order.
@@ -142,8 +146,23 @@ class Session {
/// Initiate session shutdown and block until complete.
void waitForShutdown();
- /// Add a ResourceManager to the session.
- void addResourceManager(std::unique_ptr<ResourceManager> RM);
+ /// Add a Service to the session.
+ template <typename ServiceT>
+ ServiceT &addService(std::unique_ptr<ServiceT> Srv) {
+ assert(Srv && "addService called with null value");
+ ServiceT &Ref = *Srv;
+ std::scoped_lock<std::mutex> Lock(M);
+ assert(!SI && "addService called after shutdown");
+ Services.push_back(std::move(Srv));
+ return Ref;
+ }
+
+ /// Construct an instance of ServiceT from the given arguments and add it to
+ /// the Session.
+ template <typename ServiceT, typename... ArgTs>
+ ServiceT &createService(ArgTs &&...Args) {
+ return addService(std::make_unique<ServiceT>(std::forward<ArgTs>(Args)...));
+ }
/// Set the ControllerAccess object.
void setController(std::shared_ptr<ControllerAccess> CA);
@@ -163,11 +182,11 @@ class Session {
private:
struct ShutdownInfo {
bool Complete = false;
- std::vector<std::unique_ptr<ResourceManager>> ResourceMgrs;
+ std::vector<std::unique_ptr<Service>> Services;
std::vector<OnShutdownCompleteFn> OnCompletes;
};
- void shutdownNext(Error Err);
+ void shutdownNext();
void shutdownComplete();
void handleWrapperCall(uint64_t CallId, orc_rt_WrapperFunction Fn,
@@ -189,8 +208,9 @@ class Session {
std::shared_ptr<ControllerAccess> CA;
ErrorReporterFn ReportError;
- std::mutex M;
- std::vector<std::unique_ptr<ResourceManager>> ResourceMgrs;
+ mutable std::mutex M;
+ std::vector<std::unique_ptr<Service>> Services;
+ ControllerInterface CI;
std::unique_ptr<ShutdownInfo> SI;
};
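
Under the new API, registration presumably moves from addResourceManager(std::make_unique<...>()) to createService, and the controller interface map is reached through the locked accessor. A small sketch with an invented NoopService type (illustrative only):

  // Illustrative only: adding a Service and querying the controller interface.
  #include "orc-rt/Service.h"
  #include "orc-rt/Session.h"

  class NoopService : public orc_rt::Service {
  public:
    void onDetach(OnCompleteFn OnComplete) override { OnComplete(); }
    void onShutdown(OnCompleteFn OnComplete) override { OnComplete(); }
  };

  void exampleSetup(orc_rt::Session &S) {
    // createService constructs the Service in place; the returned reference
    // remains valid until the Session shuts the Service down.
    NoopService &Srv = S.createService<NoopService>();
    (void)Srv;

    // controllerInterface() returns a temporary LockedAccess, so the
    // Session's mutex is held only for this full expression.
    auto NumSyms = S.controllerInterface()->size();
    (void)NumSyms;
  }
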
diff --git a/orc-rt/include/orc-rt/SimpleNativeMemoryMap.h b/orc-rt/include/orc-rt/SimpleNativeMemoryMap.h
index 832f577636b13..d98e094c6fb9f 100644
--- a/orc-rt/include/orc-rt/SimpleNativeMemoryMap.h
+++ b/orc-rt/include/orc-rt/SimpleNativeMemoryMap.h
@@ -16,8 +16,7 @@
#include "orc-rt/AllocAction.h"
#include "orc-rt/Error.h"
#include "orc-rt/MemoryFlags.h"
-#include "orc-rt/ResourceManager.h"
-#include "orc-rt/SPSWrapperFunction.h"
+#include "orc-rt/Service.h"
#include "orc-rt/move_only_function.h"
#include <map>
@@ -40,7 +39,7 @@ namespace orc_rt {
/// 4. Release address space, deinitializing any remaining initialized
/// regions, and returning the address space to the system for reuse (if
/// the system permits).
-class SimpleNativeMemoryMap : public ResourceManager {
+class SimpleNativeMemoryMap : public Service {
public:
/// Reserves a slab of contiguous address space for allocation.
///
@@ -86,8 +85,8 @@ class SimpleNativeMemoryMap : public ResourceManager {
void deinitializeMultiple(OnDeinitializeCompleteFn &&OnComplete,
std::vector<void *> Bases);
- void onDetach(ResourceManager::OnCompleteFn OnComplete) override;
- void onShutdown(ResourceManager::OnCompleteFn OnComplete) override;
+ void onDetach(Service::OnCompleteFn OnComplete) override;
+ void onShutdown(Service::OnCompleteFn OnComplete) override;
private:
struct SlabInfo {
@@ -113,22 +112,4 @@ class SimpleNativeMemoryMap : public ResourceManager {
} // namespace orc_rt
-ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_reserve_sps_wrapper(
- orc_rt_SessionRef S, uint64_t CallId, orc_rt_WrapperFunctionReturn Return,
- orc_rt_WrapperFunctionBuffer ArgBytes);
-
-ORC_RT_SPS_INTERFACE void
-orc_rt_SimpleNativeMemoryMap_releaseMultiple_sps_wrapper(
- orc_rt_SessionRef S, uint64_t CallId, orc_rt_WrapperFunctionReturn Return,
- orc_rt_WrapperFunctionBuffer ArgBytes);
-
-ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_initialize_sps_wrapper(
- orc_rt_SessionRef S, uint64_t CallId, orc_rt_WrapperFunctionReturn Return,
- orc_rt_WrapperFunctionBuffer ArgBytes);
-
-ORC_RT_SPS_INTERFACE void
-orc_rt_SimpleNativeMemoryMap_deinitializeMultiple_sps_wrapper(
- orc_rt_SessionRef S, uint64_t CallId, orc_rt_WrapperFunctionReturn Return,
- orc_rt_WrapperFunctionBuffer ArgBytes);
-
#endif // ORC_RT_SIMPLENATIVEMEMORYMAP_H
diff --git a/orc-rt/include/orc-rt/iterator_range.h b/orc-rt/include/orc-rt/iterator_range.h
new file mode 100644
index 0000000000000..207932dde147d
--- /dev/null
+++ b/orc-rt/include/orc-rt/iterator_range.h
@@ -0,0 +1,47 @@
+//===---- iterator_range.h -- Simple iterator range template ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Simple iterator range template.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ORC_RT_ITERATOR_RANGE_H
+#define ORC_RT_ITERATOR_RANGE_H
+
+#include <iterator>
+
+namespace orc_rt {
+
+/// A simple wrapper around a pair of iterators, enabling range-based for
+/// loops over iterator pairs or subranges of containers.
+template <typename IteratorT> class iterator_range {
+public:
+ /// Construct an iterator_range from a container or range. The underlying
+ /// container must outlive this iterator_range.
+ template <typename Container>
+ iterator_range(Container &&C) : Begin(std::begin(C)), End(std::end(C)) {}
+
+ /// Construct an iterator_range from an explicit begin/end pair.
+ iterator_range(IteratorT Begin, IteratorT End)
+ : Begin(std::move(Begin)), End(std::move(End)) {}
+
+ IteratorT begin() const { return Begin; }
+ IteratorT end() const { return End; }
+ bool empty() const { return Begin == End; }
+
+private:
+ IteratorT Begin, End;
+};
+
+template <typename Container>
+iterator_range(Container &&)
+ -> iterator_range<decltype(std::begin(std::declval<Container &&>()))>;
+
+} // namespace orc_rt
+
+#endif // ORC_RT_ITERATOR_RANGE_H
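
iterator_range mirrors the LLVM utility of the same name; the new ControllerInterface.cpp below uses it to visit all but the first duplicate symbol name. A self-contained usage sketch (sumTail is an invented name):

  // Illustrative only: iterating a subrange without copying the container.
  #include "orc-rt/iterator_range.h"

  #include <iterator>
  #include <vector>

  int sumTail(const std::vector<int> &V) {
    if (V.empty())
      return 0;
    int Sum = 0;
    // The implicit deduction guide infers the iterator type from the pair.
    for (int X : orc_rt::iterator_range(std::next(V.begin()), V.end()))
      Sum += X;
    return Sum;
  }
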
diff --git a/orc-rt/include/orc-rt/sps-ci/AllSPSCI.h b/orc-rt/include/orc-rt/sps-ci/AllSPSCI.h
new file mode 100644
index 0000000000000..e92ba6ad39149
--- /dev/null
+++ b/orc-rt/include/orc-rt/sps-ci/AllSPSCI.h
@@ -0,0 +1,26 @@
+//===- AllSPSCI.h -- All SPS Controller Interface registrations -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Convenience header that includes all SPS Controller Interface headers and
+// declares addAll.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ORC_RT_SPS_CI_ALLSPSCI_H
+#define ORC_RT_SPS_CI_ALLSPSCI_H
+
+#include "orc-rt/sps-ci/SimpleNativeMemoryMapSPSCI.h"
+
+namespace orc_rt::sps_ci {
+
+/// Add all SPS interfaces to the controller interface.
+Error addAll(ControllerInterface &CI);
+
+} // namespace orc_rt::sps_ci
+
+#endif // ORC_RT_SPS_CI_ALLSPSCI_H
diff --git a/orc-rt/include/orc-rt/sps-ci/SimpleNativeMemoryMapSPSCI.h b/orc-rt/include/orc-rt/sps-ci/SimpleNativeMemoryMapSPSCI.h
new file mode 100644
index 0000000000000..7d8e705e1b3a8
--- /dev/null
+++ b/orc-rt/include/orc-rt/sps-ci/SimpleNativeMemoryMapSPSCI.h
@@ -0,0 +1,25 @@
+//===------- SimpleNativeMemoryMapSPSCI.h -- SNMM SPS CI --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// SPS Controller Interface registration for SimpleNativeMemoryMap.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ORC_RT_SPS_CI_SIMPLENATIVEMEMORYMAPSPSCI_H
+#define ORC_RT_SPS_CI_SIMPLENATIVEMEMORYMAPSPSCI_H
+
+#include "orc-rt/ControllerInterface.h"
+
+namespace orc_rt::sps_ci {
+
+/// Add the SimpleNativeMemoryMap SPS interface to the controller interface.
+Error addSimpleNativeMemoryMap(ControllerInterface &CI);
+
+} // namespace orc_rt::sps_ci
+
+#endif // ORC_RT_SPS_CI_SIMPLENATIVEMEMORYMAPSPSCI_H
diff --git a/orc-rt/lib/executor/CMakeLists.txt b/orc-rt/lib/executor/CMakeLists.txt
index cca5246b4d127..6d05386a097fc 100644
--- a/orc-rt/lib/executor/CMakeLists.txt
+++ b/orc-rt/lib/executor/CMakeLists.txt
@@ -1,13 +1,16 @@
set(files
AllocAction.cpp
+ ControllerInterface.cpp
Error.cpp
QueueingTaskDispatcher.cpp
- ResourceManager.cpp
RTTI.cpp
+ Service.cpp
Session.cpp
SimpleNativeMemoryMap.cpp
TaskDispatcher.cpp
ThreadPoolTaskDispatcher.cpp
+ sps-ci/AllSPSCI.cpp
+ sps-ci/SimpleNativeMemoryMapSPSCI.cpp
)
add_library(orc-rt-executor STATIC ${files})
diff --git a/orc-rt/lib/executor/ControllerInterface.cpp b/orc-rt/lib/executor/ControllerInterface.cpp
new file mode 100644
index 0000000000000..f065c1dfe01a7
--- /dev/null
+++ b/orc-rt/lib/executor/ControllerInterface.cpp
@@ -0,0 +1,34 @@
+//===- ControllerInterface.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains the implementation of APIs in the orc-rt/ControllerInterface.h
+// header.
+//
+//===----------------------------------------------------------------------===//
+
+#include "orc-rt/ControllerInterface.h"
+#include "orc-rt/iterator_range.h"
+
+#include <algorithm>
+
+namespace orc_rt {
+
+Error ControllerInterface::makeDuplicatesError(
+ std::vector<std::string_view> Dups) {
+ std::sort(Dups.begin(), Dups.end());
+ std::string ErrMsg = "Could not add duplicate symbols: [ ";
+ ErrMsg += Dups.front();
+ for (auto &Dup : iterator_range(std::next(Dups.begin()), Dups.end())) {
+ ErrMsg += ", ";
+ ErrMsg += Dup;
+ }
+ ErrMsg += " ]";
+ return make_error<StringError>(std::move(ErrMsg));
+}
+
+} // namespace orc_rt
diff --git a/orc-rt/lib/executor/ResourceManager.cpp b/orc-rt/lib/executor/Service.cpp
similarity index 65%
rename from orc-rt/lib/executor/ResourceManager.cpp
rename to orc-rt/lib/executor/Service.cpp
index 171def567a042..e4b75da416554 100644
--- a/orc-rt/lib/executor/ResourceManager.cpp
+++ b/orc-rt/lib/executor/Service.cpp
@@ -1,4 +1,4 @@
-//===- ResourceManager.cpp ------------------------------------------------===//
+//===- Service.cpp --------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,14 +6,14 @@
//
//===----------------------------------------------------------------------===//
//
-// Contains the implementation of APIs in the orc-rt/ResourceManager.h header.
+// Contains the implementation of APIs in the orc-rt/Service.h header.
//
//===----------------------------------------------------------------------===//
-#include "orc-rt/ResourceManager.h"
+#include "orc-rt/Service.h"
namespace orc_rt {
-ResourceManager::~ResourceManager() = default;
+Service::~Service() = default;
} // namespace orc_rt
diff --git a/orc-rt/lib/executor/Session.cpp b/orc-rt/lib/executor/Session.cpp
index 741abdc44bb98..029218eef8d5a 100644
--- a/orc-rt/lib/executor/Session.cpp
+++ b/orc-rt/lib/executor/Session.cpp
@@ -16,6 +16,15 @@ namespace orc_rt {
Session::ControllerAccess::~ControllerAccess() = default;
+Session::Session(std::unique_ptr<TaskDispatcher> Dispatcher,
+ ErrorReporterFn ReportError)
+ : Dispatcher(std::move(Dispatcher)), ReportError(std::move(ReportError)) {
+ std::pair<const char *, void *> InitialSymbols[] = {
+ {"orc_rt_SessionInstance", static_cast<void *>(this)}};
+
+ cantFail(CI.addSymbolsUnique(InitialSymbols));
+}
+
Session::~Session() { waitForShutdown(); }
void Session::shutdown(OnShutdownCompleteFn OnShutdownComplete) {
@@ -40,7 +49,7 @@ void Session::shutdown(OnShutdownCompleteFn OnShutdownComplete) {
// callbacks, then call shutdownNext below (outside the lock).
SI = std::make_unique<ShutdownInfo>();
SI->OnCompletes.push_back(std::move(OnShutdownComplete));
- std::swap(SI->ResourceMgrs, ResourceMgrs);
+ std::swap(SI->Services, Services);
}
}
@@ -53,7 +62,7 @@ void Session::shutdown(OnShutdownCompleteFn OnShutdownComplete) {
// OnShutdownComplete is _not_ set (i.e. was moved into the list of pending
// handlers), and we didn't return under the lock above, so we must be
// responsible for the shutdown. Call shutdownNext.
- shutdownNext(Error::success());
+ shutdownNext();
}
void Session::waitForShutdown() {
@@ -63,12 +72,6 @@ void Session::waitForShutdown() {
F.get();
}
-void Session::addResourceManager(std::unique_ptr<ResourceManager> RM) {
- std::scoped_lock<std::mutex> Lock(M);
- assert(!SI && "addResourceManager called after shutdown");
- ResourceMgrs.push_back(std::move(RM));
-}
-
void Session::setController(std::shared_ptr<ControllerAccess> CA) {
assert(CA && "Cannot attach null controller");
std::scoped_lock<std::mutex> Lock(M);
@@ -84,17 +87,14 @@ void Session::detachFromController() {
}
}
-void Session::shutdownNext(Error Err) {
- if (Err)
- reportError(std::move(Err));
-
- if (SI->ResourceMgrs.empty())
+void Session::shutdownNext() {
+ if (SI->Services.empty())
return shutdownComplete();
- // Get the next ResourceManager to shut down.
- auto NextRM = std::move(SI->ResourceMgrs.back());
- SI->ResourceMgrs.pop_back();
- NextRM->onShutdown([this](Error Err) { shutdownNext(std::move(Err)); });
+ // Get the next Service to shut down.
+ auto NextSrv = std::move(SI->Services.back());
+ SI->Services.pop_back();
+ NextSrv->onShutdown([this]() { shutdownNext(); });
}
void Session::shutdownComplete() {
diff --git a/orc-rt/lib/executor/SimpleNativeMemoryMap.cpp b/orc-rt/lib/executor/SimpleNativeMemoryMap.cpp
index 4fb31acf9def6..49ccf9382a009 100644
--- a/orc-rt/lib/executor/SimpleNativeMemoryMap.cpp
+++ b/orc-rt/lib/executor/SimpleNativeMemoryMap.cpp
@@ -14,8 +14,6 @@
//===----------------------------------------------------------------------===//
#include "orc-rt/SimpleNativeMemoryMap.h"
-#include "orc-rt/SPSAllocAction.h"
-#include "orc-rt/SPSMemoryFlags.h"
#include <sstream>
#if defined(__APPLE__) || defined(__linux__)
@@ -26,47 +24,6 @@
namespace orc_rt {
-struct SPSSimpleNativeMemoryMapSegment;
-
-template <>
-class SPSSerializationTraits<
- SPSSimpleNativeMemoryMapSegment,
- SimpleNativeMemoryMap::InitializeRequest::Segment> {
- using SPSType =
- SPSTuple<SPSAllocGroup, SPSExecutorAddr, uint64_t, SPSSequence<char>>;
-
-public:
- static bool
- deserialize(SPSInputBuffer &IB,
- SimpleNativeMemoryMap::InitializeRequest::Segment &S) {
- AllocGroup AG;
- ExecutorAddr Address;
- uint64_t Size;
- span<const char> Content;
- if (!SPSType::AsArgList::deserialize(IB, AG, Address, Size, Content))
- return false;
- if (Size > std::numeric_limits<size_t>::max())
- return false;
- S = {AG, Address.toPtr<char *>(), static_cast<size_t>(Size), Content};
- return true;
- }
-};
-
-struct SPSSimpleNativeMemoryMapInitializeRequest;
-
-template <>
-class SPSSerializationTraits<SPSSimpleNativeMemoryMapInitializeRequest,
- SimpleNativeMemoryMap::InitializeRequest> {
- using SPSType = SPSTuple<SPSSequence<SPSSimpleNativeMemoryMapSegment>,
- SPSSequence<SPSAllocActionPair>>;
-
-public:
- static bool deserialize(SPSInputBuffer &IB,
- SimpleNativeMemoryMap::InitializeRequest &FR) {
- return SPSType::AsArgList::deserialize(IB, FR.Segments, FR.AAPs);
- }
-};
-
void SimpleNativeMemoryMap::reserve(OnReserveCompleteFn &&OnComplete,
size_t Size) {
// FIXME: Get page size from session object.
@@ -222,14 +179,13 @@ void SimpleNativeMemoryMap::deinitializeMultiple(
Error::success());
}
-void SimpleNativeMemoryMap::onDetach(ResourceManager::OnCompleteFn OnComplete) {
+void SimpleNativeMemoryMap::onDetach(Service::OnCompleteFn OnComplete) {
// Detach is a noop for now: we just retain all actions to run at shutdown
// time.
- OnComplete(Error::success());
+ OnComplete();
}
-void SimpleNativeMemoryMap::onShutdown(
- ResourceManager::OnCompleteFn OnComplete) {
+void SimpleNativeMemoryMap::onShutdown(Service::OnCompleteFn OnComplete) {
// TODO: Establish a clear order to run deallocate actions across slabs,
// object boundaries.
@@ -302,10 +258,10 @@ void SimpleNativeMemoryMap::deinitializeNext(
NextAddr);
}
-void SimpleNativeMemoryMap::shutdownNext(
- ResourceManager::OnCompleteFn OnComplete, std::vector<void *> Bases) {
+void SimpleNativeMemoryMap::shutdownNext(Service::OnCompleteFn OnComplete,
+ std::vector<void *> Bases) {
if (Bases.empty())
- return OnComplete(Error::success());
+ return OnComplete();
auto *Base = Bases.back();
Bases.pop_back();
@@ -367,44 +323,4 @@ Error SimpleNativeMemoryMap::recordDeallocActions(
return Error::success();
}
-ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_reserve_sps_wrapper(
- orc_rt_SessionRef S, uint64_t CallId, orc_rt_WrapperFunctionReturn Return,
- orc_rt_WrapperFunctionBuffer ArgBytes) {
- using Sig = SPSExpected<SPSExecutorAddr>(SPSExecutorAddr, SPSSize);
- SPSWrapperFunction<Sig>::handle(
- S, CallId, Return, ArgBytes,
- WrapperFunction::handleWithAsyncMethod(&SimpleNativeMemoryMap::reserve));
-}
-
-ORC_RT_SPS_INTERFACE void
-orc_rt_SimpleNativeMemoryMap_releaseMultiple_sps_wrapper(
- orc_rt_SessionRef S, uint64_t CallId, orc_rt_WrapperFunctionReturn Return,
- orc_rt_WrapperFunctionBuffer ArgBytes) {
- using Sig = SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddr>);
- SPSWrapperFunction<Sig>::handle(S, CallId, Return, ArgBytes,
- WrapperFunction::handleWithAsyncMethod(
- &SimpleNativeMemoryMap::releaseMultiple));
-}
-
-ORC_RT_SPS_INTERFACE void orc_rt_SimpleNativeMemoryMap_initialize_sps_wrapper(
- orc_rt_SessionRef S, uint64_t CallId, orc_rt_WrapperFunctionReturn Return,
- orc_rt_WrapperFunctionBuffer ArgBytes) {
- using Sig = SPSExpected<SPSExecutorAddr>(
- SPSExecutorAddr, SPSSimpleNativeMemoryMapInitializeRequest);
- SPSWrapperFunction<Sig>::handle(S, CallId, Return, ArgBytes,
- WrapperFunction::handleWithAsyncMethod(
- &SimpleNativeMemoryMap::initialize));
-}
-
-ORC_RT_SPS_INTERFACE void
-orc_rt_SimpleNativeMemoryMap_deinitializeMultiple_sps_wrapper(
- orc_rt_SessionRef S, uint64_t CallId, orc_rt_WrapperFunctionReturn Return,
- orc_rt_WrapperFunctionBuffer ArgBytes) {
- using Sig = SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddr>);
- SPSWrapperFunction<Sig>::handle(
- S, CallId, Return, ArgBytes,
- WrapperFunction::handleWithAsyncMethod(
- &SimpleNativeMemoryMap::deinitializeMultiple));
-}
-
} // namespace orc_rt
diff --git a/orc-rt/lib/executor/sps-ci/AllSPSCI.cpp b/orc-rt/lib/executor/sps-ci/AllSPSCI.cpp
new file mode 100644
index 0000000000000..e1f002781132e
--- /dev/null
+++ b/orc-rt/lib/executor/sps-ci/AllSPSCI.cpp
@@ -0,0 +1,28 @@
+//===- AllSPSCI.cpp -------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of sps_ci::addAll.
+//
+//===----------------------------------------------------------------------===//
+
+#include "orc-rt/sps-ci/AllSPSCI.h"
+
+namespace orc_rt::sps_ci {
+
+Error addAll(ControllerInterface &CI) {
+ using AdderFn = Error (*)(ControllerInterface &);
+ AdderFn Adders[] = {addSimpleNativeMemoryMap};
+
+ for (auto *Adder : Adders)
+ if (auto Err = Adder(CI))
+ return Err;
+
+ return Error::success();
+}
+
+} // namespace orc_rt::sps_ci
diff --git a/orc-rt/lib/executor/sps-ci/SimpleNativeMemoryMapSPSCI.cpp b/orc-rt/lib/executor/sps-ci/SimpleNativeMemoryMapSPSCI.cpp
new file mode 100644
index 0000000000000..337f62b34a7fe
--- /dev/null
+++ b/orc-rt/lib/executor/sps-ci/SimpleNativeMemoryMapSPSCI.cpp
@@ -0,0 +1,104 @@
+//===- SimpleNativeMemoryMapSPSCI.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// SPS Controller Interface implementation for SimpleNativeMemoryMap.
+//
+//===----------------------------------------------------------------------===//
+
+#include "orc-rt/sps-ci/SimpleNativeMemoryMapSPSCI.h"
+
+#include "orc-rt/SPSAllocAction.h"
+#include "orc-rt/SPSMemoryFlags.h"
+#include "orc-rt/SPSWrapperFunction.h"
+#include "orc-rt/SimpleNativeMemoryMap.h"
+
+namespace orc_rt {
+
+struct SPSSimpleNativeMemoryMapSegment;
+
+template <>
+class SPSSerializationTraits<
+ SPSSimpleNativeMemoryMapSegment,
+ SimpleNativeMemoryMap::InitializeRequest::Segment> {
+ using SPSType =
+ SPSTuple<SPSAllocGroup, SPSExecutorAddr, uint64_t, SPSSequence<char>>;
+
+public:
+ static bool
+ deserialize(SPSInputBuffer &IB,
+ SimpleNativeMemoryMap::InitializeRequest::Segment &S) {
+ AllocGroup AG;
+ ExecutorAddr Address;
+ uint64_t Size;
+ span<const char> Content;
+ if (!SPSType::AsArgList::deserialize(IB, AG, Address, Size, Content))
+ return false;
+ if (Size > std::numeric_limits<size_t>::max())
+ return false;
+ S = {AG, Address.toPtr<char *>(), static_cast<size_t>(Size), Content};
+ return true;
+ }
+};
+
+struct SPSSimpleNativeMemoryMapInitializeRequest;
+
+template <>
+class SPSSerializationTraits<SPSSimpleNativeMemoryMapInitializeRequest,
+ SimpleNativeMemoryMap::InitializeRequest> {
+ using SPSType = SPSTuple<SPSSequence<SPSSimpleNativeMemoryMapSegment>,
+ SPSSequence<SPSAllocActionPair>>;
+
+public:
+ static bool deserialize(SPSInputBuffer &IB,
+ SimpleNativeMemoryMap::InitializeRequest &FR) {
+ return SPSType::AsArgList::deserialize(IB, FR.Segments, FR.AAPs);
+ }
+};
+
+namespace sps_ci {
+
+ORC_RT_SPS_WRAPPER(
+ orc_rt_sps_ci_SimpleNativeMemoryMap_reserve_sps_wrapper,
+ SPSExpected<SPSExecutorAddr>(SPSExecutorAddr, SPSSize),
+ WrapperFunction::handleWithAsyncMethod(&SimpleNativeMemoryMap::reserve))
+
+ORC_RT_SPS_WRAPPER(
+ orc_rt_sps_ci_SimpleNativeMemoryMap_releaseMultiple_sps_wrapper,
+ SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddr>),
+ WrapperFunction::handleWithAsyncMethod(
+ &SimpleNativeMemoryMap::releaseMultiple))
+
+ORC_RT_SPS_WRAPPER(
+ orc_rt_sps_ci_SimpleNativeMemoryMap_initialize_sps_wrapper,
+ SPSExpected<SPSExecutorAddr>(SPSExecutorAddr,
+ SPSSimpleNativeMemoryMapInitializeRequest),
+ WrapperFunction::handleWithAsyncMethod(&SimpleNativeMemoryMap::initialize))
+
+ORC_RT_SPS_WRAPPER(
+ orc_rt_sps_ci_SimpleNativeMemoryMap_deinitializeMultiple_sps_wrapper,
+ SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddr>),
+ WrapperFunction::handleWithAsyncMethod(
+ &SimpleNativeMemoryMap::deinitializeMultiple))
+
+static std::pair<const char *, const void *>
+ orc_rt_sps_ci_SimpleNativeMemoryMap_sps_interface[] = {
+ ORC_RT_SYMTAB_PAIR(
+ orc_rt_sps_ci_SimpleNativeMemoryMap_reserve_sps_wrapper),
+ ORC_RT_SYMTAB_PAIR(
+ orc_rt_sps_ci_SimpleNativeMemoryMap_releaseMultiple_sps_wrapper),
+ ORC_RT_SYMTAB_PAIR(
+ orc_rt_sps_ci_SimpleNativeMemoryMap_initialize_sps_wrapper),
+ ORC_RT_SYMTAB_PAIR(
+ orc_rt_sps_ci_SimpleNativeMemoryMap_deinitializeMultiple_sps_wrapper)};
+
+Error addSimpleNativeMemoryMap(ControllerInterface &CI) {
+ return CI.addSymbolsUnique(orc_rt_sps_ci_SimpleNativeMemoryMap_sps_interface);
+}
+
+} // namespace sps_ci
+} // namespace orc_rt
diff --git a/orc-rt/unittests/CMakeLists.txt b/orc-rt/unittests/CMakeLists.txt
index 963e4da71a200..a7cfa616c5ab5 100644
--- a/orc-rt/unittests/CMakeLists.txt
+++ b/orc-rt/unittests/CMakeLists.txt
@@ -15,6 +15,7 @@ add_orc_rt_unittest(CoreTests
AllocActionTest.cpp
BitmaskEnumTest.cpp
CallableTraitsHelperTest.cpp
+ ControllerInterfaceTest.cpp
EndianTest.cpp
ErrorCAPITest.cpp
ErrorTest.cpp
@@ -22,6 +23,7 @@ add_orc_rt_unittest(CoreTests
ExecutorAddressTest.cpp
IntervalMapTest.cpp
IntervalSetTest.cpp
+ LockedAccessTest.cpp
MathTest.cpp
MemoryFlagsTest.cpp
QueueingTaskDispatcherTest.cpp
@@ -29,6 +31,7 @@ add_orc_rt_unittest(CoreTests
ScopeExitTest.cpp
SessionTest.cpp
SimpleNativeMemoryMapTest.cpp
+ SimpleNativeMemoryMapSPSCITest.cpp
SimplePackedSerializationTest.cpp
SPSAllocActionTest.cpp
SPSMemoryFlagsTest.cpp
@@ -38,8 +41,10 @@ add_orc_rt_unittest(CoreTests
WrapperFunctionBufferTest.cpp
bind-test.cpp
bit-test.cpp
+ iterator_range-test.cpp
move_only_function-test.cpp
span-test.cpp
+
DISABLE_LLVM_LINK_LLVM_DYLIB
)
target_compile_options(CoreTests PRIVATE ${ORC_RT_COMPILE_FLAGS})
diff --git a/orc-rt/unittests/CommonTestUtils.h b/orc-rt/unittests/CommonTestUtils.h
index 1c66bddaf75be..d5a6c644537e1 100644
--- a/orc-rt/unittests/CommonTestUtils.h
+++ b/orc-rt/unittests/CommonTestUtils.h
@@ -9,7 +9,10 @@
#ifndef ORC_RT_UNITTEST_COMMONTESTUTILS_H
#define ORC_RT_UNITTEST_COMMONTESTUTILS_H
+#include "orc-rt/move_only_function.h"
+
#include <cstddef>
+#include <future>
template <size_t Idx = 0> class OpCounter {
public:
@@ -64,4 +67,17 @@ template <size_t Idx> size_t OpCounter<Idx>::MoveConstructions = 0;
template <size_t Idx> size_t OpCounter<Idx>::MoveAssignments = 0;
template <size_t Idx> size_t OpCounter<Idx>::Destructions = 0;
+template <typename T>
+orc_rt::move_only_function<void(T)> waitFor(std::future<T> &F) {
+ std::promise<T> P;
+ F = P.get_future();
+ return [P = std::move(P)](T Val) mutable { P.set_value(std::move(Val)); };
+}
+
+inline orc_rt::move_only_function<void()> waitFor(std::future<void> &F) {
+ std::promise<void> P;
+ F = P.get_future();
+ return [P = std::move(P)]() mutable { P.set_value(); };
+}
+
#endif // ORC_RT_UNITTEST_COMMONTESTUTILS_H
diff --git a/orc-rt/unittests/ControllerInterfaceTest.cpp b/orc-rt/unittests/ControllerInterfaceTest.cpp
new file mode 100644
index 0000000000000..9ca6aba2ca768
--- /dev/null
+++ b/orc-rt/unittests/ControllerInterfaceTest.cpp
@@ -0,0 +1,146 @@
+//===- ControllerInterfaceTest.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Tests for orc-rt's ControllerInterface.h APIs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "orc-rt/ControllerInterface.h"
+#include "gtest/gtest.h"
+
+#include <set>
+#include <string>
+
+using namespace orc_rt;
+
+TEST(ControllerInterfaceTest, EmptyByDefault) {
+ ControllerInterface CI;
+ EXPECT_TRUE(CI.empty());
+ EXPECT_EQ(CI.size(), 0U);
+ EXPECT_EQ(CI.begin(), CI.end());
+}
+
+TEST(ControllerInterfaceTest, AddSymbolsUnique) {
+ ControllerInterface CI;
+ int X = 0, Y = 0;
+ std::pair<const char *, void *> Syms[] = {{"orc_rt_A", &X}, {"orc_rt_B", &Y}};
+
+ auto Err = CI.addSymbolsUnique(Syms);
+ EXPECT_FALSE(Err) << "Unexpected error adding unique symbols";
+
+ EXPECT_EQ(CI.size(), 2U);
+ EXPECT_FALSE(CI.empty());
+ EXPECT_TRUE(CI.count("orc_rt_A"));
+ EXPECT_TRUE(CI.count("orc_rt_B"));
+ EXPECT_EQ(CI.at("orc_rt_A"), &X);
+ EXPECT_EQ(CI.at("orc_rt_B"), &Y);
+}
+
+TEST(ControllerInterfaceTest, AddConstPointers) {
+ ControllerInterface CI;
+ const int X = 42;
+ const int Y = 7;
+ std::pair<const char *, const void *> Syms[] = {{"orc_rt_A", &X},
+ {"orc_rt_B", &Y}};
+ cantFail(CI.addSymbolsUnique(Syms));
+
+ EXPECT_EQ(CI.at("orc_rt_A"), &X);
+ EXPECT_EQ(CI.at("orc_rt_B"), &Y);
+}
+
+TEST(ControllerInterfaceTest, AddSymbolsUniqueMultipleCalls) {
+ ControllerInterface CI;
+ int X = 0, Y = 0;
+
+ std::pair<const char *, void *> First[] = {{"orc_rt_A", &X}};
+ std::pair<const char *, void *> Second[] = {{"orc_rt_B", &Y}};
+
+ cantFail(CI.addSymbolsUnique(First));
+ cantFail(CI.addSymbolsUnique(Second));
+
+ EXPECT_EQ(CI.size(), 2U);
+ EXPECT_EQ(CI.at("orc_rt_A"), &X);
+ EXPECT_EQ(CI.at("orc_rt_B"), &Y);
+}
+
+TEST(ControllerInterfaceTest, AddSymbolsUniqueDuplicateRejected) {
+ ControllerInterface CI;
+ int X = 0, Y = 0;
+
+ std::pair<const char *, void *> First[] = {{"orc_rt_A", &X}};
+ cantFail(CI.addSymbolsUnique(First));
+
+ std::pair<const char *, void *> Second[] = {{"orc_rt_A", &Y}};
+ auto Err = CI.addSymbolsUnique(Second);
+ EXPECT_TRUE(Err.isA<StringError>());
+
+ auto ErrMsg = toString(std::move(Err));
+ EXPECT_NE(ErrMsg.find("orc_rt_A"), std::string::npos)
+ << "Error message should mention the duplicate symbol name";
+
+ // Original not overwritten.
+ EXPECT_EQ(CI.at("orc_rt_A"), &X);
+}
+
+TEST(ControllerInterfaceTest, AddSymbolsUniqueMultipleDuplicates) {
+ ControllerInterface CI;
+ int X = 0, Y = 0, Z = 0;
+
+ std::pair<const char *, void *> First[] = {{"orc_rt_A", &X},
+ {"orc_rt_B", &Y}};
+ cantFail(CI.addSymbolsUnique(First));
+
+ std::pair<const char *, void *> Second[] = {{"orc_rt_A", &Z},
+ {"orc_rt_B", &Z}};
+ auto Err = CI.addSymbolsUnique(Second);
+ EXPECT_TRUE(Err.isA<StringError>());
+
+ auto ErrMsg = toString(std::move(Err));
+ EXPECT_NE(ErrMsg.find("orc_rt_A"), std::string::npos);
+ EXPECT_NE(ErrMsg.find("orc_rt_B"), std::string::npos);
+
+ // Originals not overwritten.
+ EXPECT_EQ(CI.at("orc_rt_A"), &X);
+ EXPECT_EQ(CI.at("orc_rt_B"), &Y);
+}
+
+TEST(ControllerInterfaceTest, AddSymbolsUniqueAllOrNothing) {
+ ControllerInterface CI;
+ int X = 0, Y = 0, Z = 0;
+
+ std::pair<const char *, void *> First[] = {{"orc_rt_existing", &X}};
+ cantFail(CI.addSymbolsUnique(First));
+
+ // One new, one duplicate — neither should be added.
+ std::pair<const char *, void *> Second[] = {{"orc_rt_new", &Y},
+ {"orc_rt_existing", &Z}};
+ auto Err = CI.addSymbolsUnique(Second);
+ EXPECT_TRUE(Err.isA<StringError>());
+ consumeError(std::move(Err));
+
+ EXPECT_EQ(CI.size(), 1U);
+ EXPECT_EQ(CI.at("orc_rt_existing"), &X);
+ EXPECT_FALSE(CI.count("orc_rt_new"));
+}
+
+TEST(ControllerInterfaceTest, Iteration) {
+ ControllerInterface CI;
+ int X = 0, Y = 0, Z = 0;
+ std::pair<const char *, void *> Syms[] = {
+ {"orc_rt_A", &X}, {"orc_rt_B", &Y}, {"orc_rt_C", &Z}};
+ cantFail(CI.addSymbolsUnique(Syms));
+
+ std::set<std::string> Names;
+ for (auto &[Name, Addr] : CI)
+ Names.insert(Name);
+
+ EXPECT_EQ(Names.size(), 3U);
+ EXPECT_TRUE(Names.count("orc_rt_A"));
+ EXPECT_TRUE(Names.count("orc_rt_B"));
+ EXPECT_TRUE(Names.count("orc_rt_C"));
+}
diff --git a/orc-rt/unittests/LockedAccessTest.cpp b/orc-rt/unittests/LockedAccessTest.cpp
new file mode 100644
index 0000000000000..da93b370da139
--- /dev/null
+++ b/orc-rt/unittests/LockedAccessTest.cpp
@@ -0,0 +1,151 @@
+//===- LockedAccessTest.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Tests for orc-rt's LockedAccess.h APIs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "orc-rt/LockedAccess.h"
+#include "gtest/gtest.h"
+
+#include <mutex>
+#include <thread>
+#include <type_traits>
+
+using namespace orc_rt;
+
+namespace {
+
+template <typename T> struct Foo {
+ std::mutex M;
+ T Val{};
+
+ Foo() = default;
+ Foo(T Val) : Val(std::move(Val)) {}
+
+ auto get() { return LockedAccess(Val, M); }
+
+ auto getConst() { return LockedAccess(std::as_const(Val), M); }
+};
+
+} // anonymous namespace
+
+TEST(LockedAccessTest, ArrowRead) {
+ Foo<std::pair<int, int>> F({42, 7});
+ EXPECT_EQ(F.get()->first, 42);
+ EXPECT_EQ(F.get()->second, 7);
+}
+
+TEST(LockedAccessTest, ArrowWrite) {
+ Foo<std::pair<int, int>> F;
+ F.get()->first = 10;
+ F.get()->second = 20;
+ EXPECT_EQ(F.Val.first, 10);
+ EXPECT_EQ(F.Val.second, 20);
+}
+
+TEST(LockedAccessTest, ConstArrowReturnsConstPtr) {
+ Foo<int> F(42);
+ static_assert(std::is_const_v<
+ std::remove_pointer_t<decltype(F.getConst().operator->())>>,
+ "const overload should return const pointer");
+}
+
+TEST(LockedAccessTest, DerefRead) {
+ Foo<int> F(42);
+ EXPECT_EQ(*F.get(), 42);
+}
+
+TEST(LockedAccessTest, DerefWrite) {
+ Foo<int> F;
+ *F.get() = 7;
+ EXPECT_EQ(F.Val, 7);
+}
+
+TEST(LockedAccessTest, DerefPassToFunction) {
+ Foo<int> F(42);
+ auto timesTwo = [](int &X) { return X * 2; };
+ EXPECT_EQ(timesTwo(*F.get()), 84);
+}
+
+TEST(LockedAccessTest, ConstDerefReturnsConstRef) {
+ Foo<int> F(42);
+ static_assert(
+ std::is_const_v<std::remove_reference_t<decltype(*F.getConst())>>,
+ "const overload should return const reference");
+}
+
+TEST(LockedAccessTest, WithRefRead) {
+ Foo<int> F(42);
+ F.get().with_ref([](int &Y) { EXPECT_EQ(Y, 42); });
+}
+
+TEST(LockedAccessTest, WithRefWrite) {
+ Foo<int> F;
+ F.get().with_ref([](int &Y) { Y = 7; });
+ EXPECT_EQ(F.Val, 7);
+}
+
+TEST(LockedAccessTest, WithRefReturnValue) {
+ Foo<int> F(42);
+ int Result = F.get().with_ref([](int &Y) { return Y + 1; });
+ EXPECT_EQ(Result, 43);
+}
+
+TEST(LockedAccessTest, WithRefReturnReference) {
+ Foo<std::pair<int, int>> F({1, 2});
+ int &Ref = F.get().with_ref(
+ [](std::pair<int, int> &P) -> int & { return P.second; });
+ EXPECT_EQ(&Ref, &F.Val.second);
+}
+
+TEST(LockedAccessTest, ConstWithRefGetsConstReference) {
+ Foo<int> F(42);
+ F.getConst().with_ref([](const int &Y) { EXPECT_EQ(Y, 42); });
+}
+
+TEST(LockedAccessTest, WithRefMultiStatement) {
+ Foo<std::pair<int, int>> F;
+ F.get().with_ref([](std::pair<int, int> &P) {
+ P.first = 10;
+ P.second = P.first + 5;
+ });
+ EXPECT_EQ(F.Val.first, 10);
+ EXPECT_EQ(F.Val.second, 15);
+}
+
+TEST(LockedAccessTest, HoldsLockDuringArrow) {
+ struct Checker {
+ Foo<Checker> *F;
+ bool isLocked() { return !F->M.try_lock(); }
+ };
+ Foo<Checker> F({&F});
+ EXPECT_TRUE(F.get()->isLocked());
+}
+
+TEST(LockedAccessTest, HoldsLockDuringWithRef) {
+ Foo<int> F;
+ EXPECT_TRUE(F.get().with_ref([&](int &) { return !F.M.try_lock(); }));
+}
+
+TEST(LockedAccessTest, ProtectsAcrossThreads) {
+ Foo<int> F;
+ const int Iters = 10000;
+
+ auto Increment = [&]() {
+ for (int I = 0; I < Iters; ++I)
+ F.get().with_ref([](int &C) { ++C; });
+ };
+
+ std::thread T1(Increment);
+ std::thread T2(Increment);
+ T1.join();
+ T2.join();
+
+ EXPECT_EQ(F.Val, 2 * Iters);
+}
diff --git a/orc-rt/unittests/SessionTest.cpp b/orc-rt/unittests/SessionTest.cpp
index 7913a00503c5b..5f41477df12ad 100644
--- a/orc-rt/unittests/SessionTest.cpp
+++ b/orc-rt/unittests/SessionTest.cpp
@@ -26,33 +26,46 @@ using namespace orc_rt;
using ::testing::Eq;
using ::testing::Optional;
-class MockResourceManager : public ResourceManager {
+class MockService : public Service {
public:
enum class Op { Detach, Shutdown };
- static Error alwaysSucceed(Op) { return Error::success(); }
+ static void noop(Op) {}
- MockResourceManager(std::optional<size_t> &DetachOpIdx,
- std::optional<size_t> &ShutdownOpIdx, size_t &OpIdx,
- move_only_function<Error(Op)> GenResult = alwaysSucceed)
+ MockService(std::optional<size_t> &DetachOpIdx,
+ std::optional<size_t> &ShutdownOpIdx, size_t &OpIdx,
+ move_only_function<void(Op)> GenResult = noop)
: DetachOpIdx(DetachOpIdx), ShutdownOpIdx(ShutdownOpIdx), OpIdx(OpIdx),
GenResult(std::move(GenResult)) {}
void onDetach(OnCompleteFn OnComplete) override {
DetachOpIdx = OpIdx++;
- OnComplete(GenResult(Op::Detach));
+ GenResult(Op::Detach);
+ OnComplete();
}
void onShutdown(OnCompleteFn OnComplete) override {
ShutdownOpIdx = OpIdx++;
- OnComplete(GenResult(Op::Shutdown));
+ GenResult(Op::Shutdown);
+ OnComplete();
}
private:
std::optional<size_t> &DetachOpIdx;
std::optional<size_t> &ShutdownOpIdx;
size_t &OpIdx;
- move_only_function<Error(Op)> GenResult;
+ move_only_function<void(Op)> GenResult;
+};
+
+class ConfigurableService : public Service {
+public:
+ ConfigurableService(int ConstructorOption) {}
+
+ void onDetach(OnCompleteFn OnComplete) override { OnComplete(); }
+
+ void onShutdown(OnCompleteFn OnComplete) override { OnComplete(); }
+
+ void doMoreConfig(int) noexcept {}
};
class NoDispatcher : public TaskDispatcher {
@@ -292,15 +305,15 @@ TEST(SessionTest, DispatchTask) {
EXPECT_EQ(X, 1);
}
-TEST(SessionTest, SingleResourceManager) {
+TEST(SessionTest, SingleService) {
size_t OpIdx = 0;
std::optional<size_t> DetachOpIdx;
std::optional<size_t> ShutdownOpIdx;
{
Session S(std::make_unique<NoDispatcher>(), noErrors);
- S.addResourceManager(std::make_unique<MockResourceManager>(
- DetachOpIdx, ShutdownOpIdx, OpIdx));
+ S.addService(
+ std::make_unique<MockService>(DetachOpIdx, ShutdownOpIdx, OpIdx));
}
EXPECT_EQ(OpIdx, 1U);
@@ -308,7 +321,7 @@ TEST(SessionTest, SingleResourceManager) {
EXPECT_THAT(ShutdownOpIdx, Optional(Eq(0)));
}
-TEST(SessionTest, MultipleResourceManagers) {
+TEST(SessionTest, MultipleServices) {
size_t OpIdx = 0;
std::optional<size_t> DetachOpIdx[3];
std::optional<size_t> ShutdownOpIdx[3];
@@ -316,8 +329,8 @@ TEST(SessionTest, MultipleResourceManagers) {
{
Session S(std::make_unique<NoDispatcher>(), noErrors);
for (size_t I = 0; I != 3; ++I)
- S.addResourceManager(std::make_unique<MockResourceManager>(
- DetachOpIdx[I], ShutdownOpIdx[I], OpIdx));
+ S.addService(std::make_unique<MockService>(DetachOpIdx[I],
+ ShutdownOpIdx[I], OpIdx));
}
EXPECT_EQ(OpIdx, 3U);
@@ -330,7 +343,7 @@ TEST(SessionTest, MultipleResourceManagers) {
TEST(SessionTest, ExpectedShutdownSequence) {
// Check that Session shutdown results in...
- // 1. ResourceManagers being shut down.
+ // 1. Services being shut down.
// 2. The TaskDispatcher being shut down.
// 3. A call to OnShutdownComplete.
@@ -350,8 +363,8 @@ TEST(SessionTest, ExpectedShutdownSequence) {
DispatcherShutDown = true;
}),
noErrors);
- S.addResourceManager(
- std::make_unique<MockResourceManager>(DetachOpIdx, ShutdownOpIdx, OpIdx));
+ S.addService(
+ std::make_unique<MockService>(DetachOpIdx, ShutdownOpIdx, OpIdx));
S.shutdown([&]() {
EXPECT_TRUE(DispatcherShutDown);
@@ -362,6 +375,50 @@ TEST(SessionTest, ExpectedShutdownSequence) {
EXPECT_TRUE(SessionShutdownComplete);
}
+TEST(SessionTest, AddServiceAndUseRef) {
+ Session S(std::make_unique<NoDispatcher>(), noErrors);
+ auto &CS = S.addService(std::make_unique<ConfigurableService>(42));
+ CS.doMoreConfig(1);
+}
+
+TEST(SessionTest, CreateServiceAndUseRef) {
+ Session S(std::make_unique<NoDispatcher>(), noErrors);
+ auto &CS = S.createService<ConfigurableService>(42);
+ CS.doMoreConfig(1);
+}
+
+TEST(SessionTest, ControllerInterfaceContainsSessionByDefault) {
+ Session S(std::make_unique<NoDispatcher>(), noErrors);
+ ASSERT_TRUE(S.controllerInterface()->count("orc_rt_SessionInstance"));
+ EXPECT_EQ(S.controllerInterface()->at("orc_rt_SessionInstance"),
+ static_cast<void *>(&S));
+}
+
+TEST(SessionTest, ControllerInterfaceWithRef) {
+ Session S(std::make_unique<NoDispatcher>(), noErrors);
+ int X = 0, Y = 0;
+ S.controllerInterface().with_ref([&](ControllerInterface &CI) {
+ std::pair<const char *, void *> Syms[] = {
+ {"orc_rt_A", static_cast<void *>(&X)},
+ {"orc_rt_B", static_cast<void *>(&Y)}};
+ cantFail(CI.addSymbolsUnique(Syms));
+ });
+
+ EXPECT_EQ(S.controllerInterface()->at("orc_rt_A"), &X);
+ EXPECT_EQ(S.controllerInterface()->at("orc_rt_B"), &Y);
+}
+
+TEST(SessionTest, ControllerInterfaceConstAccess) {
+ Session S(std::make_unique<NoDispatcher>(), noErrors);
+ int X = 0;
+ std::pair<const char *, void *> Syms[] = {{"orc_rt_X", &X}};
+ cantFail(S.controllerInterface()->addSymbolsUnique(Syms));
+
+ const Session &CS = S;
+ ASSERT_TRUE(CS.controllerInterface()->count("orc_rt_X"));
+ EXPECT_EQ(CS.controllerInterface()->at("orc_rt_X"), &X);
+}
+
TEST(ControllerAccessTest, Basics) {
// Test that we can set the ControllerAccess implementation and still shut
// down as expected.
diff --git a/orc-rt/unittests/SimpleNativeMemoryMapSPSCITest.cpp b/orc-rt/unittests/SimpleNativeMemoryMapSPSCITest.cpp
new file mode 100644
index 0000000000000..d8881030b6784
--- /dev/null
+++ b/orc-rt/unittests/SimpleNativeMemoryMapSPSCITest.cpp
@@ -0,0 +1,323 @@
+//===- SimpleNativeMemoryMapSPSCITest.cpp ---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Tests for SimpleNativeMemoryMap's SPS Controller Interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "orc-rt/sps-ci/SimpleNativeMemoryMapSPSCI.h"
+#include "orc-rt/SPSAllocAction.h"
+#include "orc-rt/SPSMemoryFlags.h"
+#include "orc-rt/SPSWrapperFunction.h"
+#include "orc-rt/SimpleNativeMemoryMap.h"
+
+#include "AllocActionTestUtils.h"
+#include "CommonTestUtils.h"
+#include "DirectCaller.h"
+#include "gtest/gtest.h"
+
+using namespace orc_rt;
+
+namespace orc_rt {
+
+struct SPSSimpleNativeMemoryMapSegment;
+struct SPSSimpleNativeMemoryMapInitializeRequest;
+
+/// A SimpleNativeMemoryMap::InitializeRequest::Segment plus segment content (if
+/// segment content type is regular).
+struct TestSNMMSegment
+ : public SimpleNativeMemoryMap::InitializeRequest::Segment {
+
+ TestSNMMSegment(AllocGroup AG, char *Address, size_t Size,
+ std::vector<char> C = {})
+ : SimpleNativeMemoryMap::InitializeRequest::Segment(
+ {AG, Address, Size, {}}),
+ OwnedContent(std::move(C)) {
+ this->Content = {OwnedContent.data(), OwnedContent.size()};
+ }
+
+ std::vector<char> OwnedContent;
+};
+
+template <>
+class SPSSerializationTraits<SPSSimpleNativeMemoryMapSegment, TestSNMMSegment> {
+ using SPSType =
+ SPSTuple<SPSAllocGroup, SPSExecutorAddr, uint64_t, SPSSequence<char>>;
+
+public:
+ static size_t size(const TestSNMMSegment &S) {
+ return SPSType::AsArgList::size(S.AG, ExecutorAddr::fromPtr(S.Address),
+ static_cast<uint64_t>(S.Size), S.Content);
+ }
+
+ static bool serialize(SPSOutputBuffer &OB, const TestSNMMSegment &S) {
+ return SPSType::AsArgList::serialize(
+ OB, S.AG, ExecutorAddr::fromPtr(S.Address),
+ static_cast<uint64_t>(S.Size), S.Content);
+ }
+};
+
+struct TestSNMMInitializeRequest {
+ std::vector<TestSNMMSegment> Segments;
+ std::vector<AllocActionPair> AAPs;
+};
+
+template <>
+class SPSSerializationTraits<SPSSimpleNativeMemoryMapInitializeRequest,
+ TestSNMMInitializeRequest> {
+ using SPSType = SPSTuple<SPSSequence<SPSSimpleNativeMemoryMapSegment>,
+ SPSSequence<SPSAllocActionPair>>;
+
+public:
+ static size_t size(const TestSNMMInitializeRequest &IR) {
+ return SPSType::AsArgList::size(IR.Segments, IR.AAPs);
+ }
+ static bool serialize(SPSOutputBuffer &OB,
+ const TestSNMMInitializeRequest &IR) {
+ return SPSType::AsArgList::serialize(OB, IR.Segments, IR.AAPs);
+ }
+};
+
+} // namespace orc_rt
+
+// Write the given value to the address pointed to by P.
+static orc_rt_WrapperFunctionBuffer
+write_value_sps_allocaction(const char *ArgData, size_t ArgSize) {
+ return SPSAllocActionFunction<SPSExecutorAddr, uint64_t>::handle(
+ ArgData, ArgSize,
+ [](ExecutorAddr P, uint64_t Val) {
+ *P.toPtr<uint64_t *>() = Val;
+ return WrapperFunctionBuffer();
+ })
+ .release();
+}
+
+// Read the uint64_t value at Src and write it to Dst.
+static orc_rt_WrapperFunctionBuffer
+read_value_sps_allocaction(const char *ArgData, size_t ArgSize) {
+ return SPSAllocActionFunction<SPSExecutorAddr, SPSExecutorAddr>::handle(
+ ArgData, ArgSize,
+ [](ExecutorAddr Dst, ExecutorAddr Src) {
+ *Dst.toPtr<uint64_t *>() = *Src.toPtr<uint64_t *>();
+ return WrapperFunctionBuffer();
+ })
+ .release();
+}
+
+class SimpleNativeMemoryMapSPSCITest : public ::testing::Test {
+protected:
+ void SetUp() override {
+ cantFail(sps_ci::addSimpleNativeMemoryMap(CI));
+ SNMM = std::make_unique<SimpleNativeMemoryMap>();
+ }
+
+ void TearDown() override {
+ if (SNMM) {
+ std::future<void> F;
+ SNMM->onShutdown(waitFor(F));
+ F.get();
+ }
+ }
+
+ DirectCaller caller(const char *Name) {
+ return DirectCaller(nullptr, reinterpret_cast<orc_rt_WrapperFunction>(
+ const_cast<void *>(CI.at(Name))));
+ }
+
+ template <typename OnCompleteFn>
+ void spsReserve(OnCompleteFn &&OnComplete, size_t Size) {
+ using SPSSig = SPSExpected<SPSExecutorAddr>(SPSExecutorAddr, SPSSize);
+ SPSWrapperFunction<SPSSig>::call(
+ caller("orc_rt_sps_ci_SimpleNativeMemoryMap_reserve_sps_wrapper"),
+ std::forward<OnCompleteFn>(OnComplete), SNMM.get(), Size);
+ }
+
+ template <typename OnCompleteFn>
+ void spsReleaseMultiple(OnCompleteFn &&OnComplete, span<void *> Addrs) {
+ using SPSSig = SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddr>);
+ SPSWrapperFunction<SPSSig>::call(
+ caller(
+ "orc_rt_sps_ci_SimpleNativeMemoryMap_releaseMultiple_sps_wrapper"),
+ std::forward<OnCompleteFn>(OnComplete), SNMM.get(), Addrs);
+ }
+
+ template <typename OnCompleteFn>
+ void spsInitialize(OnCompleteFn &&OnComplete, TestSNMMInitializeRequest IR) {
+ using SPSSig = SPSExpected<SPSExecutorAddr>(
+ SPSExecutorAddr, SPSSimpleNativeMemoryMapInitializeRequest);
+ SPSWrapperFunction<SPSSig>::call(
+ caller("orc_rt_sps_ci_SimpleNativeMemoryMap_initialize_sps_wrapper"),
+ std::forward<OnCompleteFn>(OnComplete), SNMM.get(), std::move(IR));
+ }
+
+ template <typename OnCompleteFn>
+ void spsDeinitializeMultiple(OnCompleteFn &&OnComplete, span<void *> Bases) {
+ using SPSSig = SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddr>);
+ SPSWrapperFunction<SPSSig>::call(
+ caller("orc_rt_sps_ci_SimpleNativeMemoryMap_deinitializeMultiple_sps_"
+ "wrapper"),
+ std::forward<OnCompleteFn>(OnComplete), SNMM.get(), Bases);
+ }
+
+ ControllerInterface CI;
+ std::unique_ptr<SimpleNativeMemoryMap> SNMM;
+};
+
+TEST_F(SimpleNativeMemoryMapSPSCITest, Registration) {
+ EXPECT_TRUE(
+ CI.count("orc_rt_sps_ci_SimpleNativeMemoryMap_reserve_sps_wrapper"));
+ EXPECT_TRUE(CI.count(
+ "orc_rt_sps_ci_SimpleNativeMemoryMap_releaseMultiple_sps_wrapper"));
+ EXPECT_TRUE(
+ CI.count("orc_rt_sps_ci_SimpleNativeMemoryMap_initialize_sps_wrapper"));
+ EXPECT_TRUE(CI.count(
+ "orc_rt_sps_ci_SimpleNativeMemoryMap_deinitializeMultiple_sps_wrapper"));
+}
+
+TEST_F(SimpleNativeMemoryMapSPSCITest, ReserveAndRelease) {
+ std::future<Expected<Expected<void *>>> ReserveAddr;
+ spsReserve(waitFor(ReserveAddr), 1024 * 1024 * 1024);
+ auto *Addr = cantFail(cantFail(ReserveAddr.get()));
+
+ std::future<Expected<Error>> ReleaseResult;
+ spsReleaseMultiple(waitFor(ReleaseResult), {&Addr, 1});
+ cantFail(cantFail(ReleaseResult.get()));
+}
+
+TEST_F(SimpleNativeMemoryMapSPSCITest, FullPipelineForOneRWSegment) {
+ std::future<Expected<Expected<void *>>> ReserveAddr;
+ spsReserve(waitFor(ReserveAddr), 1024 * 1024 * 1024);
+ void *Addr = cantFail(cantFail(ReserveAddr.get()));
+
+ std::future<Expected<Expected<void *>>> InitializeKey;
+ TestSNMMInitializeRequest IR;
+ char *InitializeBase = reinterpret_cast<char *>(Addr) + 64 * 1024;
+ uint64_t SentinelValue1 = 0;
+ uint64_t SentinelValue2 = 0;
+ uint64_t SentinelValue3 = 42;
+
+ std::vector<char> Content;
+ Content.resize(sizeof(uint64_t) * 2);
+ memcpy(Content.data(), &SentinelValue3, sizeof(uint64_t));
+ memcpy(Content.data() + sizeof(uint64_t), &SentinelValue1, sizeof(uint64_t));
+
+ IR.Segments.push_back({MemProt::Read | MemProt::Write, InitializeBase,
+ 64 * 1024, std::move(Content)});
+
+ IR.AAPs.push_back(
+ {*MakeAllocAction<SPSExecutorAddr, SPSExecutorAddr>::from(
+ read_value_sps_allocaction, ExecutorAddr::fromPtr(&SentinelValue1),
+ ExecutorAddr::fromPtr(InitializeBase)),
+ {}});
+
+ IR.AAPs.push_back(
+ {*MakeAllocAction<SPSExecutorAddr, uint64_t>::from(
+ write_value_sps_allocaction,
+ ExecutorAddr::fromPtr(InitializeBase) + sizeof(uint64_t),
+ uint64_t(42)),
+ *MakeAllocAction<SPSExecutorAddr, SPSExecutorAddr>::from(
+ read_value_sps_allocaction, ExecutorAddr::fromPtr(&SentinelValue2),
+ ExecutorAddr::fromPtr(InitializeBase) + sizeof(uint64_t))});
+
+ IR.AAPs.push_back(
+ {*MakeAllocAction<SPSExecutorAddr, SPSExecutorAddr>::from(
+ read_value_sps_allocaction, ExecutorAddr::fromPtr(&SentinelValue3),
+ ExecutorAddr::fromPtr(InitializeBase) + sizeof(uint64_t) * 2),
+ {}});
+
+ spsInitialize(waitFor(InitializeKey), std::move(IR));
+ void *InitializeKeyAddr = cantFail(cantFail(InitializeKey.get()));
+
+ EXPECT_EQ(SentinelValue1, 42U);
+ EXPECT_EQ(SentinelValue2, 0U);
+ EXPECT_EQ(SentinelValue3, 0U);
+
+ std::future<Expected<Error>> DeallocResult;
+ spsDeinitializeMultiple(waitFor(DeallocResult), {&InitializeKeyAddr, 1});
+ cantFail(cantFail(DeallocResult.get()));
+
+ EXPECT_EQ(SentinelValue1, 42U);
+ EXPECT_EQ(SentinelValue2, 42U);
+ EXPECT_EQ(SentinelValue3, 0U);
+
+ std::future<Expected<Error>> ReleaseResult;
+ spsReleaseMultiple(waitFor(ReleaseResult), {&Addr, 1});
+ cantFail(cantFail(ReleaseResult.get()));
+}
+
+TEST_F(SimpleNativeMemoryMapSPSCITest, ReserveInitializeShutdown) {
+ std::future<Expected<Expected<void *>>> ReserveAddr;
+ spsReserve(waitFor(ReserveAddr), 1024 * 1024 * 1024);
+ void *Addr = cantFail(cantFail(ReserveAddr.get()));
+
+ std::future<Expected<Expected<void *>>> InitializeKey;
+ TestSNMMInitializeRequest IR;
+ char *InitializeBase = reinterpret_cast<char *>(Addr) + 64 * 1024;
+ uint64_t SentinelValue = 0;
+
+ IR.Segments.push_back(
+ {MemProt::Read | MemProt::Write, InitializeBase, 64 * 1024});
+
+ IR.AAPs.push_back(
+ {*MakeAllocAction<SPSExecutorAddr, uint64_t>::from(
+ write_value_sps_allocaction, ExecutorAddr::fromPtr(InitializeBase),
+ uint64_t(42)),
+ *MakeAllocAction<SPSExecutorAddr, SPSExecutorAddr>::from(
+ read_value_sps_allocaction, ExecutorAddr::fromPtr(&SentinelValue),
+ ExecutorAddr::fromPtr(InitializeBase))});
+ spsInitialize(waitFor(InitializeKey), std::move(IR));
+ cantFail(cantFail(InitializeKey.get()));
+
+ EXPECT_EQ(SentinelValue, 0U);
+
+ std::future<void> ShutdownResult;
+ SNMM->onShutdown(waitFor(ShutdownResult));
+ ShutdownResult.get();
+ SNMM.reset();
+
+ EXPECT_EQ(SentinelValue, 42);
+}
+
+TEST_F(SimpleNativeMemoryMapSPSCITest, ReserveInitializeDetachShutdown) {
+ std::future<Expected<Expected<void *>>> ReserveAddr;
+ spsReserve(waitFor(ReserveAddr), 1024 * 1024 * 1024);
+ void *Addr = cantFail(cantFail(ReserveAddr.get()));
+
+ std::future<Expected<Expected<void *>>> InitializeKey;
+ TestSNMMInitializeRequest IR;
+ char *InitializeBase = reinterpret_cast<char *>(Addr) + 64 * 1024;
+ uint64_t SentinelValue = 0;
+
+ IR.Segments.push_back(
+ {MemProt::Read | MemProt::Write, InitializeBase, 64 * 1024});
+
+ IR.AAPs.push_back(
+ {*MakeAllocAction<SPSExecutorAddr, uint64_t>::from(
+ write_value_sps_allocaction, ExecutorAddr::fromPtr(InitializeBase),
+ uint64_t(42)),
+ *MakeAllocAction<SPSExecutorAddr, SPSExecutorAddr>::from(
+ read_value_sps_allocaction, ExecutorAddr::fromPtr(&SentinelValue),
+ ExecutorAddr::fromPtr(InitializeBase))});
+ spsInitialize(waitFor(InitializeKey), std::move(IR));
+ cantFail(cantFail(InitializeKey.get()));
+
+ EXPECT_EQ(SentinelValue, 0U);
+
+ std::future<void> DetachResult;
+ SNMM->onDetach(waitFor(DetachResult));
+ DetachResult.get();
+
+ EXPECT_EQ(SentinelValue, 0);
+
+ std::future<void> ShutdownResult;
+ SNMM->onShutdown(waitFor(ShutdownResult));
+ ShutdownResult.get();
+ SNMM.reset();
+
+ EXPECT_EQ(SentinelValue, 42);
+}
diff --git a/orc-rt/unittests/SimpleNativeMemoryMapTest.cpp b/orc-rt/unittests/SimpleNativeMemoryMapTest.cpp
index 0191c2e27cd21..adb590ad69bf2 100644
--- a/orc-rt/unittests/SimpleNativeMemoryMapTest.cpp
+++ b/orc-rt/unittests/SimpleNativeMemoryMapTest.cpp
@@ -1,4 +1,4 @@
-//===-- SPSNativeMemoryMapTest.cpp ----------------------------------------===//
+//===-- SimpleNativeMemoryMapTest.cpp -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,154 +6,21 @@
//
//===----------------------------------------------------------------------===//
//
-// Test SPS serialization for MemoryFlags APIs.
+// Test SimpleNativeMemoryMap APIs.
//
//===----------------------------------------------------------------------===//
#include "orc-rt/SimpleNativeMemoryMap.h"
#include "orc-rt/SPSAllocAction.h"
-#include "orc-rt/SPSMemoryFlags.h"
#include "AllocActionTestUtils.h"
-#include "DirectCaller.h"
+#include "CommonTestUtils.h"
#include "gtest/gtest.h"
-#include <future>
+#include <cstring>
using namespace orc_rt;
-namespace orc_rt {
-
-struct SPSSimpleNativeMemoryMapSegment;
-
-/// A SimpleNativeMemoryMap::InitializeRequest::Segment plus segment content (if
-/// segment content type is regular).
-struct TestSNMMSegment
- : public SimpleNativeMemoryMap::InitializeRequest::Segment {
-
- TestSNMMSegment(AllocGroup AG, char *Address, size_t Size,
- std::vector<char> C = {})
- : SimpleNativeMemoryMap::InitializeRequest::Segment(
- {AG, Address, Size, {}}),
- OwnedContent(std::move(C)) {
- this->Content = {OwnedContent.data(), OwnedContent.size()};
- }
-
- std::vector<char> OwnedContent;
-};
-
-template <>
-class SPSSerializationTraits<SPSSimpleNativeMemoryMapSegment, TestSNMMSegment> {
- using SPSType =
- SPSTuple<SPSAllocGroup, SPSExecutorAddr, uint64_t, SPSSequence<char>>;
-
-public:
- static size_t size(const TestSNMMSegment &S) {
- return SPSType::AsArgList::size(S.AG, ExecutorAddr::fromPtr(S.Address),
- static_cast<uint64_t>(S.Size), S.Content);
- }
-
- static bool serialize(SPSOutputBuffer &OB, const TestSNMMSegment &S) {
- return SPSType::AsArgList::serialize(
- OB, S.AG, ExecutorAddr::fromPtr(S.Address),
- static_cast<uint64_t>(S.Size), S.Content);
- }
-};
-
-struct SPSSimpleNativeMemoryMapInitializeRequest;
-
-struct TestSNMMInitializeRequest {
- std::vector<TestSNMMSegment> Segments;
- std::vector<AllocActionPair> AAPs;
-};
-
-template <>
-class SPSSerializationTraits<SPSSimpleNativeMemoryMapInitializeRequest,
- TestSNMMInitializeRequest> {
- using SPSType = SPSTuple<SPSSequence<SPSSimpleNativeMemoryMapSegment>,
- SPSSequence<SPSAllocActionPair>>;
-
-public:
- static size_t size(const TestSNMMInitializeRequest &IR) {
- return SPSType::AsArgList::size(IR.Segments, IR.AAPs);
- }
- static bool serialize(SPSOutputBuffer &OB,
- const TestSNMMInitializeRequest &IR) {
- return SPSType::AsArgList::serialize(OB, IR.Segments, IR.AAPs);
- }
-};
-
-} // namespace orc_rt
-
-template <typename T> move_only_function<void(T)> waitFor(std::future<T> &F) {
- std::promise<T> P;
- F = P.get_future();
- return [P = std::move(P)](T Val) mutable { P.set_value(std::move(Val)); };
-}
-
-TEST(SimpleNativeMemoryMapTest, CreateAndDestroy) {
- // Test that we can create and destroy a SimpleNativeMemoryMap instance as
- // expected.
- auto SNMM = std::make_unique<SimpleNativeMemoryMap>();
-}
-
-template <typename OnCompleteFn>
-static void snmm_reserve(OnCompleteFn &&OnComplete,
- SimpleNativeMemoryMap *Instance, size_t Size) {
- using SPSSig = SPSExpected<SPSExecutorAddr>(SPSExecutorAddr, SPSSize);
- SPSWrapperFunction<SPSSig>::call(
- DirectCaller(nullptr, orc_rt_SimpleNativeMemoryMap_reserve_sps_wrapper),
- std::forward<OnCompleteFn>(OnComplete), Instance, Size);
-}
-
-template <typename OnCompleteFn>
-static void snmm_releaseMultiple(OnCompleteFn &&OnComplete,
- SimpleNativeMemoryMap *Instance,
- span<void *> Addr) {
- using SPSSig = SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddr>);
- SPSWrapperFunction<SPSSig>::call(
- DirectCaller(nullptr,
- orc_rt_SimpleNativeMemoryMap_releaseMultiple_sps_wrapper),
- std::forward<OnCompleteFn>(OnComplete), Instance, Addr);
-}
-
-template <typename OnCompleteFn>
-static void snmm_initialize(OnCompleteFn &&OnComplete,
- SimpleNativeMemoryMap *Instance,
- TestSNMMInitializeRequest IR) {
- using SPSSig = SPSExpected<SPSExecutorAddr>(
- SPSExecutorAddr, SPSSimpleNativeMemoryMapInitializeRequest);
- SPSWrapperFunction<SPSSig>::call(
- DirectCaller(nullptr,
- orc_rt_SimpleNativeMemoryMap_initialize_sps_wrapper),
- std::forward<OnCompleteFn>(OnComplete), Instance, std::move(IR));
-}
-
-template <typename OnCompleteFn>
-static void snmm_deinitializeMultiple(OnCompleteFn &&OnComplete,
- SimpleNativeMemoryMap *Instance,
- span<void *> Base) {
- using SPSSig = SPSError(SPSExecutorAddr, SPSSequence<SPSExecutorAddr>);
- SPSWrapperFunction<SPSSig>::call(
- DirectCaller(
- nullptr,
- orc_rt_SimpleNativeMemoryMap_deinitializeMultiple_sps_wrapper),
- std::forward<OnCompleteFn>(OnComplete), Instance, Base);
-}
-
-TEST(SimpleNativeMemoryMapTest, ReserveAndRelease) {
- // Test that we can reserve and release a slab of address space as expected,
- // without finalizing any memory within it.
- auto SNMM = std::make_unique<SimpleNativeMemoryMap>();
- std::future<Expected<Expected<void *>>> ReserveAddr;
- snmm_reserve(waitFor(ReserveAddr), SNMM.get(), 1024 * 1024 * 1024);
- auto Addr = cantFail(cantFail(ReserveAddr.get()));
-
- std::future<Expected<Error>> ReleaseResult;
- snmm_releaseMultiple(waitFor(ReleaseResult), SNMM.get(), {&Addr, 1});
- cantFail(cantFail(ReleaseResult.get()));
-}
-
// Write the given value to the address pointed to by P.
static orc_rt_WrapperFunctionBuffer
write_value_sps_allocaction(const char *ArgData, size_t ArgSize) {
@@ -167,7 +34,6 @@ write_value_sps_allocaction(const char *ArgData, size_t ArgSize) {
}
// Read the uint64_t value at Src and write it to Dst.
-// Increments int via pointer.
static orc_rt_WrapperFunctionBuffer
read_value_sps_allocaction(const char *ArgData, size_t ArgSize) {
return SPSAllocActionFunction<SPSExecutorAddr, SPSExecutorAddr>::handle(
@@ -179,7 +45,27 @@ read_value_sps_allocaction(const char *ArgData, size_t ArgSize) {
.release();
}
-TEST(SimpleNativeMemoryMap, FullPipelineForOneRWSegment) {
+TEST(SimpleNativeMemoryMapTest, CreateAndDestroy) {
+ // Test that we can create and destroy a SimpleNativeMemoryMap instance as
+ // expected.
+ auto SNMM = std::make_unique<SimpleNativeMemoryMap>();
+}
+
+TEST(SimpleNativeMemoryMapTest, ReserveAndRelease) {
+ // Test that we can reserve and release a slab of address space as expected,
+ // without finalizing any memory within it.
+ SimpleNativeMemoryMap SNMM;
+
+ std::future<Expected<void *>> ReserveResult;
+ SNMM.reserve(waitFor(ReserveResult), 1024 * 1024 * 1024);
+ void *Addr = cantFail(ReserveResult.get());
+
+ std::future<Error> ReleaseResult;
+ SNMM.releaseMultiple(waitFor(ReleaseResult), {Addr});
+ cantFail(ReleaseResult.get());
+}
+
+TEST(SimpleNativeMemoryMapTest, FullPipelineForOneRWSegment) {
// Test that we can:
// 1. reserve some address space.
// 2. initialize a range within it as read/write, and that finalize actions
@@ -188,13 +74,12 @@ TEST(SimpleNativeMemoryMap, FullPipelineForOneRWSegment) {
// expected.
// 4. release the address range.
- auto SNMM = std::make_unique<SimpleNativeMemoryMap>();
- std::future<Expected<Expected<void *>>> ReserveAddr;
- snmm_reserve(waitFor(ReserveAddr), SNMM.get(), 1024 * 1024 * 1024);
- void *Addr = cantFail(cantFail(ReserveAddr.get()));
+ SimpleNativeMemoryMap SNMM;
+
+ std::future<Expected<void *>> ReserveResult;
+ SNMM.reserve(waitFor(ReserveResult), 1024 * 1024 * 1024);
+ void *Addr = cantFail(ReserveResult.get());
- std::future<Expected<Expected<void *>>> InitializeKey;
- TestSNMMInitializeRequest IR;
char *InitializeBase = // Initialize addr at non-zero (64kb) offset from base.
reinterpret_cast<char *>(Addr) + 64 * 1024;
uint64_t SentinelValue1 = 0; // Read from pre-filled content
@@ -208,8 +93,11 @@ TEST(SimpleNativeMemoryMap, FullPipelineForOneRWSegment) {
memcpy(Content.data(), &SentinelValue3, sizeof(uint64_t));
memcpy(Content.data() + sizeof(uint64_t), &SentinelValue1, sizeof(uint64_t));
- IR.Segments.push_back({MemProt::Read | MemProt::Write, InitializeBase,
- 64 * 1024, std::move(Content)});
+ SimpleNativeMemoryMap::InitializeRequest IR;
+ IR.Segments.push_back({MemProt::Read | MemProt::Write,
+ InitializeBase,
+ 64 * 1024,
+ {Content.data(), Content.size()}});
// Read initial content into Sentinel 1.
IR.AAPs.push_back({
@@ -237,44 +125,44 @@ TEST(SimpleNativeMemoryMap, FullPipelineForOneRWSegment) {
{} // No dealloc action.
});
- snmm_initialize(waitFor(InitializeKey), SNMM.get(), std::move(IR));
- void *InitializeKeyAddr = cantFail(cantFail(InitializeKey.get()));
+ std::future<Expected<void *>> InitializeResult;
+ SNMM.initialize(waitFor(InitializeResult), std::move(IR));
+ void *InitializeKeyAddr = cantFail(InitializeResult.get());
EXPECT_EQ(SentinelValue1, 42U);
EXPECT_EQ(SentinelValue2, 0U);
EXPECT_EQ(SentinelValue3, 0U);
- std::future<Expected<Error>> DeallocResult;
- snmm_deinitializeMultiple(waitFor(DeallocResult), SNMM.get(),
- {&InitializeKeyAddr, 1});
- cantFail(cantFail(DeallocResult.get()));
+ std::future<Error> DeallocResult;
+ SNMM.deinitializeMultiple(waitFor(DeallocResult), {InitializeKeyAddr});
+ cantFail(DeallocResult.get());
EXPECT_EQ(SentinelValue1, 42U);
EXPECT_EQ(SentinelValue2, 42U);
EXPECT_EQ(SentinelValue3, 0U);
- std::future<Expected<Error>> ReleaseResult;
- snmm_releaseMultiple(waitFor(ReleaseResult), SNMM.get(), {&Addr, 1});
- cantFail(cantFail(ReleaseResult.get()));
+ std::future<Error> ReleaseResult;
+ SNMM.releaseMultiple(waitFor(ReleaseResult), {Addr});
+ cantFail(ReleaseResult.get());
}
-TEST(SimpleNativeMemoryMap, ReserveInitializeShutdown) {
+TEST(SimpleNativeMemoryMapTest, ReserveInitializeShutdown) {
// Test that memory is deinitialized in the case where we reserve and
// initialize some memory, then just shut down the memory manager.
- auto SNMM = std::make_unique<SimpleNativeMemoryMap>();
- std::future<Expected<Expected<void *>>> ReserveAddr;
- snmm_reserve(waitFor(ReserveAddr), SNMM.get(), 1024 * 1024 * 1024);
- void *Addr = cantFail(cantFail(ReserveAddr.get()));
+ SimpleNativeMemoryMap SNMM;
+
+ std::future<Expected<void *>> ReserveResult;
+ SNMM.reserve(waitFor(ReserveResult), 1024 * 1024 * 1024);
+ void *Addr = cantFail(ReserveResult.get());
- std::future<Expected<Expected<void *>>> InitializeKey;
- TestSNMMInitializeRequest IR;
char *InitializeBase = // Initialize addr at non-zero (64kb) offset from base.
reinterpret_cast<char *>(Addr) + 64 * 1024;
uint64_t SentinelValue = 0;
+ SimpleNativeMemoryMap::InitializeRequest IR;
IR.Segments.push_back(
- {MemProt::Read | MemProt::Write, InitializeBase, 64 * 1024});
+ {MemProt::Read | MemProt::Write, InitializeBase, 64 * 1024, {}});
IR.AAPs.push_back(
{*MakeAllocAction<SPSExecutorAddr, uint64_t>::from(
@@ -283,35 +171,37 @@ TEST(SimpleNativeMemoryMap, ReserveInitializeShutdown) {
*MakeAllocAction<SPSExecutorAddr, SPSExecutorAddr>::from(
read_value_sps_allocaction, ExecutorAddr::fromPtr(&SentinelValue),
ExecutorAddr::fromPtr(InitializeBase))});
- snmm_initialize(waitFor(InitializeKey), SNMM.get(), std::move(IR));
- cantFail(cantFail(InitializeKey.get()));
+
+ std::future<Expected<void *>> InitializeResult;
+ SNMM.initialize(waitFor(InitializeResult), std::move(IR));
+ cantFail(InitializeResult.get());
EXPECT_EQ(SentinelValue, 0U);
- std::future<Error> ShutdownResult;
- SNMM->onShutdown(waitFor(ShutdownResult));
- cantFail(ShutdownResult.get());
+ std::future<void> ShutdownResult;
+ SNMM.onShutdown(waitFor(ShutdownResult));
+ ShutdownResult.get();
EXPECT_EQ(SentinelValue, 42);
}
-TEST(SimpleNativeMemoryMap, ReserveInitializeDetachShutdown) {
+TEST(SimpleNativeMemoryMapTest, ReserveInitializeDetachShutdown) {
// Test that memory is deinitialized in the case where we reserve and
// initialize some memory, then just shut down the memory manager.
- auto SNMM = std::make_unique<SimpleNativeMemoryMap>();
- std::future<Expected<Expected<void *>>> ReserveAddr;
- snmm_reserve(waitFor(ReserveAddr), SNMM.get(), 1024 * 1024 * 1024);
- void *Addr = cantFail(cantFail(ReserveAddr.get()));
+ SimpleNativeMemoryMap SNMM;
+
+ std::future<Expected<void *>> ReserveResult;
+ SNMM.reserve(waitFor(ReserveResult), 1024 * 1024 * 1024);
+ void *Addr = cantFail(ReserveResult.get());
- std::future<Expected<Expected<void *>>> InitializeKey;
- TestSNMMInitializeRequest IR;
char *InitializeBase = // Initialize addr at non-zero (64kb) offset from base.
reinterpret_cast<char *>(Addr) + 64 * 1024;
uint64_t SentinelValue = 0;
+ SimpleNativeMemoryMap::InitializeRequest IR;
IR.Segments.push_back(
- {MemProt::Read | MemProt::Write, InitializeBase, 64 * 1024});
+ {MemProt::Read | MemProt::Write, InitializeBase, 64 * 1024, {}});
IR.AAPs.push_back(
{*MakeAllocAction<SPSExecutorAddr, uint64_t>::from(
@@ -320,20 +210,22 @@ TEST(SimpleNativeMemoryMap, ReserveInitializeDetachShutdown) {
*MakeAllocAction<SPSExecutorAddr, SPSExecutorAddr>::from(
read_value_sps_allocaction, ExecutorAddr::fromPtr(&SentinelValue),
ExecutorAddr::fromPtr(InitializeBase))});
- snmm_initialize(waitFor(InitializeKey), SNMM.get(), std::move(IR));
- cantFail(cantFail(InitializeKey.get()));
+
+ std::future<Expected<void *>> InitializeResult;
+ SNMM.initialize(waitFor(InitializeResult), std::move(IR));
+ cantFail(InitializeResult.get());
EXPECT_EQ(SentinelValue, 0U);
- std::future<Error> DetachResult;
- SNMM->onDetach(waitFor(DetachResult));
- cantFail(DetachResult.get());
+ std::future<void> DetachResult;
+ SNMM.onDetach(waitFor(DetachResult));
+ DetachResult.get();
EXPECT_EQ(SentinelValue, 0);
- std::future<Error> ShutdownResult;
- SNMM->onShutdown(waitFor(ShutdownResult));
- cantFail(ShutdownResult.get());
+ std::future<void> ShutdownResult;
+ SNMM.onShutdown(waitFor(ShutdownResult));
+ ShutdownResult.get();
EXPECT_EQ(SentinelValue, 42);
}
diff --git a/orc-rt/unittests/iterator_range-test.cpp b/orc-rt/unittests/iterator_range-test.cpp
new file mode 100644
index 0000000000000..00262095396b7
--- /dev/null
+++ b/orc-rt/unittests/iterator_range-test.cpp
@@ -0,0 +1,79 @@
+//===- iterator_range-test.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Tests for orc-rt's iterator_range.h APIs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "orc-rt/iterator_range.h"
+#include "gtest/gtest.h"
+
+#include <type_traits>
+#include <vector>
+
+using namespace orc_rt;
+
+TEST(IteratorRangeTest, EmptyArray) {
+ int A[1]; // zero-length arrays aren't allowed.
+ iterator_range<int *> R(std::begin(A), std::begin(A));
+
+ EXPECT_TRUE(R.empty());
+ EXPECT_EQ(R.begin(), R.end());
+}
+
+TEST(IteratorRangeTest, NonEmptyArray) {
+ int A[] = {10, 11, 12, 13, 14, 15};
+
+ size_t Index = 0;
+ for (auto &E : iterator_range(A))
+ EXPECT_EQ(E, A[Index++]);
+}
+
+TEST(IteratorRangeTest, EmptyVector) {
+ std::vector<int> V;
+ auto R = iterator_range(V);
+ EXPECT_TRUE(R.empty());
+ EXPECT_EQ(R.begin(), R.end());
+}
+
+TEST(IteratorRangeTest, NonEmptyVector) {
+ std::vector<int> V({{10, 12, 14, 16, 18, 20}});
+
+ size_t Index = 0;
+ for (auto &E : iterator_range(V))
+ EXPECT_EQ(E, V[Index++]);
+}
+
+TEST(IteratorRangeTest, Subrange) {
+ std::vector<int> V = {1, 2, 3, 4, 5};
+ iterator_range R(V.begin() + 1, V.begin() + 4);
+
+ EXPECT_FALSE(R.empty());
+ std::vector<int> Result(R.begin(), R.end());
+ EXPECT_EQ(Result, (std::vector<int>{2, 3, 4}));
+}
+
+TEST(IteratorRangeTest, MutateThroughRange) {
+ std::vector<int> V = {1, 2, 3};
+ for (auto &E : iterator_range(V))
+ E *= 2;
+ EXPECT_EQ(V, (std::vector<int>{2, 4, 6}));
+}
+
+TEST(IteratorRangeTest, NonEmptyIsNotEmpty) {
+ std::vector<int> V = {1};
+ auto R = iterator_range(V);
+ EXPECT_FALSE(R.empty());
+}
+
+TEST(IteratorRangeTest, ConstContainer) {
+ const std::vector<int> V = {1, 2, 3};
+ for (auto &E : iterator_range(V))
+ static_assert(std::is_const_v<std::remove_reference_t<decltype(E)>>,
+ "elements from const container should be const");
+}
diff --git a/polly/lib/Analysis/ScopBuilder.cpp b/polly/lib/Analysis/ScopBuilder.cpp
index 1ea9a554f761a..f4d654fa74cb4 100644
--- a/polly/lib/Analysis/ScopBuilder.cpp
+++ b/polly/lib/Analysis/ScopBuilder.cpp
@@ -637,7 +637,7 @@ void ScopBuilder::propagateDomainConstraintsToRegionExit(
auto *RI = scop->getRegion().getRegionInfo();
auto *BBReg = RI ? RI->getRegionFor(BB) : nullptr;
auto *ExitBB = BBReg ? BBReg->getExit() : nullptr;
- if (!BBReg || BBReg->getEntry() != BB || !scop->contains(ExitBB))
+ if (!BBReg || BBReg->getEntry() != BB || !ExitBB || !scop->contains(ExitBB))
return;
// Do not propagate the domain if there is a loop backedge inside the region
diff --git a/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp b/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp
index 82b255d2e43af..6c8b5d569665e 100644
--- a/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp
+++ b/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp
@@ -107,8 +107,11 @@ ParallelLoopGeneratorGOMP::createSubFn(Value *Stride, AllocaInst *StructData,
// Create basic blocks.
BasicBlock *HeaderBB = BasicBlock::Create(Context, "polly.par.setup", SubFn);
+ // Add terminator so that DT computation doesn't fail.
+ auto *UI = new UnreachableInst(Context, HeaderBB);
SubFnDT = std::make_unique<DominatorTree>(*SubFn);
SubFnLI = std::make_unique<LoopInfo>(*SubFnDT);
+ UI->eraseFromParent();
BasicBlock *ExitBB = BasicBlock::Create(Context, "polly.par.exit", SubFn);
BasicBlock *CheckNextBB =
diff --git a/polly/lib/CodeGen/LoopGeneratorsKMP.cpp b/polly/lib/CodeGen/LoopGeneratorsKMP.cpp
index dfeea989f6c0b..2a8d5ea419cd5 100644
--- a/polly/lib/CodeGen/LoopGeneratorsKMP.cpp
+++ b/polly/lib/CodeGen/LoopGeneratorsKMP.cpp
@@ -131,8 +131,11 @@ ParallelLoopGeneratorKMP::createSubFn(Value *SequentialLoopStride,
// Create basic blocks.
BasicBlock *HeaderBB = BasicBlock::Create(Context, "polly.par.setup", SubFn);
+ // Add terminator so that DT computation doesn't fail.
+ auto *UI = new UnreachableInst(Context, HeaderBB);
SubFnDT = std::make_unique<DominatorTree>(*SubFn);
SubFnLI = std::make_unique<LoopInfo>(*SubFnDT);
+ UI->eraseFromParent();
BasicBlock *ExitBB = BasicBlock::Create(Context, "polly.par.exit", SubFn);
BasicBlock *CheckNextBB =
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index 8a81f661d420b..e18f5b68b9921 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -1615,6 +1615,8 @@ cc_library(
":lex",
":options",
":parse",
+ ":scalable_static_analysis_core",
+ ":scalable_static_analysis_frontend",
":serialization",
":static_analyzer_checkers_gen",
"//llvm:BinaryFormat",
@@ -2084,6 +2086,8 @@ cc_library(
":frontend",
":frontend_rewrite",
":options",
+ ":scalable_static_analysis_core",
+ ":scalable_static_analysis_frontend",
":static_analyzer_frontend",
"//llvm:Option",
"//llvm:Support",
@@ -2586,3 +2590,60 @@ cc_library(
"//llvm:TargetParser",
],
)
+
+cc_library(
+ name = "scalable_static_analysis_core",
+ srcs = glob([
+ "lib/ScalableStaticAnalysisFramework/Core/**/*.cpp",
+ "lib/ScalableStaticAnalysisFramework/Core/**/*.h",
+ ]),
+ hdrs = glob(["include/clang/ScalableStaticAnalysisFramework/Core/**/*.h"] + [
+ "include/clang/ScalableStaticAnalysisFramework/SSAFBuiltinForceLinker.h",
+ "include/clang/ScalableStaticAnalysisFramework/SSAFForceLinker.h",
+ ]),
+ textual_hdrs = glob(["include/clang/ScalableStaticAnalysisFramework/Core/**/*.def"]),
+ deps = [
+ ":ast",
+ ":support",
+ ":unified_symbol_resolution",
+ "//llvm:Support",
+ ],
+)
+
+cc_library(
+ name = "scalable_static_analysis_frontend",
+ srcs = glob([
+ "lib/ScalableStaticAnalysisFramework/Frontend/**/*.cpp",
+ ]),
+ hdrs = glob(["include/clang/ScalableStaticAnalysisFramework/Frontend/**/*.h"]),
+ deps = [
+ ":ast",
+ ":basic",
+ ":frontend",
+ ":scalable_static_analysis_core",
+ ":sema",
+ "//llvm:Support",
+ ],
+)
+
+cc_binary(
+ name = "clang-ssaf-format",
+ srcs = ["tools/clang-ssaf-format/SSAFFormat.cpp"],
+ deps = [
+ ":basic",
+ ":scalable_static_analysis_core",
+ "//llvm:Option",
+ "//llvm:Support",
+ ],
+)
+
+cc_binary(
+ name = "clang-ssaf-linker",
+ srcs = ["tools/clang-ssaf-linker/SSAFLinker.cpp"],
+ deps = [
+ ":basic",
+ ":scalable_static_analysis_core",
+ "//llvm:Option",
+ "//llvm:Support",
+ ],
+)
diff --git a/utils/bazel/llvm-project-overlay/clang/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/unittests/BUILD.bazel
index 4504771157424..8a52a28032bb6 100644
--- a/utils/bazel/llvm-project-overlay/clang/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/unittests/BUILD.bazel
@@ -779,3 +779,30 @@ cc_test(
"//third-party/unittest:gtest_main",
],
)
+
+cc_test(
+ name = "scalable_static_analysis_tests",
+ size = "small",
+ srcs = glob([
+ "ScalableStaticAnalysisFramework/**/*.cpp",
+ "ScalableStaticAnalysisFramework/**/*.h",
+ ]),
+ features = ["-layering_check"], # #include "../../lib/ScalableStaticAnalysisFramework/Core/ModelStringConversions.h"
+ includes = ["ScalableStaticAnalysisFramework"],
+ deps = [
+ "//clang:ast",
+ "//clang:ast_matchers",
+ "//clang:basic",
+ "//clang:frontend",
+ "//clang:lex",
+ "//clang:scalable_static_analysis_core",
+ "//clang:scalable_static_analysis_frontend",
+ "//clang:support",
+ "//clang:tooling",
+ "//llvm:Support",
+ "//llvm:TestingSupport",
+ "//third-party/unittest:gmock",
+ "//third-party/unittest:gtest",
+ "//third-party/unittest:gtest_main",
+ ],
+)
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 316fda92c04a6..381c57b18c8e2 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -3107,6 +3107,23 @@ libc_support_library(
],
)
+libc_support_library(
+ name = "__support_math_atanpif16",
+ hdrs = ["src/__support/math/atanpif16.h"],
+ deps = [
+ ":__support_fputil_cast",
+ ":__support_fputil_fenv_impl",
+ ":__support_fputil_fp_bits",
+ ":__support_fputil_multiply_add",
+ ":__support_fputil_polyeval",
+ ":__support_fputil_sqrt",
+ ":__support_macros_config",
+ ":__support_macros_optimization",
+ ":hdr_fenv_macros",
+ ":llvm_libc_macros_float16_macros",
+ ],
+)
+
libc_support_library(
name = "__support_math_bf16add",
hdrs = ["src/__support/math/bf16add.h"],
@@ -3178,6 +3195,16 @@ libc_support_library(
],
)
+libc_support_library(
+ name = "__support_math_bf16fma",
+ hdrs = ["src/__support/math/bf16fma.h"],
+ deps = [
+ ":__support_fputil_bfloat16",
+ ":__support_fputil_fma",
+ ":__support_macros_config",
+ ],
+)
+
libc_support_library(
name = "__support_math_bf16fmaf",
hdrs = ["src/__support/math/bf16fmaf.h"],
@@ -5352,6 +5379,21 @@ libc_support_library(
],
)
+libc_support_library(
+ name = "__support_math_log_bf16",
+ hdrs = ["src/__support/math/log_bf16.h"],
+ deps = [
+ ":__support_common",
+ ":__support_fputil_bfloat16",
+ ":__support_fputil_cast",
+ ":__support_fputil_fp_bits",
+ ":__support_fputil_multiply_add",
+ ":__support_macros_config",
+ ":__support_macros_optimization",
+ ":__support_macros_properties_cpu_features",
+ ],
+)
+
libc_support_library(
name = "__support_range_reduction_double",
hdrs = [
@@ -6198,6 +6240,11 @@ libc_math_function(
],
)
+libc_math_function(
+ name = "atanpif16",
+ additional_deps = [":__support_math_atanpif16"],
+)
+
libc_math_function(
name = "bf16add",
additional_deps = [":__support_math_bf16add"],
@@ -6243,6 +6290,11 @@ libc_math_function(
additional_deps = [":__support_math_bf16subf128"],
)
+libc_math_function(
+ name = "bf16fma",
+ additional_deps = [":__support_math_bf16fma"],
+)
+
libc_math_function(
name = "bf16fmaf",
additional_deps = [":__support_math_bf16fmaf"],
@@ -7545,6 +7597,13 @@ libc_math_function(
],
)
+libc_math_function(
+ name = "log_bf16",
+ additional_deps = [
+ ":__support_math_log_bf16",
+ ],
+)
+
libc_math_function(name = "lrint")
libc_math_function(name = "lrintf")
diff --git a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
index 6f23c8dcb3b82..6c04c838f465f 100644
--- a/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/BUILD.bazel
@@ -139,6 +139,9 @@ expand_template(
# TODO: treesitter support
"#cmakedefine01 LLDB_ENABLE_TREESITTER": "#define LLDB_ENABLE_TREESITTER 0",
+ # Keep disabled; enabling it would cause tests to run with MTE.
+ "#cmakedefine01 LLDB_ENABLE_MTE": "#define LLDB_ENABLE_MTE 0",
+
# Defaults that could be configurable if needed
"#cmakedefine01 LLDB_ENABLE_POSIX": "#define LLDB_ENABLE_POSIX 1",
"#cmakedefine01 LLDB_ENABLE_PROTOCOL_SERVERS": "#define LLDB_ENABLE_PROTOCOL_SERVERS 1",
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 824804965a2ad..f8a6a936ce968 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -12909,6 +12909,7 @@ cc_library(
hdrs = ["include/mlir/Dialect/Arith/Utils/Utils.h"],
includes = ["include"],
deps = [
+ ":AsmParser",
":ArithDialect",
":ComplexDialect",
":DialectUtils",
diff --git a/utils/bazel/llvm_configs/config.h.cmake b/utils/bazel/llvm_configs/config.h.cmake
index ce83de8e4cba9..94fde52de11b3 100644
--- a/utils/bazel/llvm_configs/config.h.cmake
+++ b/utils/bazel/llvm_configs/config.h.cmake
@@ -242,6 +242,9 @@
/* Define if ICU library is available */
#cmakedefine01 HAVE_ICU
+/* Define if Windows vendored ICU is available */
+#cmakedefine01 HAVE_WINDOWS_ICU
+
/* Define if iconv library is available */
#cmakedefine01 HAVE_ICONV