From 7d8fd4f5db0dd52cf9802889690aab876ad6646b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Mon, 13 Jun 2022 11:47:14 +0100
Subject: [PATCH] [DAG] visitINSERT_VECTOR_ELT - attempt to reconstruct
 BUILD_VECTOR before other folds interfere

Another issue unearthed by D127115.

We take a long time to canonicalize an insert_vector_elt chain before
being able to convert it into a build_vector - even if they are already
in ascending insertion order, we fold the nodes one at a time into the
build_vector 'seed', leaving plenty of time for other folds to alter it
(in particular, recognising when the inserted values come from
extract_vector_elt, resulting in a shuffle_vector that is much harder to
fold with).

D127115 makes this particularly difficult as we're almost guaranteed to
have lost the sequence before all possible insertions have been folded.

This patch proposes to begin at the last insertion and attempt to collect
all the (oneuse) insertions right away, creating the build_vector before
it's too late.

Differential Revision: https://reviews.llvm.org/D127595
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp  | 35 ++++++++++++++
 llvm/test/CodeGen/AArch64/arm64-neon-copy.ll   |  6 +--
 .../sve-fixed-length-insert-vector-elt.ll      |  3 +-
 .../AArch64/vecreduce-propagate-sd-flags.ll    |  2 +-
 llvm/test/CodeGen/ARM/neon-copy.ll             |  5 +-
 .../RISCV/rvv/fixed-vectors-insert-i1.ll       |  8 +---
 .../RISCV/rvv/fixed-vectors-masked-gather.ll   | 25 ++++------
 .../test/CodeGen/X86/avx512-insert-extract.ll  |  5 +-
 .../test/CodeGen/X86/masked_gather_scatter.ll  | 48 ++++++++++---------
 .../CodeGen/X86/shuffle-extract-subvector.ll   | 27 +++--------
 .../CodeGen/X86/sse-insertelt-from-mem.ll      | 23 +++------
 llvm/test/CodeGen/X86/sse-insertelt.ll         | 23 ++------
 llvm/test/CodeGen/X86/vec_insert-7.ll          |  3 +-
 13 files changed, 100 insertions(+), 113 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ac5cf4a22a46..14b10a5448c7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -19426,6 +19426,41 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
       Ops.append(NumElts, DAG.getUNDEF(InVal.getValueType()));
       return UpdateBuildVector(Ops);
     }
+
+    // If we're inserting into the end of a vector as part of a sequence, see
+    // if we can create a BUILD_VECTOR by following the sequence back up the
+    // chain.
+    if (Elt == (NumElts - 1)) {
+      SmallVector<SDValue> ReverseInsertions;
+      ReverseInsertions.push_back(InVal);
+
+      EVT MaxEltVT = InVal.getValueType();
+      SDValue CurVec = InVec;
+      for (unsigned I = 1; I != NumElts; ++I) {
+        if (CurVec.getOpcode() != ISD::INSERT_VECTOR_ELT || !CurVec.hasOneUse())
+          break;
+
+        auto *CurIdx = dyn_cast<ConstantSDNode>(CurVec.getOperand(2));
+        if (!CurIdx || CurIdx->getAPIntValue() != ((NumElts - 1) - I))
+          break;
+        SDValue CurVal = CurVec.getOperand(1);
+        ReverseInsertions.push_back(CurVal);
+        if (VT.isInteger()) {
+          EVT CurValVT = CurVal.getValueType();
+          MaxEltVT = MaxEltVT.bitsGE(CurValVT) ? MaxEltVT : CurValVT;
+        }
+        CurVec = CurVec.getOperand(0);
+      }
+
+      if (ReverseInsertions.size() == NumElts) {
+        for (unsigned I = 0; I != NumElts; ++I) {
+          SDValue Val = ReverseInsertions[(NumElts - 1) - I];
+          Val = VT.isInteger() ? DAG.getAnyExtOrTrunc(Val, DL, MaxEltVT) : Val;
+          Ops.push_back(Val);
+        }
+        return DAG.getBuildVector(VT, DL, Ops);
+      }
+    }
   }
 
   return SDValue();
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index ab398850fb92..9090fc1979fc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -258,9 +258,7 @@ define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) {
 define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) {
 ; CHECK-LABEL: ins2d1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-NEXT: mov v1.d[0], v0.d[0]
-; CHECK-NEXT: fmov d0, d1
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: ret
   %tmp3 = extractelement <2 x i64> %tmp1, i32 0
   %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
@@ -282,7 +280,7 @@ define <2 x float> @ins4f2(<4 x float> %tmp1, <2 x float> %tmp2) {
 define <1 x double> @ins2f1(<2 x double> %tmp1, <1 x double> %tmp2) {
 ; CHECK-LABEL: ins2f1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: dup v0.2d, v0.d[1]
+; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: ret
   %tmp3 = extractelement <2 x double> %tmp1, i32 1
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
index 71529994be14..e67f92890c8a 100644
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-insert-vector-elt.ll
@@ -228,7 +228,8 @@ define <64 x float> @insertelement_v64f32(<64 x float>* %a) #0 {
 define <1 x double> @insertelement_v1f64(<1 x double> %op1) #0 {
 ; VBITS_GE_256-LABEL: insertelement_v1f64:
 ; VBITS_GE_256: // %bb.0:
-; VBITS_GE_256-NEXT: fmov d0, #5.00000000
+; VBITS_GE_256-NEXT: mov x8, #4617315517961601024
+; VBITS_GE_256-NEXT: fmov d0, x8
 ; VBITS_GE_256-NEXT: ret
   %r = insertelement <1 x double> %op1, double 5.0, i64 0
   ret <1 x double> %r
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll b/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll
index 94b42f1215b9..823fdfb48097 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll
@@ -13,7 +13,7 @@
 ; CHECK: Legally typed node: [[VTWOA]]: v2f64 = BUILD_VECTOR
 ; CHECK: Legalizing node: [[VTWOB:t.*]]: v2f64 = BUILD_VECTOR
 ; CHECK: Legally typed node: [[VTWOB]]: v2f64 = BUILD_VECTOR
-; CHECK: Legalizing node: t34: v2f64 = fmaxnum nnan reassoc [[VTWOB]], [[VTWOA]]
+; CHECK: Legalizing node: t30: v2f64 = fmaxnum nnan reassoc [[VTWOB]], [[VTWOA]]
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
diff --git a/llvm/test/CodeGen/ARM/neon-copy.ll b/llvm/test/CodeGen/ARM/neon-copy.ll
index cbaf8c8f6bb6..31c5d65e8ae1 100644
--- a/llvm/test/CodeGen/ARM/neon-copy.ll
+++ b/llvm/test/CodeGen/ARM/neon-copy.ll
@@ -257,10 +257,7 @@ define <2 x i32> @ins4s2(<4 x i32> %tmp1, <2 x i32> %tmp2) {
 define <1 x i64> @ins2d1(<2 x i64> %tmp1, <1 x i64> %tmp2) {
 ; CHECK-LABEL: ins2d1:
 ; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov r0, r1, d0
-; CHECK-NEXT: vmov.32 d2[0], r0
-; CHECK-NEXT: vmov.32 d2[1], r1
-; CHECK-NEXT: vorr d0, d2, d2
+; CHECK-NEXT: @ kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT: bx lr
   %tmp3 = extractelement <2 x i64> %tmp1, i32 0
   %tmp4 = insertelement <1 x i64> %tmp2, i64 %tmp3, i32 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
index 4bc46c60a87f..51de3ab7f57d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-i1.ll
@@ -7,13 +7,9 @@
 define <1 x i1> @insertelt_v1i1(<1 x i1> %x, i1 %elt) nounwind {
 ; CHECK-LABEL: insertelt_v1i1:
 ; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 1
 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, mu
-; CHECK-NEXT: vmv.v.i v8, 0
-; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, mu
-; CHECK-NEXT: vmv.s.x v8, a0
-; CHECK-NEXT: vsetvli zero, zero, e8, mf8, ta, mu
-; CHECK-NEXT: vand.vi v8, v8, 1
+; CHECK-NEXT: vmv.v.x v8, a0
 ; CHECK-NEXT: vmsne.vi v0, v8, 0
 ; CHECK-NEXT: ret
   %y = insertelement <1 x i1> %x, i1 %elt, i64 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 65ea01fd9ebb..8b73efd7151e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -41,9 +41,8 @@ define <1 x i8> @mgather_v1i8(<1 x i8*> %ptrs, <1 x i1> %m, <1 x i8> %passthru)
 ; RV64ZVE32F-NEXT: andi a1, a1, 1
 ; RV64ZVE32F-NEXT: beqz a1, .LBB0_2
 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
-; RV64ZVE32F-NEXT: lb a0, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, tu, mu
-; RV64ZVE32F-NEXT: vmv.s.x v8, a0
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, mu
+; RV64ZVE32F-NEXT: vlse8.v v8, (a0), zero
 ; RV64ZVE32F-NEXT: .LBB0_2: # %else
 ; RV64ZVE32F-NEXT: ret
   %v = call <1 x i8> @llvm.masked.gather.v1i8.v1p0i8(<1 x i8*> %ptrs, i32 1, <1 x i1> %m, <1 x i8> %passthru)
@@ -1012,9 +1011,8 @@ define <1 x i16> @mgather_v1i16(<1 x i16*> %ptrs, <1 x i1> %m, <1 x i16> %passth
 ; RV64ZVE32F-NEXT: andi a1, a1, 1
 ; RV64ZVE32F-NEXT: beqz a1, .LBB13_2
 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
-; RV64ZVE32F-NEXT: lh a0, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, tu, mu
-; RV64ZVE32F-NEXT: vmv.s.x v8, a0
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
+; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero
 ; RV64ZVE32F-NEXT: .LBB13_2: # %else
 ; RV64ZVE32F-NEXT: ret
   %v = call <1 x i16> @llvm.masked.gather.v1i16.v1p0i16(<1 x i16*> %ptrs, i32 2, <1 x i1> %m, <1 x i16> %passthru)
@@ -2325,9 +2323,8 @@ define <1 x i32> @mgather_v1i32(<1 x i32*> %ptrs, <1 x i1> %m, <1 x i32> %passth
 ; RV64ZVE32F-NEXT: andi a1, a1, 1
 ; RV64ZVE32F-NEXT: beqz a1, .LBB27_2
 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
-; RV64ZVE32F-NEXT: lw a0, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, tu, mu
-; RV64ZVE32F-NEXT: vmv.s.x v8, a0
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV64ZVE32F-NEXT: vlse32.v v8, (a0), zero
 ; RV64ZVE32F-NEXT: .LBB27_2: # %else
 ; RV64ZVE32F-NEXT: ret
   %v = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> %ptrs, i32 4, <1 x i1> %m, <1 x i32> %passthru)
@@ -7574,9 +7571,8 @@ define <1 x half> @mgather_v1f16(<1 x half*> %ptrs, <1 x i1> %m, <1 x half> %pas
 ; RV64ZVE32F-NEXT: andi a1, a1, 1
 ; RV64ZVE32F-NEXT: beqz a1, .LBB58_2
 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
-; RV64ZVE32F-NEXT: flh ft0, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, tu, mu
-; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, mu
+; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero
 ; RV64ZVE32F-NEXT: .LBB58_2: # %else
 ; RV64ZVE32F-NEXT: ret
   %v = call <1 x half> @llvm.masked.gather.v1f16.v1p0f16(<1 x half*> %ptrs, i32 2, <1 x i1> %m, <1 x half> %passthru)
@@ -8594,9 +8590,8 @@ define <1 x float> @mgather_v1f32(<1 x float*> %ptrs, <1 x i1> %m, <1 x float> %
 ; RV64ZVE32F-NEXT: andi a1, a1, 1
 ; RV64ZVE32F-NEXT: beqz a1, .LBB68_2
 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load
-; RV64ZVE32F-NEXT: flw ft0, 0(a0)
-; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, tu, mu
-; RV64ZVE32F-NEXT: vfmv.s.f v8, ft0
+; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
+; RV64ZVE32F-NEXT: vlse32.v v8, (a0), zero
 ; RV64ZVE32F-NEXT: .LBB68_2: # %else
 ; RV64ZVE32F-NEXT: ret
   %v = call <1 x float> @llvm.masked.gather.v1f32.v1p0f32(<1 x float*> %ptrs, i32 4, <1 x i1> %m, <1 x float> %passthru)
diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
index d131118f63f9..d307be240a29 100644
--- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll
@@ -593,8 +593,9 @@ define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
 define <2 x i64> @insert_v2i64(<2 x i64> %x, i64 %y , i64* %ptr) {
 ; CHECK-LABEL: insert_v2i64:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
-; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm0
+; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: vmovq %rdi, %xmm1
+; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; CHECK-NEXT: retq
   %val = load i64, i64* %ptr
   %r1 = insertelement <2 x i64> %x, i64 %val, i32 1
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 387513db012a..af3b18aa92a4 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -2284,38 +2284,40 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
 ; KNL_64: # %bb.0:
 ; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0
 ; KNL_64-NEXT: vpsllq $3, %xmm0, %xmm0
-; KNL_64-NEXT: vmovq %rdi, %xmm2
-; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2
-; KNL_64-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; KNL_64-NEXT: vmovq %rdi, %xmm1
+; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
+; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
 ; KNL_64-NEXT: vmovq %xmm0, %rax
-; KNL_64-NEXT: vpinsrq $0, (%rax), %xmm1, %xmm1
-; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
-; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm1, %xmm0
+; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
+; KNL_64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; KNL_64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; KNL_64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; KNL_64-NEXT: retq
 ;
 ; KNL_32-LABEL: test26:
 ; KNL_32: # %bb.0:
 ; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0
-; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2
-; KNL_32-NEXT: vpaddd %xmm0, %xmm2, %xmm0
+; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
+; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; KNL_32-NEXT: vmovd %xmm0, %eax
-; KNL_32-NEXT: vpinsrd $0, (%eax), %xmm1, %xmm1
-; KNL_32-NEXT: vpinsrd $1, 4(%eax), %xmm1, %xmm1
-; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
-; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm0
-; KNL_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
+; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
+; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL_32-NEXT: vpinsrd $1, 4(%eax), %xmm0, %xmm0
+; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm0, %xmm0
+; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm0, %xmm0
 ; KNL_32-NEXT: retl
 ;
 ; SKX-LABEL: test26:
 ; SKX: # %bb.0:
 ; SKX-NEXT: vpmovsxdq %xmm0, %xmm0
-; SKX-NEXT: vpbroadcastq %rdi, %xmm2
+; SKX-NEXT: vpbroadcastq %rdi, %xmm1
 ; SKX-NEXT: vpsllq $3, %xmm0, %xmm0
-; SKX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
+; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
 ; SKX-NEXT: vmovq %xmm0, %rax
-; SKX-NEXT: vpinsrq $0, (%rax), %xmm1, %xmm1
-; SKX-NEXT: vpextrq $1, %xmm0, %rax
-; SKX-NEXT: vpinsrq $1, (%rax), %xmm1, %xmm0
+; SKX-NEXT: vpextrq $1, %xmm0, %rcx
+; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; SKX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; SKX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; SKX-NEXT: retq
 ;
 ; SKX_32-LABEL: test26:
@@ -2323,11 +2325,11 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
 ; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0
 ; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
 ; SKX_32-NEXT: vmovd %xmm0, %eax
-; SKX_32-NEXT: vpinsrd $0, (%eax), %xmm1, %xmm1
-; SKX_32-NEXT: vpinsrd $1, 4(%eax), %xmm1, %xmm1
-; SKX_32-NEXT: vpextrd $1, %xmm0, %eax
-; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm1, %xmm0
-; SKX_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
+; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx
+; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX_32-NEXT: vpinsrd $1, 4(%eax), %xmm0, %xmm0
+; SKX_32-NEXT: vpinsrd $2, (%ecx), %xmm0, %xmm0
+; SKX_32-NEXT: vpinsrd $3, 4(%ecx), %xmm0, %xmm0
 ; SKX_32-NEXT: retl
   %sext_ind = sext <2 x i32> %ind to <2 x i64>
   %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
diff --git a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll
index 6ddb225108fc..6921bc142f1d 100644
--- a/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll
+++ b/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll
@@ -4,28 +4,13 @@
 define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) {
 ; CHECK-LABEL: f:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movzwl (%rdi), %eax
-; CHECK-NEXT: movzwl 2(%rdi), %ecx
-; CHECK-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
-; CHECK-NEXT: movzwl 6(%rdi), %r8d
-; CHECK-NEXT: movzwl 4(%rdi), %r11d
-; CHECK-NEXT: movq (%rsi), %rsi
-; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq (%rdi), %rax
+; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0
-; CHECK-NEXT: pextrw $1, %xmm0, %r9d
-; CHECK-NEXT: movd %xmm0, %r10d
-; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi
-; CHECK-NEXT: pextrw $3, %xmm0, %eax
-; CHECK-NEXT: pextrw $2, %xmm0, %edi
-; CHECK-NEXT: movw %r11w, 8(%rdx)
-; CHECK-NEXT: movw %cx, 4(%rdx)
-; CHECK-NEXT: movw %r8w, 12(%rdx)
-; CHECK-NEXT: movw %si, (%rdx)
-; CHECK-NEXT: movw %di, 10(%rdx)
-; CHECK-NEXT: movw %ax, 14(%rdx)
-; CHECK-NEXT: movw %r10w, 2(%rdx)
-; CHECK-NEXT: movw %r9w, 6(%rdx)
+; CHECK-NEXT: movq (%rsi), %rax
+; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; CHECK-NEXT: movdqa %xmm0, (%rdx)
 ; CHECK-NEXT: retq
   %tmp4 = load <4 x half>, <4 x half>* %a
   %tmp5 = load <4 x half>, <4 x half>* %b
diff --git a/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll b/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
index 7e48f5d4afe1..ab31a2da0d98 100644
--- a/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt-from-mem.ll
@@ -379,26 +379,15 @@ define <4 x i32> @insert_i32_two_elts(<4 x i32> %x, i32* %s.addr) {
 }
 
 define <2 x i64> @insert_i64_two_elts(<2 x i64> %x, i64* %s.addr) {
-; SSE2-LABEL: insert_i64_two_elts:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq (%rdi), %rax
-; SSE2-NEXT: movq %rax, %xmm0
-; SSE2-NEXT: movq %rax, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: insert_i64_two_elts:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movq (%rdi), %rax
-; SSE41-NEXT: pinsrq $0, %rax, %xmm0
-; SSE41-NEXT: pinsrq $1, %rax, %xmm0
-; SSE41-NEXT: retq
+; SSE-LABEL: insert_i64_two_elts:
+; SSE: # %bb.0:
+; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: insert_i64_two_elts:
 ; AVX: # %bb.0:
-; AVX-NEXT: movq (%rdi), %rax
-; AVX-NEXT: vpinsrq $0, %rax, %xmm0, %xmm0
-; AVX-NEXT: vpinsrq $1, %rax, %xmm0, %xmm0
+; AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
 ; AVX-NEXT: retq
   %s = load i64, i64* %s.addr
   %i0 = insertelement <2 x i64> %x, i64 %s, i32 0
diff --git a/llvm/test/CodeGen/X86/sse-insertelt.ll b/llvm/test/CodeGen/X86/sse-insertelt.ll
index 89253c562a34..34de7e65465d 100644
--- a/llvm/test/CodeGen/X86/sse-insertelt.ll
+++ b/llvm/test/CodeGen/X86/sse-insertelt.ll
@@ -352,24 +352,11 @@ define <4 x i32> @insert_i32_two_elts(<4 x i32> %x, i32 %s) {
 }
 
 define <2 x i64> @insert_i64_two_elts(<2 x i64> %x, i64 %s) {
-; SSE2-LABEL: insert_i64_two_elts:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movq %rdi, %xmm0
-; SSE2-NEXT: movq %rdi, %xmm1
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: insert_i64_two_elts:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pinsrq $0, %rdi, %xmm0
-; SSE41-NEXT: pinsrq $1, %rdi, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX-LABEL: insert_i64_two_elts:
-; AVX: # %bb.0:
-; AVX-NEXT: vpinsrq $0, %rdi, %xmm0, %xmm0
-; AVX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
-; AVX-NEXT: retq
+; SSE-LABEL: insert_i64_two_elts:
+; SSE: # %bb.0:
+; SSE-NEXT: movq %rdi, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: retq
   %i0 = insertelement <2 x i64> %x, i64 %s, i32 0
   %i1 = insertelement <2 x i64> %i0, i64 %s, i32 1
   ret <2 x i64> %i1
diff --git a/llvm/test/CodeGen/X86/vec_insert-7.ll b/llvm/test/CodeGen/X86/vec_insert-7.ll
index 0d242bf326f1..cea047453de4 100644
--- a/llvm/test/CodeGen/X86/vec_insert-7.ll
+++ b/llvm/test/CodeGen/X86/vec_insert-7.ll
@@ -8,7 +8,8 @@
 define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
 ; X86-LABEL: mmx_movzl:
 ; X86: ## %bb.0:
-; X86-NEXT: movq {{\.?LCPI[0-9]+_[0-9]+}}, %mm0
+; X86-NEXT: movl $32, %eax
+; X86-NEXT: movd %eax, %mm0
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: mmx_movzl:
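
Editor's note, for illustration only: the function below is not part of the patch or its test suite (the name, types and values are invented), but it sketches the kind of input the new combine targets - a chain of single-use insertelement nodes that fills every lane in ascending order and ends with the last lane. With this patch, visitINSERT_VECTOR_ELT starts from that final insert, walks the single-use chain back to lane 0, and emits one BUILD_VECTOR instead of folding the inserts into the build_vector 'seed' one node at a time.

  define <4 x i32> @chain_to_build_vector(i32 %a, i32 %b, i32 %c, i32 %d) {
    ; Lanes are written in ascending order and each intermediate vector has
    ; exactly one use, so the whole chain can be collected in a single step.
    %v0 = insertelement <4 x i32> undef, i32 %a, i32 0
    %v1 = insertelement <4 x i32> %v0, i32 %b, i32 1
    %v2 = insertelement <4 x i32> %v1, i32 %c, i32 2
    %v3 = insertelement <4 x i32> %v2, i32 %d, i32 3
    ret <4 x i32> %v3
  }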