[RISCV][NFC] Add test case for SLP reduction vectorization failure

Horizontal reductions are still vectorized on RISC-V even though TTI reports a maximum SLP VF of 1, which is meant to disable SLP. This can cause the cost model to think it can vectorize a gather into smaller, widened loads, when it will actually fail to do so. This should ultimately be fixed when SLP is re-enabled for RISC-V. Reviewed By: reames. Differential Revision: https://reviews.llvm.org/D146529
2023-03-21 14:46:15 +00:00 · 2023-03-21 14:46:15 +00:00 · e69f8bac42
commit e69f8bac42
parent 67852bff58
1 changed files with 92 additions and 0 deletions
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
@@ -821,4 +821,96 @@ entry:
  ret i64 %add.15
 }

+declare i32 @llvm.abs.i32(i32, i1)

+; FIXME: This horizontal reduction occurs because the cost model thinks it can
+; vectorize the loads here. However, because -riscv-v-slp-max-vf is set to 1 by
+; default, tryToVectorizeList fails and we end up with this very expensive
+; scalarized load.
+;
+; This is the code the cost model thinks it's going to generate, which you can
+; get by passing -riscv-v-slp-max-vf=0
+;
+; define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) #0 {
+;   %p.2 = getelementptr inbounds i32, ptr %p, i64 %stride
+;   %q.2 = getelementptr inbounds i32, ptr %q, i64 %stride
+;   %p.3 = getelementptr inbounds i32, ptr %p.2, i64 1
+;   %q.3 = getelementptr inbounds i32, ptr %q.2, i64 1
+;   %1 = load <2 x i32>, ptr %p, align 4
+;   %2 = load <2 x i32>, ptr %q, align 4
+;   %x.2 = load i32, ptr %p.2, align 4
+;   %y.2 = load i32, ptr %q.2, align 4
+;   %x.3 = load i32, ptr %p.3, align 4
+;   %y.3 = load i32, ptr %q.3, align 4
+;   %3 = shufflevector <2 x i32> %1, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+;   %4 = insertelement <4 x i32> %3, i32 %x.2, i32 2
+;   %5 = insertelement <4 x i32> %4, i32 %x.3, i32 3
+;   %6 = shufflevector <2 x i32> %2, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+;   %7 = insertelement <4 x i32> %6, i32 %y.2, i32 2
+;   %8 = insertelement <4 x i32> %7, i32 %y.3, i32 3
+;   %9 = sub <4 x i32> %5, %8
+;   %10 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %9, i1 true)
+;   %11 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %10)
+;   ret i32 %11
+; }
+define i32 @stride_sum_abs_diff(ptr %p, ptr %q, i64 %stride) { ; returns |p[0]-q[0]| + |p[1]-q[1]| + |p[stride]-q[stride]| + |p[stride+1]-q[stride+1]|
+; CHECK-LABEL: @stride_sum_abs_diff(
+; CHECK-NEXT:    [[P_1:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
+; CHECK-NEXT:    [[Q_1:%.*]] = getelementptr inbounds i32, ptr [[Q:%.*]], i64 1
+; CHECK-NEXT:    [[P_2:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[STRIDE:%.*]]
+; CHECK-NEXT:    [[Q_2:%.*]] = getelementptr inbounds i32, ptr [[Q]], i64 [[STRIDE]]
+; CHECK-NEXT:    [[P_3:%.*]] = getelementptr inbounds i32, ptr [[P_2]], i64 1
+; CHECK-NEXT:    [[Q_3:%.*]] = getelementptr inbounds i32, ptr [[Q_2]], i64 1
+; CHECK-NEXT:    [[X_0:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT:    [[Y_0:%.*]] = load i32, ptr [[Q]], align 4
+; CHECK-NEXT:    [[X_1:%.*]] = load i32, ptr [[P_1]], align 4
+; CHECK-NEXT:    [[Y_1:%.*]] = load i32, ptr [[Q_1]], align 4
+; CHECK-NEXT:    [[X_2:%.*]] = load i32, ptr [[P_2]], align 4
+; CHECK-NEXT:    [[Y_2:%.*]] = load i32, ptr [[Q_2]], align 4
+; CHECK-NEXT:    [[X_3:%.*]] = load i32, ptr [[P_3]], align 4
+; CHECK-NEXT:    [[Y_3:%.*]] = load i32, ptr [[Q_3]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X_0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X_1]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[X_2]], i32 2
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[X_3]], i32 3
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[Y_0]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[Y_1]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[Y_2]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[Y_3]], i32 3
+; CHECK-NEXT:    [[TMP9:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call <4 x i32> @llvm.abs.v4i32(<4 x i32> [[TMP9]], i1 true)
+; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
+; CHECK-NEXT:    ret i32 [[TMP11]]
+;
+  %x.0 = load i32, ptr %p ; element 0: p[0]
+  %y.0 = load i32, ptr %q ; element 0: q[0]
+  %sub.0 = sub i32 %x.0, %y.0
+  %abs.0 = tail call i32 @llvm.abs.i32(i32 %sub.0, i1 true) ; |p[0] - q[0]|; the i1 true flag makes an INT_MIN input yield poison
+
+  %p.1 = getelementptr inbounds i32, ptr %p, i64 1 ; element 1: &p[1], adjacent to element 0
+  %x.1 = load i32, ptr %p.1
+  %q.1 = getelementptr inbounds i32, ptr %q, i64 1 ; element 1: &q[1], adjacent to element 0
+  %y.1 = load i32, ptr %q.1
+  %sub.1 = sub i32 %x.1, %y.1
+  %abs.1 = tail call i32 @llvm.abs.i32(i32 %sub.1, i1 true)
+  %sum.0 = add i32 %abs.0, %abs.1 ; first link of the serial add chain that forms the reduction
+
+  %p.2 = getelementptr inbounds i32, ptr %p, i64 %stride ; element 2: &p[stride]; %stride is a runtime argument, so its distance from elements 0-1 is unknown
+  %q.2 = getelementptr inbounds i32, ptr %q, i64 %stride ; element 2: &q[stride], same runtime stride as %p.2
+
+  %x.2 = load i32, ptr %p.2
+  %y.2 = load i32, ptr %q.2
+  %sub.2 = sub i32 %x.2, %y.2
+  %abs.2 = tail call i32 @llvm.abs.i32(i32 %sub.2, i1 true)
+  %sum.1 = add i32 %sum.0, %abs.2
+
+  %p.3 = getelementptr inbounds i32, ptr %p.2, i64 1 ; element 3: &p[stride + 1], adjacent to element 2
+  %x.3 = load i32, ptr %p.3
+  %q.3 = getelementptr inbounds i32, ptr %q.2, i64 1 ; element 3: &q[stride + 1], adjacent to element 2
+  %y.3 = load i32, ptr %q.3
+  %sub.3 = sub i32 %x.3, %y.3
+  %abs.3 = tail call i32 @llvm.abs.i32(i32 %sub.3, i1 true)
+  %sum.2 = add i32 %sum.1, %abs.3 ; last link of the add chain: total of all four absolute differences
+
+  ret i32 %sum.2
+}