Skip to content

[LoongArch] Optimize inserting extracted elements#146018

Merged
zhaoqi5 merged 7 commits into
main from
users/zhaoqi5/opt-insert-extract-element
Jul 17, 2025
Merged

[LoongArch] Optimize inserting extracted elements#146018
zhaoqi5 merged 7 commits into
main from
users/zhaoqi5/opt-insert-extract-element

Conversation

@zhaoqi5

@zhaoqi5 zhaoqi5 commented Jun 27, 2025

Copy link
Copy Markdown
Contributor

No description provided.

@llvmbot

llvmbot commented Jun 27, 2025

Copy link
Copy Markdown
Member

@llvm/pr-subscribers-backend-loongarch

Author: ZhaoQi (zhaoqi5)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/146018.diff

5 Files Affected:

  • (modified) llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td (+8-5)
  • (modified) llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td (+4-1)
  • (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll (+2-10)
  • (modified) llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll (-4)
  • (modified) llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll (+2-4)
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index ff7b0f2ae3f25..915dc803bdbd7 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1593,11 +1593,14 @@ def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
           (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
 def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
           (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;
-
-def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
-          (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
-def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
-          (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;
+def : Pat<(vector_insert v8f32:$xd, (f32 (vector_extract v8f32:$xj, uimm3:$imm1)), uimm3:$imm2),
+          (XVINSGR2VR_W $xd, (XVPICKVE2GR_W v8f32:$xj, uimm3:$imm1), uimm3:$imm2)>;
+def : Pat<(vector_insert v4f64:$xd, (f64 (vector_extract v4f64:$xj, uimm2:$imm1)), uimm2:$imm2),
+          (XVINSGR2VR_D $xd, (XVPICKVE2GR_D v4f64:$xj, uimm2:$imm1), uimm2:$imm2)>;
+def : Pat<(vector_insert v8f32:$xd, FPR32:$fj, uimm3:$imm),
+          (XVINSGR2VR_W $xd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
+def : Pat<(vector_insert v4f64:$xd, FPR64:$fj, uimm2:$imm),
+          (XVINSGR2VR_D $xd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;
 
 // scalar_to_vector
 def : Pat<(v8f32 (scalar_to_vector FPR32:$fj)),
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index d73d78083ddcd..34c6ffc6727f1 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -1791,7 +1791,10 @@ def : Pat<(vector_insert v4i32:$vd, GRLenVT:$rj, uimm2:$imm),
           (VINSGR2VR_W v4i32:$vd, GRLenVT:$rj, uimm2:$imm)>;
 def : Pat<(vector_insert v2i64:$vd, GRLenVT:$rj, uimm1:$imm),
           (VINSGR2VR_D v2i64:$vd, GRLenVT:$rj, uimm1:$imm)>;
-
+def : Pat<(vector_insert v4f32:$vd, (f32 (vector_extract v4f32:$vj, uimm2:$imm1)), uimm2:$imm2),
+          (VINSGR2VR_W $vd, (VPICKVE2GR_W v4f32:$vj, uimm2:$imm1), uimm2:$imm2)>;
+def : Pat<(vector_insert v2f64:$vd, (f64 (vector_extract v2f64:$vj, uimm1:$imm1)), uimm1:$imm2),
+          (VINSGR2VR_D $vd, (VPICKVE2GR_D v2f64:$vj, uimm1:$imm1), uimm1:$imm2)>;
 def : Pat<(vector_insert v4f32:$vd, FPR32:$fj, uimm2:$imm),
           (VINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm2:$imm)>;
 def : Pat<(vector_insert v2f64:$vd, FPR64:$fj, uimm1:$imm),
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
index f3bec11810e9b..f154dd3b8eb3c 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fix-xvshuf.ll
@@ -7,20 +7,12 @@ define <4 x double> @shufflevector_v4f64(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: shufflevector_v4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 0
-; CHECK-NEXT:    movgr2fr.d $fa2, $a0
-; CHECK-NEXT:    xvpickve2gr.d $a0, $xr1, 2
-; CHECK-NEXT:    movgr2fr.d $fa3, $a0
-; CHECK-NEXT:    movfr2gr.d $a0, $fa2
 ; CHECK-NEXT:    xvinsgr2vr.d $xr2, $a0, 0
-; CHECK-NEXT:    movfr2gr.d $a0, $fa3
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr1, 2
 ; CHECK-NEXT:    xvinsgr2vr.d $xr2, $a0, 1
 ; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT:    movgr2fr.d $fa0, $a0
-; CHECK-NEXT:    xvpickve2gr.d $a0, $xr1, 3
-; CHECK-NEXT:    movgr2fr.d $fa1, $a0
-; CHECK-NEXT:    movfr2gr.d $a0, $fa0
 ; CHECK-NEXT:    xvinsgr2vr.d $xr2, $a0, 2
-; CHECK-NEXT:    movfr2gr.d $a0, $fa1
+; CHECK-NEXT:    xvpickve2gr.d $a0, $xr1, 3
 ; CHECK-NEXT:    xvinsgr2vr.d $xr2, $a0, 3
 ; CHECK-NEXT:    xvori.b $xr0, $xr2, 0
 ; CHECK-NEXT:    ret
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
index a5d3a0d395b3c..ddbc159ca94ba 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/insert-extract-element.ll
@@ -5,8 +5,6 @@ define <8 x float> @insert_extract_v8f32(<8 x float> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v8f32:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvpickve2gr.w $a0, $xr0, 7
-; CHECK-NEXT:    movgr2fr.w $fa1, $a0
-; CHECK-NEXT:    movfr2gr.s $a0, $fa1
 ; CHECK-NEXT:    xvinsgr2vr.w $xr0, $a0, 1
 ; CHECK-NEXT:    ret
 entry:
@@ -19,8 +17,6 @@ define <4 x double> @insert_extract_v4f64(<4 x double> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v4f64:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xvpickve2gr.d $a0, $xr0, 3
-; CHECK-NEXT:    movgr2fr.d $fa1, $a0
-; CHECK-NEXT:    movfr2gr.d $a0, $fa1
 ; CHECK-NEXT:    xvinsgr2vr.d $xr0, $a0, 1
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll
index dcf23f0240712..4c34e0f49b8c8 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/insert-extract-element.ll
@@ -4,8 +4,7 @@
 define <4 x float> @insert_extract_v4f32(<4 x float> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v4f32:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vreplvei.w $vr1, $vr0, 3
-; CHECK-NEXT:    movfr2gr.s $a0, $fa1
+; CHECK-NEXT:    vpickve2gr.w $a0, $vr0, 3
 ; CHECK-NEXT:    vinsgr2vr.w $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:
@@ -17,8 +16,7 @@ entry:
 define <2 x double> @insert_extract_v2f64(<2 x double> %a) nounwind {
 ; CHECK-LABEL: insert_extract_v2f64:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    vreplvei.d $vr1, $vr0, 1
-; CHECK-NEXT:    movfr2gr.d $a0, $fa1
+; CHECK-NEXT:    vpickve2gr.d $a0, $vr0, 1
 ; CHECK-NEXT:    vinsgr2vr.d $vr0, $a0, 0
 ; CHECK-NEXT:    ret
 entry:

@tangaac

tangaac commented Jun 27, 2025

Copy link
Copy Markdown
Member

We could use VEXTRINS instructions instead.

@zhaoqi5

zhaoqi5 commented Jun 27, 2025

Copy link
Copy Markdown
Contributor Author

We could use VEXTRINS instructions instead.

Great, using one vextrins instruction is enough. I will modify it later. Thanks.

@zhaoqi5 zhaoqi5 force-pushed the users/zhaoqi5/opt-insert-extract-element branch from 85cb5e9 to aab3fee Compare July 1, 2025 03:12
@zhaoqi5 zhaoqi5 changed the title [LoongArch] Optimize inserting extracted fp elements [LoongArch] Optimize inserting extracted elements Jul 1, 2025
@tangaac

tangaac commented Jul 2, 2025

Copy link
Copy Markdown
Member
  foreach imm1 = 0...1 in {
    foreach imm2 = 0...1 in {
      defvar Imm = !or(!shl(imm2, 4), imm1);
      def : Pat<(vector_insert(vector_insert v4i64:$xd,
                     (GRLenVT(vector_extract v4i64:$xj, imm1)), imm2),
                    (GRLenVT(vector_extract v4i64:$xj, !add(imm1, 2))),
                    !add(imm2, 2)),
                (XVEXTRINS_D $xd, $xj, Imm)>;
    }
  }

We could also support XVEXTRINS.{W/D} instructions.

@tangaac tangaac closed this Jul 2, 2025
@tangaac tangaac reopened this Jul 2, 2025
@zhaoqi5

zhaoqi5 commented Jul 2, 2025

Copy link
Copy Markdown
Contributor Author
  foreach imm1 = 0...1 in {
    foreach imm2 = 0...1 in {
      defvar Imm = !or(!shl(imm2, 4), imm1);
      def : Pat<(vector_insert(vector_insert v4i64:$xd,
                     (GRLenVT(vector_extract v4i64:$xj, imm1)), imm2),
                    (GRLenVT(vector_extract v4i64:$xj, !add(imm1, 2))),
                    !add(imm2, 2)),
                (XVEXTRINS_D $xd, $xj, Imm)>;
    }
  }

We could also support XVEXTRINS.{W/D} instructions.

XVEXTRINS operates on two elements at once, one in the low 128 bits and one in the high 128 bits, so two vector_extract + vector_insert pairs are needed to match it. The current tests cannot be optimized this way. I will add new tests and support it.

@tangaac

tangaac commented Jul 2, 2025

Copy link
Copy Markdown
Member

Use this patch to support extracting i8/i16 elements from the high 128-bit part of a 256-bit vector.
Please update the tests too.
support-extract-i8-i16-type-element-from-hi128-part.txt

@zhaoqi5 zhaoqi5 force-pushed the users/zhaoqi5/opt-insert-extract-element branch from aab3fee to 00a0512 Compare July 2, 2025 10:26
@zhaoqi5

zhaoqi5 commented Jul 2, 2025

Copy link
Copy Markdown
Contributor Author

Use this patch to support extracting i8/i16 elements from the high 128-bit part of a 256-bit vector. Please update the tests too. support-extract-i8-i16-type-element-from-hi128-part.txt

Done. Thanks for your efforts on this.

@tangaac

tangaac commented Jul 3, 2025

Copy link
Copy Markdown
Member

This change improves vector_insert for lasx

// XVINSGR2VR_{W/D}
def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
          (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
          (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;

def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
          (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
          (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;

-->

// XVINSGR2VR_{W/D}
def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
          (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
          (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;

// XVINSVE0_{W/D}
def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
          (XVINSVE0_W $vd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
          (XVINSVE0_D $vd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), uimm2:$imm)>;

@zhaoqi5

zhaoqi5 commented Jul 3, 2025

Copy link
Copy Markdown
Contributor Author

This change improves vector_insert for lasx

// XVINSGR2VR_{W/D}
def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
          (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
          (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;

def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
          (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
          (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;

-->

// XVINSGR2VR_{W/D}
def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
          (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
          (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;

// XVINSVE0_{W/D}
def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
          (XVINSVE0_W $vd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
          (XVINSVE0_D $vd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), uimm2:$imm)>;

The main purpose of this PR is to optimize inserting elements extracted from 128-bit or 256-bit vectors. I think we can do this optimization for vector_insert in a later patch.

@tangaac

tangaac commented Jul 3, 2025

Copy link
Copy Markdown
Member

This change improves vector_insert for lasx

// XVINSGR2VR_{W/D}
def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
          (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
          (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;

def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
          (XVINSGR2VR_W $vd, (COPY_TO_REGCLASS FPR32:$fj, GPR), uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
          (XVINSGR2VR_D $vd, (COPY_TO_REGCLASS FPR64:$fj, GPR), uimm2:$imm)>;

-->

// XVINSGR2VR_{W/D}
def : Pat<(vector_insert v8i32:$xd, GRLenVT:$rj, uimm3:$imm),
          (XVINSGR2VR_W v8i32:$xd, GRLenVT:$rj, uimm3:$imm)>;
def : Pat<(vector_insert v4i64:$xd, GRLenVT:$rj, uimm2:$imm),
          (XVINSGR2VR_D v4i64:$xd, GRLenVT:$rj, uimm2:$imm)>;

// XVINSVE0_{W/D}
def : Pat<(vector_insert v8f32:$vd, FPR32:$fj, uimm3:$imm),
          (XVINSVE0_W $vd, (SUBREG_TO_REG (i64 0), FPR32:$fj, sub_32), uimm3:$imm)>;
def : Pat<(vector_insert v4f64:$vd, FPR64:$fj, uimm2:$imm),
          (XVINSVE0_D $vd, (SUBREG_TO_REG (i64 0), FPR64:$fj, sub_64), uimm2:$imm)>;

The main purpose of this PR is to optimize inserting elements extracted from 128-bit or 256-bit vectors. I think we can do this optimization for vector_insert in a later patch.

OK

@zhaoqi5 zhaoqi5 requested review from SixWeining and heiher July 14, 2025 10:56
@zhaoqi5 zhaoqi5 merged commit d218011 into main Jul 17, 2025
9 checks passed
@zhaoqi5 zhaoqi5 deleted the users/zhaoqi5/opt-insert-extract-element branch July 17, 2025 07:44
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants