-
Notifications
You must be signed in to change notification settings - Fork 15.9k
[AMDGPU] Ensure v_mfma_scale_f32_{16x16x128|32x32x64}_f8f6f4 instructions are convergent #178627
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
…ions are convergent The scaled variants of mfma instructions are not properly marked as "convergent", and hence the machine-sink pass sinks them, which is incorrect. This patch ensures that the instructions get marked as "convergent". The new test also covers other mfma variants, but of those only the scale variants are mistreated without the changes from this patch: $ llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic \ -run-pass=machine-sink mfma-convergent.mir \ -debug-only=machine-sink |& grep "Sink instr.*MFMA" Sink instr [...] V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 [...] Sink instr [...] V_MFMA_SCALE_F32_32X32X64_F8F6F4_f4_f4_mac_vgprcd_e64 [...]
59a61cd to
9d05c22
Compare
|
@llvm/pr-subscribers-backend-amdgpu Author: Frederik Harwath (frederik-h) Changes: The scaled variants of mfma instructions are not properly marked as "convergent", and hence the machine-sink pass sinks them, which is incorrect. This patch ensures that the instructions get marked as "convergent". The new test also covers other mfma variants, but only the scale variants are mistreated without the changes from this patch. Full diff: https://github.com/llvm/llvm-project/pull/178627.diff — 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index ca7dfa734e94d..a843cd23fcc62 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -996,6 +996,7 @@ class MAIInst<string OpName, VOPProfile P, SDPatternOperator node, bit Scaled =
Instruction Opcode = !cast<Instruction>(NAME);
bit is_dgemm = 0;
bit is_gfx940_xdl = 0;
+ let isConvergent = 1;
let PseudoInstr = NAME; // FIXME: Why is this not the default
}
@@ -1033,7 +1034,7 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag,
defvar ProfileVGPR = !cast<VOPProfileMAI>("VOPProfileMAI_" # P # "_VCD");
- let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
+ let mayRaiseFPException = 0, ReadsModeReg = 1 in {
// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
let Constraints = !if(NoDstOverlap, "@earlyclobber $vdst", "") in {
def _e64 : MAIInst<OpName, ProfileAGPR,
@@ -1060,7 +1061,7 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node = null_frag,
MFMATable<1, "VGPR", NAME # "_vgprcd_e64", NAME # "_mac_e64">;
}
}
- } // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
+ } // mayRaiseFPException = 0, ReadsModeReg = 1
}
// Provide a wrapper around MAIInst that provides the appended operands from V_MFMA_LD_SCALE_B32
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-convergent.mir b/llvm/test/CodeGen/AMDGPU/mfma-convergent.mir
new file mode 100644
index 0000000000000..6b5f779f6441f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mfma-convergent.mir
@@ -0,0 +1,236 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx9-generic -run-pass=machine-sink %s -stats 2>&1 -o - | \
+# RUN: not grep "Number of machine instructions sunk"
+# machine-sink must not sink MFMA instructions.
+# Ensure that MFMA instructions are marked as convergent to prevent
+# machine-sink from sinking them.
+
+---
+name: test_V_MFMA_F32_32X32X64_F8F6F4_f4_f4_e64
+body: |
+ bb.0:
+ %vsrc0:av_128_align2 = IMPLICIT_DEF
+ %vsrc1:av_128_align2 = IMPLICIT_DEF
+ %ssrc:sreg_64 = IMPLICIT_DEF
+ %vsrc2:areg_512 = IMPLICIT_DEF
+ %vdst:areg_512 = nofpexcept V_MFMA_F32_32X32X64_F8F6F4_f4_f4_e64 %vsrc0, %vsrc1, %vsrc2, 0, 0, implicit $mode, implicit $exec
+ %sdst:sreg_64 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+ bb.1:
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+name: test_V_MFMA_F32_16X16X128_F8F6F4_f4_f4_e64
+body: |
+ bb.0:
+ %vsrc0:av_128_align2 = IMPLICIT_DEF
+ %vsrc1:av_128_align2 = IMPLICIT_DEF
+ %ssrc:sreg_64 = IMPLICIT_DEF
+ %vsrc2:areg_128 = IMPLICIT_DEF
+ %vdst:areg_128 = nofpexcept V_MFMA_F32_16X16X128_F8F6F4_f4_f4_e64 %vsrc0, %vsrc1, %vsrc2, 0, 0, implicit $mode, implicit $exec
+ %sdst:sreg_64 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+name: test_V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64
+body: |
+ bb.0:
+ %vsrc0:av_128_align2 = IMPLICIT_DEF
+ %vsrc1:av_128_align2 = IMPLICIT_DEF
+ %ssrc:sreg_64 = IMPLICIT_DEF
+ %vsrc2:vreg_128_align2 = IMPLICIT_DEF
+ %scale_vsrc0:vgpr_32 = IMPLICIT_DEF
+ %scale_vsrc2:vgpr_32 = IMPLICIT_DEF
+ %vdst:vreg_128_align2 = nofpexcept V_MFMA_SCALE_F32_16X16X128_F8F6F4_f4_f4_vgprcd_e64 %vsrc0, %vsrc1, %vsrc2, 4, 4, %scale_vsrc0, %scale_vsrc2, 0, 0, implicit $mode, implicit $exec
+ %sdst:sreg_64 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+name: test_V_MFMA_SCALE_F32_32X32X64_F8F6F4_f4_f4_mac_vgprcd_e64
+body: |
+ bb.0:
+ %vsrc0:av_128_align2 = IMPLICIT_DEF
+ %vsrc1:av_128_align2 = IMPLICIT_DEF
+ %ssrc:sreg_64 = IMPLICIT_DEF
+ %vsrc2:vreg_512_align2 = IMPLICIT_DEF
+ %scale_vsrc0:vgpr_32 = IMPLICIT_DEF
+ %scale_vsrc2:vgpr_32 = IMPLICIT_DEF
+ %vdst:vreg_512_align2 = nofpexcept V_MFMA_SCALE_F32_32X32X64_F8F6F4_f4_f4_mac_vgprcd_e64 %vsrc0, %vsrc1, %vsrc2, 4, 4, %scale_vsrc0, %scale_vsrc2, 0, 0, implicit $mode, implicit $exec
+ %sdst:sreg_64 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_BRANCH %bb.2
+
+ bb.2:
+ S_ENDPGM 0
+...
+
+---
+name: test_V_MFMA_F32_4X4X1F32_e64
+body: |
+ bb.0:
+ %vsrc0:vgpr_32 = IMPLICIT_DEF
+ %vsrc1:vgpr_32 = IMPLICIT_DEF
+ %ssrc:sreg_64 = IMPLICIT_DEF
+ %vsrc1:vgpr_32 = IMPLICIT_DEF
+ %vsrc2:areg_128 = IMPLICIT_DEF
+ %vdst:areg_128 = V_MFMA_F32_4X4X1F32_e64 %vsrc0, %vsrc1, %vsrc2, 0, 0, 0, implicit $mode, implicit $exec
+ %sdst:sreg_64 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_BRANCH %bb.2
+
+ bb.2: S_ENDPGM 0
+...
+
+---
+name: test_V_MFMA_F32_32X32X1F32_e64
+body: |
+ bb.0:
+ %vsrc0:vgpr_32 = IMPLICIT_DEF
+ %vsrc1:vgpr_32 = IMPLICIT_DEF
+ %ssrc:sreg_64 = IMPLICIT_DEF
+ %vsrc1:vgpr_32 = IMPLICIT_DEF
+ %vsrc2:areg_1024 = IMPLICIT_DEF
+ %vdst:areg_1024 = V_MFMA_F32_32X32X1F32_e64 %vsrc0, %vsrc1, %vsrc2, 0, 0, 0, implicit $mode, implicit $exec
+ %sdst:sreg_64 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_BRANCH %bb.2
+
+ bb.2: S_ENDPGM 0
+...
+
+---
+name: test_V_MFMA_F32_32X32X8F16_mac_e64
+body: |
+ bb.0:
+ %vsrc0:vreg_64_align2 = IMPLICIT_DEF
+ %vsrc1:vreg_64_align2 = IMPLICIT_DEF
+ %ssrc:sreg_64 = IMPLICIT_DEF
+ %vsrc1:vreg_64_align2 = IMPLICIT_DEF
+ %vsrc2:areg_512_align2 = IMPLICIT_DEF
+ %vdst:areg_512_align2 = V_MFMA_F32_32X32X8F16_mac_e64 %vsrc0, %vsrc1, %vsrc2, 0, 0, 0, implicit $mode, implicit $exec
+ %sdst:sreg_64 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_BRANCH %bb.2
+
+ bb.2: S_ENDPGM 0
+...
+
+---
+name: test_V_MFMA_F64_4X4X4F64_e64
+body: |
+ bb.0:
+ %vsrc0:vreg_64_align2 = IMPLICIT_DEF
+ %vsrc1:vreg_64_align2 = IMPLICIT_DEF
+ %ssrc:sreg_64 = IMPLICIT_DEF
+ %vsrc1:vreg_64_align2 = IMPLICIT_DEF
+ %vsrc2:areg_64_align2 = IMPLICIT_DEF
+ %vdst:areg_64_align2 = V_MFMA_F64_4X4X4F64_e64 %vsrc0, %vsrc1, %vsrc2, 0, 0, 0, implicit $mode, implicit $exec
+ %sdst:sreg_64 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_BRANCH %bb.2
+
+ bb.2: S_ENDPGM 0
+...
+
+---
+name: test_V_MFMA_F32_32X32X16_F16_e64
+body: |
+ bb.0:
+ %vsrc0:vreg_128_align2 = IMPLICIT_DEF
+ %vsrc1:vreg_128_align2 = IMPLICIT_DEF
+ %ssrc:sreg_64 = IMPLICIT_DEF
+ %vsrc1:vreg_128_align2 = IMPLICIT_DEF
+ %vsrc2:areg_512_align2 = IMPLICIT_DEF
+ %vdst:areg_512_align2 = V_MFMA_F32_32X32X16_F16_e64 %vsrc0, %vsrc1, %vsrc2, 0, 0, 0, implicit $mode, implicit $exec
+ %sdst:sreg_64 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_BRANCH %bb.2
+
+ bb.2: S_ENDPGM 0
+...
+
+---
+name: test_V_MFMA_F32_32X32X16_BF8_BF8_e64
+body: |
+ bb.0:
+ %vsrc0:vreg_64_align2 = IMPLICIT_DEF
+ %vsrc1:vreg_64_align2 = IMPLICIT_DEF
+ %ssrc:sreg_64 = IMPLICIT_DEF
+ %vsrc1:vreg_64_align2 = IMPLICIT_DEF
+ %vsrc2:areg_512_align2 = IMPLICIT_DEF
+ %vdst:areg_512_align2 = V_MFMA_F32_32X32X16_BF8_BF8_e64 %vsrc0, %vsrc1, %vsrc2, 0, 0, 0, implicit $mode, implicit $exec
+ %sdst:sreg_64 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_BRANCH %bb.2
+
+ bb.2: S_ENDPGM 0
+...
+
+---
+name: test_V_MFMA_F32_16X16X16F16_e64
+body: |
+ bb.0:
+ %vsrc0:vreg_64_align2 = IMPLICIT_DEF
+ %vsrc1:vreg_64_align2 = IMPLICIT_DEF
+ %ssrc:sreg_64 = IMPLICIT_DEF
+ %vsrc1:vreg_64_align2 = IMPLICIT_DEF
+ %vsrc2:areg_128_align2 = IMPLICIT_DEF
+ %vdst:areg_128_align2 = V_MFMA_F32_16X16X16F16_e64 %vsrc0, %vsrc1, %vsrc2, 0, 0, 0, implicit $mode, implicit $exec
+ %sdst:sreg_64 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_BRANCH %bb.2
+
+ bb.2: S_ENDPGM 0
+...
+
+---
+name: test_V_MFMA_I32_32X32X8I8_e64
+body: |
+ bb.0:
+ %vsrc0:vgpr_32 = IMPLICIT_DEF
+ %vsrc1:vgpr_32 = IMPLICIT_DEF
+ %ssrc:sreg_64 = IMPLICIT_DEF
+ %vsrc1:vgpr_32 = IMPLICIT_DEF
+ %vsrc2:areg_512_align2 = IMPLICIT_DEF
+ %vdst:areg_512_align2 = V_MFMA_I32_32X32X8I8_e64 %vsrc0, %vsrc1, %vsrc2, 0, 0, 0, implicit $mode, implicit $exec
+ %sdst:sreg_64 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ S_BRANCH %bb.2
+
+ bb.2: S_ENDPGM 0
+...
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have added tests for other kinds of MFMAs since I did not find a test which already covers the machine-sink/"convergent" behavior. The number of instructions feels a bit overdone. Should I reduce the test?
🐧 Linux x64 Test Results
Failed Tests (click on a test name to see its output): lldb-api :: tools/lldb-dap/exception/runtime-instruments/TestDAP_runtime_instruments.py. If these failures are unrelated to your changes (for example, tests are broken or flaky at HEAD), please open an issue at https://github.com/llvm/llvm-project/issues and add the appropriate label.
The scaled variants of mfma instructions are not properly marked as "convergent", and hence the machine-sink pass sinks them, which is incorrect.
This patch ensures that the instructions get marked as "convergent". The new test also covers other mfma variants, but only the scale variants are mistreated without the changes from this patch.