VFPU-Int: Add a few fast-paths to ReadMatrix/WriteMatrix. Should gain…

… back part of the speed lost in #12217 (which fixed #5399).
hrydgard · Aug 6, 2019 · c05a2df · c05a2df
1 parent 641bb72
commit c05a2df
Showing 1 changed file with 83 additions and 19 deletions.
diff --git a/Core/MIPS/MIPSVFPUUtils.cpp b/Core/MIPS/MIPSVFPUUtils.cpp
@@ -17,7 +17,8 @@
 
 #include <cstdint>
 #include <limits>
-#include <stdio.h>
+#include <cstdio>
+#include <cstring>
 
 #include "Common/BitScan.h"
 #include "Common/CommonFuncs.h"
@@ -245,14 +246,44 @@ void ReadMatrix(float *rd, MatrixSize size, int reg) {
  default: _assert_msg_(JIT, 0, "%s: Bad matrix size", __FUNCTION__);
  }
 
- for (int i = 0; i < side; i++) {
- for (int j = 0; j < side; j++) {
- int index = mtx * 4;
- if (transpose)
- index += ((row+i)&3) + ((col+j)&3)*32;
- else
- index += ((col+j)&3) + ((row+i)&3)*32;
- rd[j*4 + i] = V(index);
+ // The voffset ordering is now integrated in these formulas,
+ // eliminating a table lookup.
+ const float *v = currentMIPS->v + (size_t)mtx * 16;
+ if (transpose) {
+ if (side == 4 && col == 0 && row == 0) {
+ // Simple 4x4 transpose. TODO: Optimize.
+ for (int j = 0; j < 4; j++) {
+ for (int i = 0; i < 4; i++) {
+ rd[j * 4 + i] = v[i * 4 + j];
+ }
+ }
+ } else {
+ for (int j = 0; j < side; j++) {
+ for (int i = 0; i < side; i++) {
+ int index = ((row + i) & 3) * 4 + ((col + j) & 3);
+ rd[j * 4 + i] = v[index];
+ }
+ }
+ }
+ } else {
+ // Fast path
+ if (side == 4 && col == 0 && row == 0) {
+ memcpy(rd, v, sizeof(float) * 16);
+ /*
+ for (int j = 0; j < side; j++) {
+ for (int i = 0; i < side; i++) {
+ int index = mtx * 16 + j * 4 + i;
+ rd[j * 4 + i] = v[index];
+ }
+ }
+ */
+ } else {
+ for (int j = 0; j < side; j++) {
+ for (int i = 0; i < side; i++) {
+ int index = ((col + j) & 3) * 4 + ((row + i) & 3);
+ rd[j * 4 + i] = v[index];
+ }
+ }
  }
  }
 }
@@ -277,16 +308,49 @@ void WriteMatrix(const float *rd, MatrixSize size, int reg) {
  ERROR_LOG_REPORT(CPU, "Write mask used with vfpu matrix instruction.");
  }
 
- for (int i = 0; i < side; i++) {
- for (int j = 0; j < side; j++) {
- // Hm, I wonder if this should affect matrices at all.
- if (j != side -1 || !currentMIPS->VfpuWriteMask(i)) {
- int index = mtx * 4;
- if (transpose)
- index += ((row+i)&3) + ((col+j)&3)*32;
- else
- index += ((col+j)&3) + ((row+i)&3)*32;
- V(index) = rd[j*4+i];
+ // The voffset ordering is now integrated in these formulas,
+ // eliminating a table lookup.
+ float *v = currentMIPS->v + (size_t)mtx * 16;
+ if (transpose) {
+ // Fast path
+ if (side == 4 && row == 0 && col == 0 && currentMIPS->VfpuWriteMask() == 0x0) {
+ for (int j = 0; j < side; j++) {
+ for (int i = 0; i < side; i++) {
+ int index = i * 4 + j;
+ v[index] = rd[j * 4 + i];
+ }
+ }
+ } else {
+ for (int j = 0; j < side; j++) {
+ for (int i = 0; i < side; i++) {
+ // Hm, I wonder if this should affect matrices at all.
+ if (j != side - 1 || !currentMIPS->VfpuWriteMask(i)) {
+ int index = ((row + i) & 3) * 4 + ((col + j) & 3);
+ v[index] = rd[j * 4 + i];
+ }
+ }
+ }
+ }
+ } else {
+ if (side == 4 && row == 0 && col == 0 && currentMIPS->VfpuWriteMask() == 0x0) {
+ memcpy(v, rd, sizeof(float) * 16);
+ /*
+ for (int j = 0; j < side; j++) {
+ for (int i = 0; i < side; i++) {
+ int index = j * 4 + i;
+ v[index] = rd[j * 4 + i];
+ }
+ }
+ */
+ } else {
+ for (int j = 0; j < side; j++) {
+ for (int i = 0; i < side; i++) {
+ // Hm, I wonder if this should affect matrices at all.
+ if (j != side - 1 || !currentMIPS->VfpuWriteMask(i)) {
+ int index = ((col + j) & 3) * 4 + ((row + i) & 3);
+ v[index] = rd[j * 4 + i];
+ }
+ }
  }
  }
  }