#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/**
 * gcc -o mm -g -march=armv7 -mfpu=neon-vfpv4 matrix_matrix.c
 * or cross compile
 * arm-linux-gnueabihf-gcc -o mm -g -march=armv7 -mfpu=neon-vfpv4 matrix_matrix.c
 *
 * test with a = np.arange(4*n*m + 4*m).reshape(4*n, 4*m)
 *           a.dot(a.T)
 *
 * this file contains all the elements to show ARMv7 usage of the
 * SIMD (NEON) architecture
 */

void simple_transpose(float *, int, int, float *);
static inline void my_sgemm_4x4(float *, float *, float *, int, int, int);

/**
 * helper routine (only for documentation/demonstration)
 *
 * transpose matrix_a (size n_rows_a x n_cols_a) into matrix output
 */
void simple_transpose(float *matrix_a, int n_rows_a, int n_cols_a, float *output)
{
    for (int ra = 0; ra < n_rows_a; ra++)
        for (int ca = 0; ca < n_cols_a; ca++)
            output[n_rows_a*ca + ra] = matrix_a[n_cols_a*ra + ca];  /* output[ca][ra] = matrix_a[ra][ca] */
    return;
}

/**
 * Kernel function containing the optimization: the multiplication of
 * 4 x 4 fragments (tiles) of large, column based matrices matrix_a
 * and matrix_b.
 *
 * arguments:
 *  - (float *) matrix_a: 4 x 4 tile of matrix a
 *  - (float *) matrix_b: 4 x 4 tile of matrix b
 *  - (float *) output:   4 x 4 result tile (accumulated and written back)
 *  - (int) off_a, off_b, off_o: byte offset between the last element of one
 *    tile row and the first element of the next row, for matrix_a, matrix_b
 *    and output respectively
 *
 * a plain-C sketch of this tile update follows the function.
 *
 * details documented here ![Using_SIMD](/home/eduard/work/wikiwhat/doc/Using_SIMD.md)
 */
static inline void my_sgemm_4x4(float *matrix_a, float *matrix_b, float *output,
                                int off_a, int off_b, int off_o)
{
    /** \code */
    asm volatile (
        "# Start manual code                        \n\t"
        "# Matrix Multiplication                    \n\n\t"

        /* macro section */
        ".macro mul_col_f32 res_q, col0_d, col1_d   \n\t"
        "vmla.f32   \\res_q, q8,  \\col0_d[0]       \n\t"
        "vmla.f32   \\res_q, q9,  \\col0_d[1]       \n\t"
        "vmla.f32   \\res_q, q10, \\col1_d[0]       \n\t"
        "vmla.f32   \\res_q, q11, \\col1_d[1]       \n\t"
        ".endm                                      \n\n\t"
        /* end macro section */

        /* load current state of output -> q12 - q15 */
        "vld1.32    {q12}, [%6]!                    \n\t"
        "add        %6, %6, %5                      \n\t"  /* add the offset to reach the start of the next row */
        "vld1.32    {q13}, [%6]!                    \n\t"
        "add        %6, %6, %5                      \n\t"
        "vld1.32    {q14}, [%6]!                    \n\t"
        "add        %6, %6, %5                      \n\t"
        "vld1.32    {q15}, [%6]!                    \n\t"

        /* load matrix_b (transposed!) -> q8 - q11 */
        "vld1.32    {q8},  [%2]!                    \n\t"
        "add        %2, %2, %4                      \n\t"
        "vld1.32    {q9},  [%2]!                    \n\t"
        "add        %2, %2, %4                      \n\t"
        "vld1.32    {q10}, [%2]!                    \n\t"
        "add        %2, %2, %4                      \n\t"
        "vld1.32    {q11}, [%2]!                    \n\t"

        /* load matrix_a -> q0 - q3 */
        "vld1.32    {q0}, [%1]!                     \n\t"
        "add        %1, %1, %3                      \n\t"
        "vld1.32    {q1}, [%1]!                     \n\t"
        "add        %1, %1, %3                      \n\t"
        "vld1.32    {q2}, [%1]!                     \n\t"
        "add        %1, %1, %3                      \n\t"
        "vld1.32    {q3}, [%1]!                     \n\t"

        /* end load registers,
         * start the actual matrix multiplication as defined in the macro */
        "mul_col_f32 q12, d0, d1                    \n\t"
        "mul_col_f32 q13, d2, d3                    \n\t"
        "mul_col_f32 q14, d4, d5                    \n\t"
        "mul_col_f32 q15, d6, d7                    \n\n\t"

        /* store the result [q12 - q15] into output */
        "vst1.32    {q12}, [%0]!                    \n\t"
        "add        %0, %0, %5                      \n\t"
        "vst1.32    {q13}, [%0]!                    \n\t"
        "add        %0, %0, %5                      \n\t"
        "vst1.32    {q14}, [%0]!                    \n\t"
        "add        %0, %0, %5                      \n\t"
        "vst1.32    {q15}, [%0]!                    \n\t"

        /* start argument section of the inline assembler */
        : "+r"(output)
        : "r"(&matrix_a[0]), "r"(&matrix_b[0]), "r"(off_a), "r"(off_b),
          "r"(off_o), "r"(&output[0])
        : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
          "q12", "q13", "q14", "q15", "memory");
    /** \endcode */
    return;
}
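
/*
 * Reference sketch (not part of the original file): a scalar C version of the
 * tile update that the mul_col_f32 macro performs above.  It assumes a, b and
 * out hold the 4 x 4 tiles exactly as the kernel loads them into q0-q3,
 * q8-q11 and q12-q15: each output vector accumulates the four matrix_b
 * vectors, each scaled by one lane of the corresponding matrix_a vector.
 */
static void my_sgemm_4x4_reference(const float a[4][4], const float b[4][4],
                                   float out[4][4])
{
    for (int r = 0; r < 4; r++)          /* one output vector (q12..q15)      */
        for (int x = 0; x < 4; x++)      /* one vmla per lane of the a vector */
            for (int c = 0; c < 4; c++)  /* the four lanes of each q vector   */
                out[r][c] += a[r][x] * b[x][c];
}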
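
/*
 * Illustrative call of the kernel (an assumption -- the real call site is in
 * my_sgemm below): the off_* arguments are the byte offsets added after each
 * 4-float load/store to skip to the next line of the tile, matching the
 * offset_a/offset_b values computed in my_sgemm.  tile_a, tile_b, tile_out
 * and n_cols_out are hypothetical names:
 *
 *     my_sgemm_4x4(tile_a, tile_b, tile_out,
 *                  4*(n_cols_a - 4), 4*(n_cols_b - 4), 4*(n_cols_out - 4));
 */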

/**
 * matrix-matrix multiplication of some matrix_a and some matrix_b
 * (works only for sizes 4*n x 4*m)
 * the order is column based and output = a x b.transpose()
 *
 * the multiplication is based on a patch-wise standard multiplication
 * algorithm, each patch of size 4 x 4
 */
void my_sgemm(float *matrix_a, int n_rows_a, int n_cols_a,
              float *matrix_b, int n_rows_b, int n_cols_b,
              float *output)
{
    int offset_a = 4*(n_cols_a - 4);
    int offset_b = 4*(n_cols_b - 4);

    for (int i = 0; i