# Motivation
## How to solve matrix-matrix multiplication

In [1]:
import numpy as np
import pandas as pd

Simple approach in C

~~~
 /* Naive matrix product: multiply (m x q) = first (m x p) * second (p x q).
    Assumes sum is 0 before the first iteration (its initialization is not
    part of this fragment). */
 for (c = 0; c < m; c++) {
     for (d = 0; d < q; d++) {
         /* Dot product of row c of first with column d of second. */
         for (k = 0; k < p; k++) {
             sum += first[c][k] * second[k][d];
         }

         multiply[c][d] = sum;
         sum = 0;  /* reset the accumulator for the next output element */
     }
 }
~~~

In [2]:
def mymatrixmult(A, B):
    """Multiply two 2-D arrays with a naive triple loop (teaching baseline).

    Parameters
    ----------
    A : numpy.ndarray
        Left operand of shape (n_rows, n_inner).
    B : numpy.ndarray
        Right operand of shape (n_inner, n_cols).

    Returns
    -------
    numpy.ndarray
        Array of shape (n_rows, n_cols) created with ``np.zeros`` and filled
        with the matrix product of ``A`` and ``B``.

    Raises
    ------
    ValueError
        If the number of columns of ``A`` differs from the number of rows
        of ``B``.
    """
    n_rows, n_inner = A.shape[0], A.shape[1]
    n_cols = B.shape[1]
    if B.shape[0] != n_inner:
        raise ValueError(
            "shapes %s and %s are not aligned for matrix multiplication"
            % (A.shape, B.shape)
        )

    y = np.zeros((n_rows, n_cols))
    for i in range(n_rows):
        for j in range(n_cols):
            # The summation index runs over the shared inner dimension
            # (columns of A == rows of B), not over the rows of A.
            for k in range(n_inner):
                y[i, j] += A[i, k] * B[k, j]
    return y
 
# 200 x 200 matrix holding 0..39999, then rescaled by its overall average.
m = np.reshape(np.arange(40000), (200, 200))
m1 = np.divide(m, np.average(m))

### Takes a while: $200^3$ = 8 million multiply-add operations

In [3]:
# Naive triple-loop product of m1 with its own transpose.
mymatrixmult(m1, m1.transpose())

array([[6.61708085e-03, 1.65675784e-02, 2.65180759e-02, ...,
 1.96686509e+00, 1.97681559e+00, 1.98676609e+00],
 [1.65675784e-02, 4.65190759e-02, 7.64705735e-02, ...,
 5.91701260e+00, 5.94696409e+00, 5.97691559e+00],
 [2.65180759e-02, 7.64705735e-02, 1.26423071e-01, ...,
 9.86716010e+00, 9.91711260e+00, 9.96706510e+00],
 ...,
 [1.96686509e+00, 5.91701260e+00, 9.86716010e+00, ...,
 7.80145924e+02, 7.84096071e+02, 7.88046219e+02],
 [1.97681559e+00, 5.94696409e+00, 9.91711260e+00, ...,
 7.84096071e+02, 7.88066220e+02, 7.92036368e+02],
 [1.98676609e+00, 5.97691559e+00, 9.96706510e+00, ...,
 7.88046219e+02, 7.92036368e+02, 7.96026518e+02]])

In [4]:
# Same product computed by numpy's built-in dot.
np.dot(m1, m1.T)

array([[6.61708085e-03, 1.65675784e-02, 2.65180759e-02, ...,
 1.96686509e+00, 1.97681559e+00, 1.98676609e+00],
 [1.65675784e-02, 4.65190759e-02, 7.64705735e-02, ...,
 5.91701260e+00, 5.94696409e+00, 5.97691559e+00],
 [2.65180759e-02, 7.64705735e-02, 1.26423071e-01, ...,
 9.86716010e+00, 9.91711260e+00, 9.96706510e+00],
 ...,
 [1.96686509e+00, 5.91701260e+00, 9.86716010e+00, ...,
 7.80145924e+02, 7.84096071e+02, 7.88046219e+02],
 [1.97681559e+00, 5.94696409e+00, 9.91711260e+00, ...,
 7.84096071e+02, 7.88066220e+02, 7.92036368e+02],
 [1.98676609e+00, 5.97691559e+00, 9.96706510e+00, ...,
 7.88046219e+02, 7.92036368e+02, 7.96026518e+02]])

### Now numpy: $2000^3$ = 8 billion multiply-add operations

In [5]:
# 2000 x 2000 matrix holding 0..3999999, then rescaled by its overall average.
M = np.reshape(np.arange(4000000), (2000, 2000))
M1 = np.divide(M, np.average(M))

In [7]:
# Product of the 2000 x 2000 matrix with its transpose via numpy's dot.
np.dot(M1, M1.T)

array([[6.66167083e-04, 1.66566758e-03, 2.66516808e-03, ...,
 1.99666867e+00, 1.99766817e+00, 1.99866767e+00],
 [1.66566758e-03, 4.66516908e-03, 7.66467058e-03, ...,
 5.99167016e+00, 5.99466966e+00, 5.99766917e+00],
 [2.66516808e-03, 7.66467058e-03, 1.26641731e-02, ...,
 9.98667166e+00, 9.99167116e+00, 9.99667067e+00],
 ...,
 [1.99666867e+00, 5.99167016e+00, 9.98667166e+00, ...,
 7.98001466e+03, 7.98400966e+03, 7.98800466e+03],
 [1.99766817e+00, 5.99466966e+00, 9.99167116e+00, ...,
 7.98400966e+03, 7.98800666e+03, 7.99200366e+03],
 [1.99866767e+00, 5.99766917e+00, 9.99667067e+00, ...,
 7.98800466e+03, 7.99200366e+03, 7.99600267e+03]])

In [15]:
# Small n x n example matrix with consecutive integers, filled row by row.
n = 12
np.reshape(np.arange(n * n), (n, n))

array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
 [ 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23],
 [ 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
 [ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
 [ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
 [ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71],
 [ 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83],
 [ 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95],
 [ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107],
 [108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119],
 [120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131],
 [132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143]])

## Splitting the matrix

Two reasons:

1. to optimize cache usage
2. to exploit SIMD instructions



## Open question

No idea yet about the following: $y = \tanh(M)$

In [8]:
# Element-wise tanh applied to the product of M1 with its transpose.
y = np.tanh(np.dot(M1, M1.T))

In [24]:
# NOTE(review): the saved output for this cell, (1000, 1000), looks stale.
# M1 is built as a 2000 x 2000 array earlier in this notebook, so a fresh
# Restart & Run All should report (2000, 2000) here — confirm and re-run.
y.shape

(1000, 1000)

In [51]:
# NOTE(review): unexplained back-of-the-envelope ratio; it evaluates to 16.0.
# The operands look like cubic operation counts for a 512-wide (4*128) versus
# a 128-wide workload, weighted by 16 and 64 — TODO confirm what the constants
# mean and give them names.
((4*128)**3)*16/((128)**3*64)

16.0

# das wichtig