5. Floating Point

5.19. FP Example : Array Multiplication

X=X+Y*Z

All matrices are 32 × 32, with 64-bit double-precision elements.

Untitled

C code

/*
 * mm - multiply-accumulate for 32 x 32 double-precision matrices:
 *      x = x + y * z
 *
 * x: matrix that is read and updated in place.
 * y: left operand of the product (read only).
 * z: right operand of the product (read only).
 *
 * The column count must appear in the parameter type (double x[][32]):
 * C allows only the first array dimension to be omitted, and the
 * compiler needs the row length to compute the address of x[i][j].
 * The loops use "!=" so they line up with the bne instructions in the
 * compiled MIPS code.
 */
void mm(double x[][32], double y[][32], double z[][32]){
	int i, j, k;
	for (i = 0; i != 32; i = i + 1)               /* row of x and y */
		for (j = 0; j != 32; j = j + 1)           /* column of x and z */
			for (k = 0; k != 32; k = k + 1)       /* index of the dot product */
				x[i][j] = x[i][j] + y[i][k] * z[k][j];
}

Compiled MIPS code

		# Loop nest for x[i][j] = x[i][j] + y[i][k] * z[k][j].
		# Register use, as described by the comments below:
		#   $a0, $a1, $a2 = base addresses of x, y, z
		#   $s0, $s1, $s2 = i, j, k        $t1 = loop bound (32)
		#   $t2 = address of x[i][j]       $f4 = running value of x[i][j]
		li    $t1,  32         # $t1 = 32 (loop bound); li = load immediate
		li    $s0,  0          # i = 0
L1: li    $s1,  0          # j = 0 (reset each time the i loop iterates)
L2: li    $s2,  0          # k = 0 (reset each time the j loop iterates)
		sll   $t2,  $s0,  5    # $t2 = i * 32 (size of row of x)
		addu  $t2,  $t2,  $s1  # $t2 = i * size(row) + j
		sll   $t2,  $t2,  3    # $t2 = byte offset of [i][j]
		addu  $t2,  $a0,  $t2  # $t2 = byte address of x[i][j]
		l.d   $f4,  0($t2)     # $f4 = 8 bytes of x[i][j] (stored across both $f4 and $f5)
L3: sll   $t0,  $s2,  5    # $t0 = k * 32 (size of row of z)
		addu  $t0,  $t0,  $s1  # $t0 = k * size(row) + j
		sll   $t0,  $t0,  3    # $t0 = byte offset of [k][j]
		addu  $t0,  $a2,  $t0  # $t0 = byte address of z[k][j]
		l.d   $f16, 0($t0)     # $f16 = 8 bytes of z[k][j]
		sll   $t0,  $s0,  5    # $t0 = i * 32 (size of row of y)
		addu  $t0,  $t0,  $s2  # $t0 = i * size(row) + k
		sll   $t0,  $t0,  3    # $t0 = byte offset of [i][k]
		addu  $t0,  $a1,  $t0  # $t0 = byte address of y[i][k]
		l.d   $f18, 0($t0)     # $f18 = 8 bytes of y[i][k]
		# Key step: the floating-point multiply and accumulate.
		mul.d $f16, $f18, $f16 # $f16 = y[i][k] * z[k][j]
		add.d $f4,  $f4,  $f16 # $f4 = x[i][j] + y[i][k] * z[k][j]
		addiu $s2,  $s2,  1    # k = k + 1
		bne   $s2,  $t1,  L3   # if (k != 32) go to L3
		# Key step: x[i][j] is written back once, after the k loop finishes.
		s.d   $f4,  0($t2)     # x[i][j] = $f4
		addiu $s1,  $s1,  1    # j = j + 1
		bne   $s1,  $t1,  L2   # if (j != 32) go to L2
		addiu $s0,  $s0,  1    # i = i + 1
		bne   $s0,  $t1,  L1   # if (i != 32) go to L1

5.20. Accurate Arithmetic

6. Parallelism and Computer Arithmetic : Subword Parallelism

6.1. Subword Parallelism

7. Real Stuff : Streaming SIMD Extensions and AVX in x86

7.1. Streaming SIMD Extensions 2 (SSE2)

8. Going Faster : Subword Parallelism and Matrix Multiply