From f01f2d05f396dd4b82693d8ebaff50fc4ef8d410 Mon Sep 17 00:00:00 2001 From: adstraw Date: Mon, 23 Aug 2021 16:45:26 -0700 Subject: [PATCH 1/8] refactor optimize GEMM on CPU tutorial --- tutorials/optimize/opt_gemm.py | 116 ++++++++++++++++++--------------- 1 file changed, 63 insertions(+), 53 deletions(-) diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index 7af772784cd6..d3f80e4b531d 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -101,7 +101,7 @@ k = te.reduce_axis((0, K), "k") A = te.placeholder((M, K), name="A") B = te.placeholder((K, N), name="B") -C = te.compute((M, N), lambda x, y: te.sum(A[x, k] * B[k, y], axis=k), name="C") +C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C") # Default schedule s = te.create_schedule(C.op) @@ -130,15 +130,16 @@ # fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB (L1 data cache) bn = 32 +kfactor = 4 s = te.create_schedule(C.op) # Blocking by loop tiling -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) +mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) (k,) = s[C].op.reduce_axis -ko, ki = s[C].split(k, factor=4) +ko, ki = s[C].split(k, factor=kfactor) # Hoist reduction domain outside the blocking loop -s[C].reorder(xo, yo, ko, ki, xi, yi) +s[C].reorder(mo, no, ko, ki, mi, ni) func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func @@ -167,14 +168,14 @@ # In this tutorial, we chose to vectorize the inner loop row data since it is cache friendly. s = te.create_schedule(C.op) -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) +mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) (k,) = s[C].op.reduce_axis -ko, ki = s[C].split(k, factor=4) +ko, ki = s[C].split(k, factor=kfactor) -s[C].reorder(xo, yo, ko, ki, xi, yi) +s[C].reorder(mo, no, ko, ki, mi, ni) # Vectorization -s[C].vectorize(yi) +s[C].vectorize(ni) func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func @@ -194,20 +195,19 @@ ################################################################################################### # Loop Permutation # ---------------- -# If we look at the above IR, we can see the inner loop row data is vectorized and -# B is transformed into PackedB. The traversal of PackedB is sequential now. -# So we will look at the access pattern of A. In current schedule, A is accessed column by column -# which is not cache friendly. If we change the nested loop order of ki and inner axes xi, +# If we look at the above IR, we can see the inner loop row data is vectorized for both B and C. +# Next we will look at the access pattern of A. In current schedule, A is accessed column by column +# which is not cache friendly. If we change the nested loop order of ki and inner axes mi, # the access pattern for A matrix is more cache friendly. s = te.create_schedule(C.op) -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) +mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) (k,) = s[C].op.reduce_axis -ko, ki = s[C].split(k, factor=4) +ko, ki = s[C].split(k, factor=kfactor) # re-ordering -s[C].reorder(xo, yo, ko, xi, ki, yi) -s[C].vectorize(yi) +s[C].reorder(mo, no, ko, mi, ki, ni) +s[C].vectorize(ni) func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func @@ -227,43 +227,47 @@ ################################################################################################### # Array Packing # ------------- -# Another important trick is array packing. This trick is to reorder the storage dimension of the -# array to convert the continuous access pattern on certain dimension to a sequential pattern after +# Another important trick is array packing. This trick is to reorder the storage dimension of an +# array to convert the continuous access pattern on its dimensions to a sequential pattern after # flattening. # # .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/array-packing.png # :align: center # +# NOTE: The figure above is meant for illustration purposes only. Please ignore dimension +# information ([16][16] and [16/4][16][4]) which does not apply to this tutorial. ################################################################################################### -# Just as it is shown in the figure above, after blocking the computations, we can observe the array -# access pattern of B (after flattening), which is regular but discontinuous. We expect that after -# some transformation we can get continuous access pattern. We can reorder a [16][16] array to -# a [16/4][16][4] array, so that the access pattern of B will be sequential when grabing -# the corresponding value from the packed array. -# +# We can use array packing to address the access pattern for B. Observe the array access pattern of +# B after flattening which is not sequential as we iterate over the K dimension. We can reorder B +# with dimensions [K][N] so that it has dimensions [N/bn][K][bn] where bn is the blocking factor and +# also the vector size for both B in the inner loop. This reorder splits N into two dimensions --- +# bigN (N/bn) and littleN (bn) --- and the new dimensions [N/bn][K][bn] match the indexing of B +# from outer to inner loops (no, ko, ki, ni) resulting in a sequentail access pattern for B after +# flattening. + # We have to re-write the algorithm slightly. -packedB = te.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name="packedB") +packedB = te.compute((N / bn, K, bn), lambda bigN, k, littleN: B[k, bigN * bn + littleN], name="packedB") C = te.compute( (M, N), - lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k), + lambda m, n: te.sum(A[m, k] * packedB[n // bn, k, tvm.tir.indexmod(n, bn)], axis=k), name="C", ) s = te.create_schedule(C.op) -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) +mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) (k,) = s[C].op.reduce_axis -ko, ki = s[C].split(k, factor=4) +ko, ki = s[C].split(k, factor=kfactor) -s[C].reorder(xo, yo, ko, xi, ki, yi) -s[C].vectorize(yi) +s[C].reorder(mo, no, ko, mi, ki, ni) +s[C].vectorize(ni) -x, y, z = s[packedB].op.axis -s[packedB].vectorize(z) -s[packedB].parallel(x) +bigN, k, littleN = s[packedB].op.axis +s[packedB].vectorize(littleN) +s[packedB].parallel(bigN) func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func @@ -293,23 +297,26 @@ # Allocate write cache CC = s.cache_write(C, "global") -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) +mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -# Write cache is computed at yo -s[CC].compute_at(s[C], yo) +# Write cache is computed at no +s[CC].compute_at(s[C], no) # New inner axes -xc, yc = s[CC].op.axis +mc, nc = s[CC].op.axis (k,) = s[CC].op.reduce_axis -ko, ki = s[CC].split(k, factor=4) -s[CC].reorder(ko, xc, ki, yc) +ko, ki = s[CC].split(k, factor=kfactor) +s[CC].reorder(ko, mc, ki, nc) +s[CC].vectorize(nc) + +# unroll kfactor loops +# this is a separate optimization not discussed in this tutorial s[CC].unroll(ki) -s[CC].vectorize(yc) -x, y, z = s[packedB].op.axis -s[packedB].vectorize(z) -s[packedB].parallel(x) +bigN, k, littleN = s[packedB].op.axis +s[packedB].vectorize(littleN) +s[packedB].parallel(bigN) func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func @@ -335,24 +342,27 @@ CC = s.cache_write(C, "global") -xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) +mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -s[CC].compute_at(s[C], yo) +s[CC].compute_at(s[C], no) -xc, yc = s[CC].op.axis +mc, nc = s[CC].op.axis (k,) = s[CC].op.reduce_axis -ko, ki = s[CC].split(k, factor=4) -s[CC].reorder(ko, xc, ki, yc) +ko, ki = s[CC].split(k, factor=kfactor) +s[CC].reorder(ko, mc, ki, nc) +s[CC].vectorize(nc) + +# unroll kfactor loops +# this is a separate optimization not discussed in this tutorial s[CC].unroll(ki) -s[CC].vectorize(yc) # parallel -s[C].parallel(xo) +s[C].parallel(mo) -x, y, z = s[packedB].op.axis -s[packedB].vectorize(z) -s[packedB].parallel(x) +bigN, k, littleN = s[packedB].op.axis +s[packedB].vectorize(littleN) +s[packedB].parallel(bigN) func = tvm.build(s, [A, B, C], target=target, name="mmult") assert func From 9961790c1584ad054f8e5d3e62718016305eed01 Mon Sep 17 00:00:00 2001 From: adstraw Date: Mon, 23 Aug 2021 17:27:40 -0700 Subject: [PATCH 2/8] fix lint errors --- tutorials/optimize/opt_gemm.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index d3f80e4b531d..6ec6c513f1eb 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -163,7 +163,8 @@ # ------------- # Another important trick is vectorization. When the memory access pattern is uniform, # the compiler can detect this pattern and pass the continuous memory to vector processor. In TVM, -# we can use `vectorize` interface to hint the compiler this pattern, so that we can accelerate it vastly. +# we can use `vectorize` interface to hint the compiler this pattern, so that we can accelerate it +# vastly. # # In this tutorial, we chose to vectorize the inner loop row data since it is cache friendly. @@ -234,22 +235,23 @@ # .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/array-packing.png # :align: center # -# NOTE: The figure above is meant for illustration purposes only. Please ignore dimension +# NOTE: The figure above is meant for illustration purposes only. Please ignore dimension # information ([16][16] and [16/4][16][4]) which does not apply to this tutorial. ################################################################################################### -# We can use array packing to address the access pattern for B. Observe the array access pattern of -# B after flattening which is not sequential as we iterate over the K dimension. We can reorder B -# with dimensions [K][N] so that it has dimensions [N/bn][K][bn] where bn is the blocking factor and -# also the vector size for both B in the inner loop. This reorder splits N into two dimensions --- -# bigN (N/bn) and littleN (bn) --- and the new dimensions [N/bn][K][bn] match the indexing of B -# from outer to inner loops (no, ko, ki, ni) resulting in a sequentail access pattern for B after +# We can use array packing to address the access pattern for B. Observe the array access pattern of +# B after flattening which is not sequential as we iterate over the K dimension. We can reorder B +# with dimensions [K][N] so that it has dimensions [N/bn][K][bn] where bn is the blocking factor and +# also the vector size for both B in the inner loop. This reorder splits N into two dimensions --- +# bigN (N/bn) and littleN (bn) --- and the new dimensions [N/bn][K][bn] match the indexing of B +# from outer to inner loops (no, ko, ki, ni) resulting in a sequentail access pattern for B after # flattening. # We have to re-write the algorithm slightly. -packedB = te.compute((N / bn, K, bn), lambda bigN, k, littleN: B[k, bigN * bn + littleN], name="packedB") +packedB = te.compute((N / bn, K, bn), + lambda bigN, k, littleN: B[k, bigN * bn + littleN], name="packedB") C = te.compute( (M, N), lambda m, n: te.sum(A[m, k] * packedB[n // bn, k, tvm.tir.indexmod(n, bn)], axis=k), From c7e3b65877a8bec347248a29f58c0543f82b54db Mon Sep 17 00:00:00 2001 From: adstraw Date: Mon, 23 Aug 2021 17:43:31 -0700 Subject: [PATCH 3/8] fix more lint errors --- tutorials/optimize/opt_gemm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index 6ec6c513f1eb..2e24df73e5e1 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -250,8 +250,9 @@ # We have to re-write the algorithm slightly. -packedB = te.compute((N / bn, K, bn), - lambda bigN, k, littleN: B[k, bigN * bn + littleN], name="packedB") +packedB = te.compute( + (N / bn, K, bn), lambda bigN, k, littleN: B[k, bigN * bn + littleN], name="packedB" +) C = te.compute( (M, N), lambda m, n: te.sum(A[m, k] * packedB[n // bn, k, tvm.tir.indexmod(n, bn)], axis=k), From cbc50593e5cc6ae480d3a768c2719599a91e7538 Mon Sep 17 00:00:00 2001 From: adstraw Date: Tue, 24 Aug 2021 12:15:06 -0700 Subject: [PATCH 4/8] fix typo --- tutorials/optimize/opt_gemm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index 2e24df73e5e1..1fc3bd178d66 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -245,7 +245,7 @@ # with dimensions [K][N] so that it has dimensions [N/bn][K][bn] where bn is the blocking factor and # also the vector size for both B in the inner loop. This reorder splits N into two dimensions --- # bigN (N/bn) and littleN (bn) --- and the new dimensions [N/bn][K][bn] match the indexing of B -# from outer to inner loops (no, ko, ki, ni) resulting in a sequentail access pattern for B after +# from outer to inner loops (no, ko, ki, ni) resulting in a sequential access pattern for B after # flattening. From 3760ce7ab0bfbe2219cfbfdbb28250dfa6ae4695 Mon Sep 17 00:00:00 2001 From: adstraw Date: Thu, 26 Aug 2021 08:49:22 -0700 Subject: [PATCH 5/8] fix problem with redefinition of `k` add TODO and comments around loop unrolling clarify note on the array packing figure --- tutorials/optimize/opt_gemm.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index 1fc3bd178d66..b58c3c915f7b 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -135,7 +135,6 @@ # Blocking by loop tiling mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -(k,) = s[C].op.reduce_axis ko, ki = s[C].split(k, factor=kfactor) # Hoist reduction domain outside the blocking loop @@ -170,7 +169,6 @@ s = te.create_schedule(C.op) mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -(k,) = s[C].op.reduce_axis ko, ki = s[C].split(k, factor=kfactor) s[C].reorder(mo, no, ko, ki, mi, ni) @@ -203,7 +201,6 @@ s = te.create_schedule(C.op) mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -(k,) = s[C].op.reduce_axis ko, ki = s[C].split(k, factor=kfactor) # re-ordering @@ -235,8 +232,7 @@ # .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/array-packing.png # :align: center # -# NOTE: The figure above is meant for illustration purposes only. Please ignore dimension -# information ([16][16] and [16/4][16][4]) which does not apply to this tutorial. +# NOTE: This figure is a general illustration of how array packing works. ################################################################################################### @@ -262,13 +258,12 @@ s = te.create_schedule(C.op) mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -(k,) = s[C].op.reduce_axis ko, ki = s[C].split(k, factor=kfactor) s[C].reorder(mo, no, ko, mi, ki, ni) s[C].vectorize(ni) -bigN, k, littleN = s[packedB].op.axis +bigN, _, littleN = s[packedB].op.axis s[packedB].vectorize(littleN) s[packedB].parallel(bigN) @@ -308,16 +303,17 @@ # New inner axes mc, nc = s[CC].op.axis -(k,) = s[CC].op.reduce_axis ko, ki = s[CC].split(k, factor=kfactor) s[CC].reorder(ko, mc, ki, nc) s[CC].vectorize(nc) +# TODO: Add separate optimization step to discuss loop unrolloing +# unrolling is a loop optimization strategy which can reduce branch +# prediction failures and increases the chance of concurrent execution # unroll kfactor loops -# this is a separate optimization not discussed in this tutorial s[CC].unroll(ki) -bigN, k, littleN = s[packedB].op.axis +bigN, _, littleN = s[packedB].op.axis s[packedB].vectorize(littleN) s[packedB].parallel(bigN) @@ -351,7 +347,6 @@ mc, nc = s[CC].op.axis -(k,) = s[CC].op.reduce_axis ko, ki = s[CC].split(k, factor=kfactor) s[CC].reorder(ko, mc, ki, nc) s[CC].vectorize(nc) @@ -363,7 +358,7 @@ # parallel s[C].parallel(mo) -bigN, k, littleN = s[packedB].op.axis +bigN, _, littleN = s[packedB].op.axis s[packedB].vectorize(littleN) s[packedB].parallel(bigN) From 86f72d81754f002c96896a1f6f70196ede820b6e Mon Sep 17 00:00:00 2001 From: adstraw Date: Thu, 26 Aug 2021 14:21:01 -0700 Subject: [PATCH 6/8] reword general description of array packing --- tutorials/optimize/opt_gemm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index b58c3c915f7b..94856a6af1b5 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -225,9 +225,9 @@ ################################################################################################### # Array Packing # ------------- -# Another important trick is array packing. This trick is to reorder the storage dimension of an -# array to convert the continuous access pattern on its dimensions to a sequential pattern after -# flattening. +# Another important trick is array packing. The trick is to reorder the storage of a multi- +# dimensional array so that it is accessed sequentially after it is flattened and stored in one- +# dimensional memory. # # .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/array-packing.png # :align: center @@ -239,7 +239,7 @@ # We can use array packing to address the access pattern for B. Observe the array access pattern of # B after flattening which is not sequential as we iterate over the K dimension. We can reorder B # with dimensions [K][N] so that it has dimensions [N/bn][K][bn] where bn is the blocking factor and -# also the vector size for both B in the inner loop. This reorder splits N into two dimensions --- +# also the vector size for B in the inner loop. This reorder splits N into two dimensions --- # bigN (N/bn) and littleN (bn) --- and the new dimensions [N/bn][K][bn] match the indexing of B # from outer to inner loops (no, ko, ki, ni) resulting in a sequential access pattern for B after # flattening. From d0b680b6204eba96276e7aca5165a97d6f10ef7c Mon Sep 17 00:00:00 2001 From: adstraw Date: Fri, 27 Aug 2021 13:34:01 -0700 Subject: [PATCH 7/8] grap kaxis from compute definition --- tutorials/optimize/opt_gemm.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index 94856a6af1b5..93061f972346 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -135,7 +135,8 @@ # Blocking by loop tiling mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -ko, ki = s[C].split(k, factor=kfactor) +(kaxis,) = s[C].op.reduce_axis +ko, ki = s[C].split(kaxis, factor=kfactor) # Hoist reduction domain outside the blocking loop s[C].reorder(mo, no, ko, ki, mi, ni) @@ -169,7 +170,8 @@ s = te.create_schedule(C.op) mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -ko, ki = s[C].split(k, factor=kfactor) +(kaxis,) = s[C].op.reduce_axis +ko, ki = s[C].split(kaxis, factor=kfactor) s[C].reorder(mo, no, ko, ki, mi, ni) @@ -201,7 +203,8 @@ s = te.create_schedule(C.op) mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -ko, ki = s[C].split(k, factor=kfactor) +(kaxis,) = s[C].op.reduce_axis +ko, ki = s[C].split(kaxis, factor=kfactor) # re-ordering s[C].reorder(mo, no, ko, mi, ki, ni) @@ -258,7 +261,8 @@ s = te.create_schedule(C.op) mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn) -ko, ki = s[C].split(k, factor=kfactor) +(kaxis,) = s[C].op.reduce_axis +ko, ki = s[C].split(kaxis, factor=kfactor) s[C].reorder(mo, no, ko, mi, ki, ni) s[C].vectorize(ni) @@ -303,7 +307,8 @@ # New inner axes mc, nc = s[CC].op.axis -ko, ki = s[CC].split(k, factor=kfactor) +(kaxis,) = s[CC].op.reduce_axis +ko, ki = s[CC].split(kaxis, factor=kfactor) s[CC].reorder(ko, mc, ki, nc) s[CC].vectorize(nc) @@ -347,7 +352,8 @@ mc, nc = s[CC].op.axis -ko, ki = s[CC].split(k, factor=kfactor) +(kaxis,) = s[CC].op.reduce_axis +ko, ki = s[CC].split(kaxis, factor=kfactor) s[CC].reorder(ko, mc, ki, nc) s[CC].vectorize(nc) From fa5ba7de4196a445afe605ce884599a00dc0513e Mon Sep 17 00:00:00 2001 From: adstraw Date: Fri, 27 Aug 2021 13:37:10 -0700 Subject: [PATCH 8/8] remove duplicate comments on unrolling --- tutorials/optimize/opt_gemm.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py index 93061f972346..5d698c612ee8 100644 --- a/tutorials/optimize/opt_gemm.py +++ b/tutorials/optimize/opt_gemm.py @@ -356,9 +356,6 @@ ko, ki = s[CC].split(kaxis, factor=kfactor) s[CC].reorder(ko, mc, ki, nc) s[CC].vectorize(nc) - -# unroll kfactor loops -# this is a separate optimization not discussed in this tutorial s[CC].unroll(ki) # parallel