From f01f2d05f396dd4b82693d8ebaff50fc4ef8d410 Mon Sep 17 00:00:00 2001
From: adstraw <adam.straw@gmail.com>
Date: Mon, 23 Aug 2021 16:45:26 -0700
Subject: [PATCH 1/8] refactor optimize GEMM on CPU tutorial

---
 tutorials/optimize/opt_gemm.py | 116 ++++++++++++++++++---------------
 1 file changed, 63 insertions(+), 53 deletions(-)

diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py
index 7af772784cd6..d3f80e4b531d 100644
--- a/tutorials/optimize/opt_gemm.py
+++ b/tutorials/optimize/opt_gemm.py
@@ -101,7 +101,7 @@
 k = te.reduce_axis((0, K), "k")
 A = te.placeholder((M, K), name="A")
 B = te.placeholder((K, N), name="B")
-C = te.compute((M, N), lambda x, y: te.sum(A[x, k] * B[k, y], axis=k), name="C")
+C = te.compute((M, N), lambda m, n: te.sum(A[m, k] * B[k, n], axis=k), name="C")
 
 # Default schedule
 s = te.create_schedule(C.op)
@@ -130,15 +130,16 @@
 # fill 32 * 32 * sizeof(float) which is 4KB in the cache whose total size is 32KB (L1 data cache)
 
 bn = 32
+kfactor = 4
 s = te.create_schedule(C.op)
 
 # Blocking by loop tiling
-xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
+mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
 (k,) = s[C].op.reduce_axis
-ko, ki = s[C].split(k, factor=4)
+ko, ki = s[C].split(k, factor=kfactor)
 
 # Hoist reduction domain outside the blocking loop
-s[C].reorder(xo, yo, ko, ki, xi, yi)
+s[C].reorder(mo, no, ko, ki, mi, ni)
 
 func = tvm.build(s, [A, B, C], target=target, name="mmult")
 assert func
@@ -167,14 +168,14 @@
 # In this tutorial, we chose to vectorize the inner loop row data since it is cache friendly.
 
 s = te.create_schedule(C.op)
-xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
+mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
 (k,) = s[C].op.reduce_axis
-ko, ki = s[C].split(k, factor=4)
+ko, ki = s[C].split(k, factor=kfactor)
 
-s[C].reorder(xo, yo, ko, ki, xi, yi)
+s[C].reorder(mo, no, ko, ki, mi, ni)
 
 # Vectorization
-s[C].vectorize(yi)
+s[C].vectorize(ni)
 
 func = tvm.build(s, [A, B, C], target=target, name="mmult")
 assert func
@@ -194,20 +195,19 @@
 ###################################################################################################
 # Loop Permutation
 # ----------------
-# If we look at the above IR, we can see the inner loop row data is vectorized and
-# B is transformed into PackedB. The traversal of PackedB is sequential now.
-# So we will look at the access pattern of A. In current schedule, A is accessed column by column
-# which is not cache friendly. If we change the nested loop order of ki and inner axes xi,
+# If we look at the above IR, we can see the inner loop row data is vectorized for both B and C.
+# Next we will look at the access pattern of A. In current schedule, A is accessed column by column
+# which is not cache friendly. If we change the nested loop order of ki and inner axes mi,
 # the access pattern for A matrix is more cache friendly.
 
 s = te.create_schedule(C.op)
-xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
+mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
 (k,) = s[C].op.reduce_axis
-ko, ki = s[C].split(k, factor=4)
+ko, ki = s[C].split(k, factor=kfactor)
 
 # re-ordering
-s[C].reorder(xo, yo, ko, xi, ki, yi)
-s[C].vectorize(yi)
+s[C].reorder(mo, no, ko, mi, ki, ni)
+s[C].vectorize(ni)
 
 func = tvm.build(s, [A, B, C], target=target, name="mmult")
 assert func
@@ -227,43 +227,47 @@
 ###################################################################################################
 # Array Packing
 # -------------
-# Another important trick is array packing. This trick is to reorder the storage dimension of the
-# array to convert the continuous access pattern on certain dimension to a sequential pattern after
+# Another important trick is array packing. This trick is to reorder the storage dimension of an
+# array to convert the continuous access pattern on its dimensions to a sequential pattern after
 # flattening.
 #
 # .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/array-packing.png
 #      :align: center
 #
+# NOTE: The figure above is meant for illustration purposes only.  Please ignore dimension 
+# information ([16][16] and [16/4][16][4]) which does not apply to this tutorial.
 
 
 ###################################################################################################
-# Just as it is shown in the figure above, after blocking the computations, we can observe the array
-# access pattern of B (after flattening), which is regular but discontinuous. We expect that after
-# some transformation we can get continuous access pattern. We can reorder a [16][16] array to
-# a [16/4][16][4] array, so that the access pattern of B will be sequential when grabing
-# the corresponding value from the packed array.
-#
+# We can use array packing to address the access pattern for B. Observe the array access pattern of 
+# B after flattening which is not sequential as we iterate over the K dimension. We can reorder B 
+# with dimensions [K][N] so that it has dimensions [N/bn][K][bn] where bn is the blocking factor and 
+# also the vector size for both B in the inner loop.  This reorder splits N into two dimensions --- 
+# bigN (N/bn) and littleN (bn) --- and the new dimensions [N/bn][K][bn] match the indexing of B 
+# from outer to inner loops (no, ko, ki, ni) resulting in a sequentail access pattern for B after 
+# flattening.
+
 
 # We have to re-write the algorithm slightly.
-packedB = te.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name="packedB")
+packedB = te.compute((N / bn, K, bn), lambda bigN, k, littleN: B[k, bigN * bn + littleN], name="packedB")
 C = te.compute(
     (M, N),
-    lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k),
+    lambda m, n: te.sum(A[m, k] * packedB[n // bn, k, tvm.tir.indexmod(n, bn)], axis=k),
     name="C",
 )
 
 s = te.create_schedule(C.op)
 
-xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
+mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
 (k,) = s[C].op.reduce_axis
-ko, ki = s[C].split(k, factor=4)
+ko, ki = s[C].split(k, factor=kfactor)
 
-s[C].reorder(xo, yo, ko, xi, ki, yi)
-s[C].vectorize(yi)
+s[C].reorder(mo, no, ko, mi, ki, ni)
+s[C].vectorize(ni)
 
-x, y, z = s[packedB].op.axis
-s[packedB].vectorize(z)
-s[packedB].parallel(x)
+bigN, k, littleN = s[packedB].op.axis
+s[packedB].vectorize(littleN)
+s[packedB].parallel(bigN)
 
 func = tvm.build(s, [A, B, C], target=target, name="mmult")
 assert func
@@ -293,23 +297,26 @@
 # Allocate write cache
 CC = s.cache_write(C, "global")
 
-xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
+mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
 
-# Write cache is computed at yo
-s[CC].compute_at(s[C], yo)
+# Write cache is computed at no
+s[CC].compute_at(s[C], no)
 
 # New inner axes
-xc, yc = s[CC].op.axis
+mc, nc = s[CC].op.axis
 
 (k,) = s[CC].op.reduce_axis
-ko, ki = s[CC].split(k, factor=4)
-s[CC].reorder(ko, xc, ki, yc)
+ko, ki = s[CC].split(k, factor=kfactor)
+s[CC].reorder(ko, mc, ki, nc)
+s[CC].vectorize(nc)
+
+# unroll kfactor loops
+# this is a separate optimization not discussed in this tutorial
 s[CC].unroll(ki)
-s[CC].vectorize(yc)
 
-x, y, z = s[packedB].op.axis
-s[packedB].vectorize(z)
-s[packedB].parallel(x)
+bigN, k, littleN = s[packedB].op.axis
+s[packedB].vectorize(littleN)
+s[packedB].parallel(bigN)
 
 func = tvm.build(s, [A, B, C], target=target, name="mmult")
 assert func
@@ -335,24 +342,27 @@
 
 CC = s.cache_write(C, "global")
 
-xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
+mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
 
-s[CC].compute_at(s[C], yo)
+s[CC].compute_at(s[C], no)
 
-xc, yc = s[CC].op.axis
+mc, nc = s[CC].op.axis
 
 (k,) = s[CC].op.reduce_axis
-ko, ki = s[CC].split(k, factor=4)
-s[CC].reorder(ko, xc, ki, yc)
+ko, ki = s[CC].split(k, factor=kfactor)
+s[CC].reorder(ko, mc, ki, nc)
+s[CC].vectorize(nc)
+
+# unroll kfactor loops
+# this is a separate optimization not discussed in this tutorial
 s[CC].unroll(ki)
-s[CC].vectorize(yc)
 
 # parallel
-s[C].parallel(xo)
+s[C].parallel(mo)
 
-x, y, z = s[packedB].op.axis
-s[packedB].vectorize(z)
-s[packedB].parallel(x)
+bigN, k, littleN = s[packedB].op.axis
+s[packedB].vectorize(littleN)
+s[packedB].parallel(bigN)
 
 func = tvm.build(s, [A, B, C], target=target, name="mmult")
 assert func

From 9961790c1584ad054f8e5d3e62718016305eed01 Mon Sep 17 00:00:00 2001
From: adstraw <adam.straw@gmail.com>
Date: Mon, 23 Aug 2021 17:27:40 -0700
Subject: [PATCH 2/8] fix lint errors

---
 tutorials/optimize/opt_gemm.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py
index d3f80e4b531d..6ec6c513f1eb 100644
--- a/tutorials/optimize/opt_gemm.py
+++ b/tutorials/optimize/opt_gemm.py
@@ -163,7 +163,8 @@
 # -------------
 # Another important trick is vectorization. When the memory access pattern is uniform,
 # the compiler can detect this pattern and pass the continuous memory to vector processor. In TVM,
-# we can use `vectorize` interface to hint the compiler this pattern, so that we can accelerate it vastly.
+# we can use `vectorize` interface to hint the compiler this pattern, so that we can accelerate it
+# vastly.
 #
 # In this tutorial, we chose to vectorize the inner loop row data since it is cache friendly.
 
@@ -234,22 +235,23 @@
 # .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/array-packing.png
 #      :align: center
 #
-# NOTE: The figure above is meant for illustration purposes only.  Please ignore dimension 
+# NOTE: The figure above is meant for illustration purposes only.  Please ignore dimension
 # information ([16][16] and [16/4][16][4]) which does not apply to this tutorial.
 
 
 ###################################################################################################
-# We can use array packing to address the access pattern for B. Observe the array access pattern of 
-# B after flattening which is not sequential as we iterate over the K dimension. We can reorder B 
-# with dimensions [K][N] so that it has dimensions [N/bn][K][bn] where bn is the blocking factor and 
-# also the vector size for both B in the inner loop.  This reorder splits N into two dimensions --- 
-# bigN (N/bn) and littleN (bn) --- and the new dimensions [N/bn][K][bn] match the indexing of B 
-# from outer to inner loops (no, ko, ki, ni) resulting in a sequentail access pattern for B after 
+# We can use array packing to address the access pattern for B. Observe the array access pattern of
+# B after flattening which is not sequential as we iterate over the K dimension. We can reorder B
+# with dimensions [K][N] so that it has dimensions [N/bn][K][bn] where bn is the blocking factor and
+# also the vector size for both B in the inner loop.  This reorder splits N into two dimensions ---
+# bigN (N/bn) and littleN (bn) --- and the new dimensions [N/bn][K][bn] match the indexing of B
+# from outer to inner loops (no, ko, ki, ni) resulting in a sequentail access pattern for B after
 # flattening.
 
 
 # We have to re-write the algorithm slightly.
-packedB = te.compute((N / bn, K, bn), lambda bigN, k, littleN: B[k, bigN * bn + littleN], name="packedB")
+packedB = te.compute((N / bn, K, bn),
+                     lambda bigN, k, littleN: B[k, bigN * bn + littleN], name="packedB")
 C = te.compute(
     (M, N),
     lambda m, n: te.sum(A[m, k] * packedB[n // bn, k, tvm.tir.indexmod(n, bn)], axis=k),

From c7e3b65877a8bec347248a29f58c0543f82b54db Mon Sep 17 00:00:00 2001
From: adstraw <adam.straw@gmail.com>
Date: Mon, 23 Aug 2021 17:43:31 -0700
Subject: [PATCH 3/8] fix more lint errors

---
 tutorials/optimize/opt_gemm.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py
index 6ec6c513f1eb..2e24df73e5e1 100644
--- a/tutorials/optimize/opt_gemm.py
+++ b/tutorials/optimize/opt_gemm.py
@@ -250,8 +250,9 @@
 
 
 # We have to re-write the algorithm slightly.
-packedB = te.compute((N / bn, K, bn),
-                     lambda bigN, k, littleN: B[k, bigN * bn + littleN], name="packedB")
+packedB = te.compute(
+    (N / bn, K, bn), lambda bigN, k, littleN: B[k, bigN * bn + littleN], name="packedB"
+)
 C = te.compute(
     (M, N),
     lambda m, n: te.sum(A[m, k] * packedB[n // bn, k, tvm.tir.indexmod(n, bn)], axis=k),

From cbc50593e5cc6ae480d3a768c2719599a91e7538 Mon Sep 17 00:00:00 2001
From: adstraw <adam.straw@gmail.com>
Date: Tue, 24 Aug 2021 12:15:06 -0700
Subject: [PATCH 4/8] fix typo

---
 tutorials/optimize/opt_gemm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py
index 2e24df73e5e1..1fc3bd178d66 100644
--- a/tutorials/optimize/opt_gemm.py
+++ b/tutorials/optimize/opt_gemm.py
@@ -245,7 +245,7 @@
 # with dimensions [K][N] so that it has dimensions [N/bn][K][bn] where bn is the blocking factor and
 # also the vector size for both B in the inner loop.  This reorder splits N into two dimensions ---
 # bigN (N/bn) and littleN (bn) --- and the new dimensions [N/bn][K][bn] match the indexing of B
-# from outer to inner loops (no, ko, ki, ni) resulting in a sequentail access pattern for B after
+# from outer to inner loops (no, ko, ki, ni) resulting in a sequential access pattern for B after
 # flattening.
 
 

From 3760ce7ab0bfbe2219cfbfdbb28250dfa6ae4695 Mon Sep 17 00:00:00 2001
From: adstraw <adam.straw@gmail.com>
Date: Thu, 26 Aug 2021 08:49:22 -0700
Subject: [PATCH 5/8] fix problem with redefinition of `k` add TODO and
 comments around loop unrolling clarify note on the array packing figure

---
 tutorials/optimize/opt_gemm.py | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py
index 1fc3bd178d66..b58c3c915f7b 100644
--- a/tutorials/optimize/opt_gemm.py
+++ b/tutorials/optimize/opt_gemm.py
@@ -135,7 +135,6 @@
 
 # Blocking by loop tiling
 mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-(k,) = s[C].op.reduce_axis
 ko, ki = s[C].split(k, factor=kfactor)
 
 # Hoist reduction domain outside the blocking loop
@@ -170,7 +169,6 @@
 
 s = te.create_schedule(C.op)
 mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-(k,) = s[C].op.reduce_axis
 ko, ki = s[C].split(k, factor=kfactor)
 
 s[C].reorder(mo, no, ko, ki, mi, ni)
@@ -203,7 +201,6 @@
 
 s = te.create_schedule(C.op)
 mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-(k,) = s[C].op.reduce_axis
 ko, ki = s[C].split(k, factor=kfactor)
 
 # re-ordering
@@ -235,8 +232,7 @@
 # .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/array-packing.png
 #      :align: center
 #
-# NOTE: The figure above is meant for illustration purposes only.  Please ignore dimension
-# information ([16][16] and [16/4][16][4]) which does not apply to this tutorial.
+# NOTE: This figure is a general illustration of how array packing works.
 
 
 ###################################################################################################
@@ -262,13 +258,12 @@
 s = te.create_schedule(C.op)
 
 mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-(k,) = s[C].op.reduce_axis
 ko, ki = s[C].split(k, factor=kfactor)
 
 s[C].reorder(mo, no, ko, mi, ki, ni)
 s[C].vectorize(ni)
 
-bigN, k, littleN = s[packedB].op.axis
+bigN, _, littleN = s[packedB].op.axis
 s[packedB].vectorize(littleN)
 s[packedB].parallel(bigN)
 
@@ -308,16 +303,17 @@
 # New inner axes
 mc, nc = s[CC].op.axis
 
-(k,) = s[CC].op.reduce_axis
 ko, ki = s[CC].split(k, factor=kfactor)
 s[CC].reorder(ko, mc, ki, nc)
 s[CC].vectorize(nc)
 
+# TODO: Add separate optimization step to discuss loop unrolloing
+# unrolling is a loop optimization strategy which can reduce branch
+# prediction failures and increases the chance of concurrent execution
 # unroll kfactor loops
-# this is a separate optimization not discussed in this tutorial
 s[CC].unroll(ki)
 
-bigN, k, littleN = s[packedB].op.axis
+bigN, _, littleN = s[packedB].op.axis
 s[packedB].vectorize(littleN)
 s[packedB].parallel(bigN)
 
@@ -351,7 +347,6 @@
 
 mc, nc = s[CC].op.axis
 
-(k,) = s[CC].op.reduce_axis
 ko, ki = s[CC].split(k, factor=kfactor)
 s[CC].reorder(ko, mc, ki, nc)
 s[CC].vectorize(nc)
@@ -363,7 +358,7 @@
 # parallel
 s[C].parallel(mo)
 
-bigN, k, littleN = s[packedB].op.axis
+bigN, _, littleN = s[packedB].op.axis
 s[packedB].vectorize(littleN)
 s[packedB].parallel(bigN)
 

From 86f72d81754f002c96896a1f6f70196ede820b6e Mon Sep 17 00:00:00 2001
From: adstraw <adam.straw@gmail.com>
Date: Thu, 26 Aug 2021 14:21:01 -0700
Subject: [PATCH 6/8] reword general description of array packing

---
 tutorials/optimize/opt_gemm.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py
index b58c3c915f7b..94856a6af1b5 100644
--- a/tutorials/optimize/opt_gemm.py
+++ b/tutorials/optimize/opt_gemm.py
@@ -225,9 +225,9 @@
 ###################################################################################################
 # Array Packing
 # -------------
-# Another important trick is array packing. This trick is to reorder the storage dimension of an
-# array to convert the continuous access pattern on its dimensions to a sequential pattern after
-# flattening.
+# Another important trick is array packing. The trick is to reorder the storage of a multi-
+# dimensional array so that it is accessed sequentially after it is flattened and stored in one-
+# dimensional memory.
 #
 # .. image:: https://github.com/dmlc/web-data/raw/main/tvm/tutorial/array-packing.png
 #      :align: center
@@ -239,7 +239,7 @@
 # We can use array packing to address the access pattern for B. Observe the array access pattern of
 # B after flattening which is not sequential as we iterate over the K dimension. We can reorder B
 # with dimensions [K][N] so that it has dimensions [N/bn][K][bn] where bn is the blocking factor and
-# also the vector size for both B in the inner loop.  This reorder splits N into two dimensions ---
+# also the vector size for B in the inner loop.  This reorder splits N into two dimensions ---
 # bigN (N/bn) and littleN (bn) --- and the new dimensions [N/bn][K][bn] match the indexing of B
 # from outer to inner loops (no, ko, ki, ni) resulting in a sequential access pattern for B after
 # flattening.

From d0b680b6204eba96276e7aca5165a97d6f10ef7c Mon Sep 17 00:00:00 2001
From: adstraw <astraw@octoml.ai>
Date: Fri, 27 Aug 2021 13:34:01 -0700
Subject: [PATCH 7/8] grap kaxis from compute definition

---
 tutorials/optimize/opt_gemm.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py
index 94856a6af1b5..93061f972346 100644
--- a/tutorials/optimize/opt_gemm.py
+++ b/tutorials/optimize/opt_gemm.py
@@ -135,7 +135,8 @@
 
 # Blocking by loop tiling
 mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-ko, ki = s[C].split(k, factor=kfactor)
+(kaxis,) = s[C].op.reduce_axis
+ko, ki = s[C].split(kaxis, factor=kfactor)
 
 # Hoist reduction domain outside the blocking loop
 s[C].reorder(mo, no, ko, ki, mi, ni)
@@ -169,7 +170,8 @@
 
 s = te.create_schedule(C.op)
 mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-ko, ki = s[C].split(k, factor=kfactor)
+(kaxis,) = s[C].op.reduce_axis
+ko, ki = s[C].split(kaxis, factor=kfactor)
 
 s[C].reorder(mo, no, ko, ki, mi, ni)
 
@@ -201,7 +203,8 @@
 
 s = te.create_schedule(C.op)
 mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-ko, ki = s[C].split(k, factor=kfactor)
+(kaxis,) = s[C].op.reduce_axis
+ko, ki = s[C].split(kaxis, factor=kfactor)
 
 # re-ordering
 s[C].reorder(mo, no, ko, mi, ki, ni)
@@ -258,7 +261,8 @@
 s = te.create_schedule(C.op)
 
 mo, no, mi, ni = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
-ko, ki = s[C].split(k, factor=kfactor)
+(kaxis,) = s[C].op.reduce_axis
+ko, ki = s[C].split(kaxis, factor=kfactor)
 
 s[C].reorder(mo, no, ko, mi, ki, ni)
 s[C].vectorize(ni)
@@ -303,7 +307,8 @@
 # New inner axes
 mc, nc = s[CC].op.axis
 
-ko, ki = s[CC].split(k, factor=kfactor)
+(kaxis,) = s[CC].op.reduce_axis
+ko, ki = s[CC].split(kaxis, factor=kfactor)
 s[CC].reorder(ko, mc, ki, nc)
 s[CC].vectorize(nc)
 
@@ -347,7 +352,8 @@
 
 mc, nc = s[CC].op.axis
 
-ko, ki = s[CC].split(k, factor=kfactor)
+(kaxis,) = s[CC].op.reduce_axis
+ko, ki = s[CC].split(kaxis, factor=kfactor)
 s[CC].reorder(ko, mc, ki, nc)
 s[CC].vectorize(nc)
 

From fa5ba7de4196a445afe605ce884599a00dc0513e Mon Sep 17 00:00:00 2001
From: adstraw <astraw@octoml.ai>
Date: Fri, 27 Aug 2021 13:37:10 -0700
Subject: [PATCH 8/8] remove duplicate comments on unrolling

---
 tutorials/optimize/opt_gemm.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py
index 93061f972346..5d698c612ee8 100644
--- a/tutorials/optimize/opt_gemm.py
+++ b/tutorials/optimize/opt_gemm.py
@@ -356,9 +356,6 @@
 ko, ki = s[CC].split(kaxis, factor=kfactor)
 s[CC].reorder(ko, mc, ki, nc)
 s[CC].vectorize(nc)
-
-# unroll kfactor loops
-# this is a separate optimization not discussed in this tutorial
 s[CC].unroll(ki)
 
 # parallel