diff --git a/include/l0_sampling/sketch.h b/include/l0_sampling/sketch.h
index 8746a5b4..16f61f48 100644
--- a/include/l0_sampling/sketch.h
+++ b/include/l0_sampling/sketch.h
@@ -94,7 +94,7 @@ class Sketch {
     failure_factor = _factor;
     num_buckets = bucket_gen(failure_factor);
     num_guesses = guess_gen(n);
-    num_elems = num_buckets * num_guesses + 2;  // +2 for zero depth bucket and null bucket
+    num_elems = num_buckets * num_guesses + 1;  // +1 for zero bucket optimization
   }
 
   inline static size_t sketchSizeof() {
diff --git a/src/l0_sampling/sketch.cpp b/src/l0_sampling/sketch.cpp
index 7d5c22d5..8da5314e 100644
--- a/src/l0_sampling/sketch.cpp
+++ b/src/l0_sampling/sketch.cpp
@@ -64,9 +64,8 @@ void Sketch::update(const vec_t update_idx) {
   for (unsigned i = 0; i < num_buckets; ++i) {
     col_hash_t depth = Bucket_Boruvka::get_index_depth(update_idx, seed + i, num_guesses);
     size_t bucket_id = i * num_guesses + depth;
-    bucket_id *= (bool)(depth!=0); // if depth is 0 then "update" null bucket -> bucket[0]
-
-    Bucket_Boruvka::update(bucket_a[bucket_id], bucket_c[bucket_id], update_idx, checksum);
+    if (depth < num_guesses)
+      Bucket_Boruvka::update(bucket_a[bucket_id], bucket_c[bucket_id], update_idx, checksum);
   }
 }
 
@@ -83,14 +82,13 @@ std::pair<vec_t, SampleSketchRet> Sketch::query() {
   already_queried = true;
 
   if (bucket_a[num_elems - 1] == 0 && bucket_c[num_elems - 1] == 0) {
-    return {0, ZERO}; // the "first" bucket is deterministic so if it is all zero then there are no edges to return
+    return {0, ZERO}; // the "first" bucket is deterministic so if all zero then no edges to return
   }
   if (Bucket_Boruvka::is_good(bucket_a[num_elems - 1], bucket_c[num_elems - 1], seed)) {
     return {bucket_a[num_elems - 1], GOOD};
   }
   for (unsigned i = 0; i < num_buckets; ++i) {
-    // bucket[0] is null
-    for (unsigned j = begin_nonnull; j < num_guesses; ++j) {
+    for (unsigned j = 0; j < num_guesses; ++j) {
       unsigned bucket_id = i * num_guesses + j;
       if (Bucket_Boruvka::is_good(bucket_a[bucket_id], bucket_c[bucket_id], seed)) {
         return {bucket_a[bucket_id], GOOD};
@@ -102,7 +100,7 @@ std::pair<vec_t, SampleSketchRet> Sketch::query() {
 
 Sketch &operator+= (Sketch &sketch1, const Sketch &sketch2) {
   assert (sketch1.seed == sketch2.seed);
-  for (unsigned i = Sketch::begin_nonnull; i < Sketch::num_elems; i++) {
+  for (unsigned i = 0; i < Sketch::num_elems; i++) {
     sketch1.bucket_a[i] ^= sketch2.bucket_a[i];
     sketch1.bucket_c[i] ^= sketch2.bucket_c[i];
   }
@@ -114,11 +112,11 @@ bool operator== (const Sketch &sketch1, const Sketch &sketch2) {
   if (sketch1.seed != sketch2.seed || sketch1.already_queried != sketch2.already_queried) 
     return false;
 
-  for (size_t i = Sketch::begin_nonnull; i < Sketch::num_elems; ++i) {
+  for (size_t i = 0; i < Sketch::num_elems; ++i) {
     if (sketch1.bucket_a[i] != sketch2.bucket_a[i]) return false;
   }
 
-  for (size_t i = Sketch::begin_nonnull; i < Sketch::num_elems; ++i) {
+  for (size_t i = 0; i < Sketch::num_elems; ++i) {
     if (sketch1.bucket_c[i] != sketch2.bucket_c[i]) return false;
   }
 
@@ -133,7 +131,7 @@ std::ostream& operator<< (std::ostream &os, const Sketch &sketch) {
   os << " a:" << a << " c:" << c << (good ? " good" : " bad") << std::endl;
 
   for (unsigned i = 0; i < Sketch::num_buckets; ++i) {
-    for (unsigned j = Sketch::begin_nonnull; j < Sketch::num_guesses; ++j) {
+    for (unsigned j = 0; j < Sketch::num_guesses; ++j) {
       unsigned bucket_id = i * Sketch::num_guesses + j;
       vec_t a      = sketch.bucket_a[bucket_id];
       vec_hash_t c = sketch.bucket_c[bucket_id];
@@ -152,7 +150,6 @@ void Sketch::write_binary(std::ostream& binary_out) {
 
 void Sketch::write_binary(std::ostream &binary_out) const {
   // Write out the bucket values to the stream.
-  // Do not include the null bucket
   binary_out.write((char*)bucket_a, num_elems * sizeof(vec_t));
   binary_out.write((char*)bucket_c, num_elems * sizeof(vec_hash_t));
 }
diff --git a/tools/benchmark/graphcc_bench.cpp b/tools/benchmark/graphcc_bench.cpp
index e8f901e8..44ab9811 100644
--- a/tools/benchmark/graphcc_bench.cpp
+++ b/tools/benchmark/graphcc_bench.cpp
@@ -103,7 +103,7 @@ static void BM_builtin_ffsl(benchmark::State& state) {
   size_t j = -1;
   for (auto _ : state) {
     benchmark::DoNotOptimize(__builtin_ffsl(i++));
-    benchmark::DoNotOptimize(__builtin_ffsl(j++));
+    benchmark::DoNotOptimize(__builtin_ffsl(j--));
   }
 }
 BENCHMARK(BM_builtin_ffsl);
@@ -113,7 +113,7 @@ static void BM_builtin_ctzl(benchmark::State& state) {
   size_t j = -1;
   for (auto _ : state) {
     benchmark::DoNotOptimize(__builtin_ctzl(i++));
-    benchmark::DoNotOptimize(__builtin_ctzl(j++));
+    benchmark::DoNotOptimize(__builtin_ctzl(j--));
   }
 }
 BENCHMARK(BM_builtin_ctzl);
@@ -123,7 +123,7 @@ static void BM_builtin_clzl(benchmark::State& state) {
   size_t j = -1;
   for (auto _ : state) {
     benchmark::DoNotOptimize(__builtin_clzl(i++));
-    benchmark::DoNotOptimize(__builtin_clzl(j++));
+    benchmark::DoNotOptimize(__builtin_clzl(j--));
   }
 }
 BENCHMARK(BM_builtin_clzl);
@@ -190,7 +190,7 @@ BENCHMARK(BM_update_bucket);
 // Benchmark the speed of updating sketches both serially and in batch mode
 static void BM_Sketch_Update(benchmark::State& state) {
   size_t vec_size = state.range(0);
-  vec_t input = vec_size / 4;
+  vec_t input = vec_size / 3;
   // initialize sketches
   Sketch::configure(vec_size, 100);
   SketchUniquePtr skt = makeSketch(seed);
@@ -239,6 +239,27 @@ static void BM_Sketch_Query(benchmark::State& state) {
 }
 BENCHMARK(BM_Sketch_Query)->DenseRange(0, 90, 10);
 
+// Benchmark the speed of Supernode::merge on two supernodes that were each given n / 100 updates
+static void BM_Supernode_Merge(benchmark::State& state) {
+  const size_t num_nodes = state.range(0);
+  Supernode::configure(num_nodes);
+  Supernode* target = Supernode::makeSupernode(num_nodes, seed);
+  Supernode* other = Supernode::makeSupernode(num_nodes, seed);
+
+  // Untimed setup: feed the same number of rand()-generated updates to both supernodes
+  for (size_t left = num_nodes / 100; left > 0; --left) {
+    target->update(static_cast<vec_t>(concat_pairing_fn(rand() % num_nodes, rand() % num_nodes)));
+    other->update(static_cast<vec_t>(concat_pairing_fn(rand() % num_nodes, rand() % num_nodes)));
+  }
+
+  // Timed region: exactly one merge call per benchmark iteration
+  for (auto _ : state) target->merge(*other);
+
+  free(target);
+  free(other);
+}
+BENCHMARK(BM_Supernode_Merge)->RangeMultiplier(10)->Range(1e3, 1e6);
+
 // Benchmark speed of DSU merges when the sequence of merges is adversarial
 // This means we avoid joining roots wherever possible
 static void BM_DSU_Adversarial(benchmark::State &state) {