From fb343b3844022618dd77e823370a2e86f7b49b2a Mon Sep 17 00:00:00 2001
From: Shafaq Siddiqi <shafaq.siddiqi@tugraz.at>
Date: Mon, 21 Mar 2022 13:42:49 +0100
Subject: [PATCH] [Cleaning Pipelines] MINOR improvements in allocation of
 resources  - This commit uses weights to compute the number of resources
 for each bucket. The pipelines in the buckets are sorted by accuracy, and
 the last bucket keeps the better-performing pipelines. If the initial
 resource value is 50, then instead of assigning 50 resources to each bucket
 this commit computes a weight matrix and assigns the resources accordingly,
 ensuring that the last bucket gets the max resource value.  - Add MSVM as
 evaluation function  - Various cleanups and seed in bandit

---
 scripts/builtin/bandit.dml                    | 142 ++++++++++--------
 scripts/builtin/executePipeline.dml           |   4 +-
 scripts/builtin/msvm.dml                      |   7 +-
 scripts/builtin/msvmPredict.dml               |   6 +
 scripts/builtin/topk_cleaning.dml             |  12 +-
 .../pipelines/scripts/enumerateLogical.dml    |  48 +++---
 scripts/pipelines/scripts/utils.dml           |  21 +--
 .../classification/applyFunc.csv              |   6 +-
 .../intermediates/classification/bestAcc.csv  |   6 +-
 .../classification/dirtyScore.csv             |   2 +-
 .../intermediates/classification/hp.csv       |   6 +-
 .../intermediates/classification/pip.csv      |   6 +-
 .../topkcleaningClassificationTest.dml        |  45 +++++-
 13 files changed, 190 insertions(+), 121 deletions(-)

diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml
index d01755603e2..299b404e3e4 100644
--- a/scripts/builtin/bandit.dml
+++ b/scripts/builtin/bandit.dml
@@ -53,7 +53,7 @@
 
 m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Double] X_test, Matrix[Double] Y_test, List[Unknown] metaList,
   String evaluationFunc, Matrix[Double] evalFunHp, Frame[Unknown] lp, Frame[Unknown] primitives, Frame[Unknown] param, Integer k = 3,
-  Integer R=50, Double baseLineScore, Boolean cv,  Integer cvk = 2, Double ref = 0, Boolean enablePruning = FALSE, Boolean verbose = TRUE, String output="")
+  Integer R=50, Double baseLineScore, Boolean cv,  Integer cvk = 2, Double ref = 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean verbose = TRUE, String output="")
   return(Boolean perf)
   # return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams, Matrix[Double] bestAccuracy, Frame[String] feaFrameOuter) 
 {
@@ -61,38 +61,47 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
   totalPruneCount = 0
   FLAG_VARIABLE = 5
   pipelines_executed = 0
-  HYPERPARAM_LENGTH = (ncol(lp) * FLAG_VARIABLE * 3) + 1 ## num of col in logical * 5 meat flag vars * max hyperparam per op + 1 accuracy col
+  HYPERPARAM_LENGTH = ((ncol(lp) + 2) * FLAG_VARIABLE * 3) + 1 ## (num of col in logical + 2 id cols) * 5 meta flag vars * max hyperparam per op + 1 accuracy col
   bestPipeline = frame("", rows=1, cols=1)
   bestHyperparams = as.matrix(0)
   bestAccuracy = as.matrix(0)
   # initialize bandit variables
   # variable names follow publication where algorithm is introduced
   eta = 2  # the halving ratio is fixed to 2
-  s_max = floor(log(R,eta)) - 1;
+  s_max = floor(log(R,eta));
+  # # compute weights for R and then increase/decrease R with respect to importance of configurations
+
+  weight = matrix(1/s_max , rows=s_max, cols=1)
+  weight = cumsum(weight)
+  # weight = matrix(1,  rows=s_max, cols=1)
+  # print("weight matrix: "+toString(weight))
   # initialize output variables
-  hparam = matrix(0, rows=k*(s_max+1), cols=HYPERPARAM_LENGTH)
-  pipeline = frame(0, rows=k*(s_max+1), cols=ncol(lp)+1)
-  endIdx = matrix(k, rows=(s_max+1), cols=1)
+  hparam = matrix(0, rows=k*(s_max), cols=HYPERPARAM_LENGTH)
+  pipeline = matrix(0, rows=k*(s_max), cols=4)
+  endIdx = matrix(k, rows=(s_max), cols=1)
   endIdx = cumsum(endIdx)
   startIdx = (endIdx - k) + 1
 
-  n = ifelse(s_max >= nrow(lp), nrow(lp), n = ceil(nrow(lp)/(s_max + 1));)
-    
-  for(s in s_max:0) { # TODO convert to parfor
+  n = ifelse(s_max >= nrow(lp), nrow(lp), n = ceil(nrow(lp)/s_max);)
+  pipelineId = as.frame(seq(1, nrow(lp)))
+  lp = cbind(pipelineId, lp)
+  mainLookup = lp
+  B = (s_max + 1) * R;
+  s_max = s_max - 1
+  idx = 1
+  for(s in s_max:0, check = 0) { # TODO convert to parfor
     
    # result variables
     bracket_hp = matrix(0, rows=k*(s+1)+k, cols=HYPERPARAM_LENGTH)
-    bracket_pipel = matrix(0, rows=k*(s+1)+k, cols=3)
+    bracket_pipel = matrix(0, rows=k*(s+1)+k, cols=4)
     start=1; end=0;
-    
     # # compute the number of initial pipelines n
-    r = R * eta^(-s);
+    r = max(R * as.scalar(weight[((s_max - s) + 1)]) * eta^(-s), 1);
     configurations = lp[1:(min(n, nrow(lp)))]
+
     # append configuration keys for extracting the pipeline later on
     id = seq(1, nrow(configurations))
     configurations = cbind(as.frame(id), configurations)
-    # save the original configuration as a lookup table
-    lookup = configurations
 
     for(i in 0:s) {
       # successive halving
@@ -107,11 +116,13 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
       configurations = configurations[1:n_i, ]
       pipelines_executed = pipelines_executed + (n_i * r_i)
       [outPip,outHp, pruneCount] = run_with_hyperparam(ph_pip=configurations, r_i=r_i, X=X_train, Y=Y_train, Xtest=X_test, Ytest=Y_test, metaList=metaList,
-        evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, cv=cv, cvk=cvk, ref=ref, enablePruning=enablePruning)
+        evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, cv=cv, cvk=cvk, ref=ref, seed = seed, enablePruning=enablePruning)
       totalPruneCount = totalPruneCount + pruneCount
       # sort the pipelines by order of accuracy decreasing
-      a = order(target = outPip, by = 1, decreasing=TRUE, index.return=FALSE)
-      b = order(target = outHp, by = 1, decreasing=TRUE, index.return=FALSE)
+      IX = order(target = outPip, by = 1, decreasing=TRUE, index.return=TRUE)
+      P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(outPip));
+      a = P %*% outPip
+      b = P %*% outHp
       rowIndex = min(k, nrow(a))
 
       # maintain the brackets results
@@ -122,8 +133,9 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
 
       # sort the configurations for successive halving
       avergae_perf =  getMaxPerConf(outPip, nrow(configurations)) 
-      sortMask = matrix(1, rows=1, cols=ncol(configurations))
-      configurations = frameSort(cbind(avergae_perf, configurations), cbind(as.matrix(0), sortMask), TRUE)
+      sortMask = matrix(1, rows=1, cols=ncol(configurations) + 1)
+      sortMask[1,1] = 0
+      configurations = frameSort(cbind(avergae_perf, configurations), sortMask, TRUE)
       configurations = configurations[, 2:ncol(configurations)]
     }
     if(n < nrow(lp))
@@ -131,17 +143,17 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl
     bracket_pipel = removeEmpty(target=bracket_pipel, margin="rows")
     bracket_hp = removeEmpty(target=bracket_hp, margin="rows")
     # keep the best k results for each bracket
-    [bracket_bestPipeline, bracket_bestHyperparams] = extractBracketWinners(bracket_pipel, bracket_hp, k, lookup)
+    [bracket_bestPipeline, bracket_bestHyperparams] = extractBracketWinners(bracket_pipel, bracket_hp, k)
     # optimize by the features
-    startOut = as.scalar(startIdx[s+1])
-    endOut = min(as.scalar(endIdx[s+1]), (startOut + nrow(bracket_bestPipeline) - 1))
+    startOut = as.scalar(startIdx[idx])
+    endOut = min(as.scalar(endIdx[idx]), (startOut + nrow(bracket_bestPipeline) - 1))
     pipeline[startOut:endOut, ] = bracket_bestPipeline
     hparam[startOut:endOut, 1:ncol(bracket_bestHyperparams)] = bracket_bestHyperparams
+    idx = idx + 1
+    # print("bracket best: \n"+toString(bracket_bestPipeline))
   }
-  [bestPipeline, bestHyperparams] = extractTopK(pipeline, hparam, baseLineScore, k)
-  bestAccuracy = as.matrix(bestPipeline[,1])
-  bestHyperparams = bestHyperparams[,2:ncol(bestHyperparams)]
-  bestPipeline = bestPipeline[, 2:ncol(bestPipeline)]
+  [bestPipeline, bestHyperparams, bestAccuracy] = extractTopK(pipeline, hparam, baseLineScore, k, mainLookup)
+
   imp = as.double(as.scalar(bestAccuracy[1, 1])) - as.double(baseLineScore)
   perf = imp > 0
   applyFunc = bestPipeline
@@ -212,31 +224,32 @@ get_physical_configurations = function(Frame[String] logical, Scalar[int] numCon
 }
 
 # this method will call the execute pipelines with their hyper-parameters
-run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double] X, Matrix[Double] Y,
+run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Double] X, Matrix[Double] Y,
   Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp,
-  Frame[Unknown] param, Boolean cv,  Integer cvk = 2, Double ref = 0, Boolean enablePruning = FALSE, Boolean default = FALSE)
+  Frame[Unknown] param, Boolean cv = FALSE,  Integer cvk = 2, Double ref = 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean default = FALSE)
   return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam, Integer pruneCount, Matrix[Double] changesByPipMatrix)
 {
+  # # # TODO there is a partial overlap, but it is negligible, so we will not rewrite the scripts; lineage-based reuse will get rid of it
   changesByPipMatrix = matrix(0, rows=nrow(ph_pip) * r_i, cols=1)
   pruneCount = 0
-  output_hp = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)-1) * 5 * 3)
+  output_hp = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)) * 5 * 3)
   output_accuracy = matrix(0, nrow(ph_pip)*r_i, 1)
-  output_pipelines = matrix(0, nrow(ph_pip)*r_i, 2)
+  output_pipelines = matrix(0, nrow(ph_pip)*r_i, 3)
   # rows in validation set
   clone_X = X
   clone_Y = Y
   clone_Xtest = Xtest
   clone_Ytest = Ytest
   index = 1
-  id = as.matrix(ph_pip[, 1])
-  ph_pip = ph_pip[, 2:ncol(ph_pip)]
+  ids = as.matrix(ph_pip[, 1:2])
+  ph_pip = ph_pip[, 3:ncol(ph_pip)]
 
   parfor(i in 1:nrow(ph_pip), check = 0)
   {
     # execute configurations with r resources
     op = removeEmpty(target=ph_pip[i], margin="cols")
-    print("PIPELINE EXECUTION START ... "+toString(op))
-    [hp, applyFunctions, no_of_res, no_of_flag_vars] = getHyperparam(op, param, r_i, default, enablePruning)
+    # print("PIPELINE EXECUTION START ... "+toString(op))
+    [hp, applyFunctions, no_of_res, no_of_flag_vars] = getHyperparam(op, param, r_i, default, seed, enablePruning)
     hpForPruning = matrix(0, rows=1, cols=ncol(op))
     changesByOp = matrix(0, rows=1, cols=ncol(op))
     metaList2 = metaList; #ensure metaList is no result var
@@ -279,13 +292,12 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double
         }
 
         # evalFunOutput = eval(evaluationFunc, argList)  
-        accT = floor((time() - t1) / 1e+6)  
         matrix_width = as.matrix(nrow(hp_matrix) * ncol(hp_matrix))
         hp_vec = cbind(matrix_width, matrix(hp_matrix, rows=1, cols=nrow(hp_matrix)*ncol(hp_matrix), byrow=TRUE))
         index = (i - 1) * no_of_res + r
         output_accuracy[index, 1] = accuracy
         output_hp[index, 1:ncol(hp_vec)] = hp_vec
-        output_pipelines[index, ] = cbind(as.matrix(index), id[i,1])
+        output_pipelines[index, ] = cbind(as.matrix(index), ids[i,1:2])
       }
       else
       {
@@ -303,10 +315,9 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double
 }
 
 # extract the hyper-parameters for pipelines
-getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown]  hpList, Integer no_of_res, Boolean default, Boolean enablePruning)
+getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown]  hpList, Integer no_of_res, Boolean default, Integer seed = -1, Boolean enablePruning)
   return (Matrix[Double] paramMatrix, Frame[Unknown] applyFunc, Integer no_of_res, Integer NUM_META_FLAGS)
 {
-
   allParam = 0;
   NUM_META_FLAGS = 5
   NUM_DEFAULT_VALUES = 4
@@ -330,7 +341,7 @@ getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown]  hpList, Intege
   # this matrix stores no. of hps, values of hps, and flags
   paramMatrix = matrix(0, rows=ncol(pipeline)*no_of_res, cols=max(paramCount)+NUM_META_FLAGS+1)
 
-  for(i in 1:ncol(pipeline)) {
+  parfor(i in 1:ncol(pipeline), check=0) {
     op = as.scalar(pipeline[1, i])
     index = as.scalar(indexes[i])
     no_of_param = as.integer(as.scalar(paramCount[i]))
@@ -354,21 +365,21 @@ getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown]  hpList, Intege
           minVal =  as.scalar(hpList[index, paramValIndex])
           maxVal = as.scalar(hpList[index, paramValIndex + 1])
           if(type == "FP") {
-            val = rand(rows=no_of_res, cols=1, min=minVal, max=maxVal, pdf="uniform");
+            val = rand(rows=no_of_res, cols=1, min=minVal, max=maxVal, pdf="uniform", seed=seed);
             OpParam[, j] = val;
           }
           else if(type == "INT") {
             if(as.integer(maxVal) > no_of_res)
-              val = sample(as.integer(maxVal), no_of_res, FALSE)
+              val = sample(as.integer(maxVal), no_of_res, FALSE, seed)
             else 
-              val = sample(as.integer(maxVal), no_of_res, TRUE)
+              val = sample(as.integer(maxVal), no_of_res, TRUE, seed)
             less_than_min = val < as.integer(minVal);
             val = (less_than_min * minVal) + val;
             OpParam[, j] = val;
           }
           else if(type == "BOOL") {
             if(maxVal == 1) {
-              s = sample(2, no_of_res, TRUE);
+              s = sample(2, no_of_res, TRUE, seed);
               b = s - 1;
               OpParam[, j] = b;
             } 
@@ -400,25 +411,43 @@ getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown]  hpList, Intege
 
 
 # extract the top k pipelines as a final result after deduplication and sorting
-extractTopK = function(Frame[Unknown] pipeline, Matrix[Double] hyperparam, 
-  Double baseLineScore, Integer k)
-  return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams)
+extractTopK = function(Matrix[Double] pipeline, Matrix[Double] hyperparam, 
+  Double baseLineScore, Integer k, Frame[Unknown] mainLookup)
+  return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams, Matrix[Double] bestAccuracy)
 {
-  hyperparam = order(target = hyperparam, by = 1, decreasing=TRUE, index.return=FALSE)
-  pipeline = frameSort(pipeline, cbind(as.matrix(0), matrix(1, rows=1, cols=ncol(pipeline) - 1)), TRUE)
+  IX = order(target = hyperparam, by = 1, decreasing=TRUE, index.return=TRUE)
+  P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(hyperparam));
+  hyperparam = P %*% hyperparam
+  pipeline = P %*% pipeline
+
   # remove the row with accuracy less than test accuracy 
   mask = (hyperparam[, 1] < baseLineScore) == 0
+  if(sum(mask) == 0)
+    mask[1, 1] = 1
   hyperparam = removeEmpty(target = hyperparam, margin = "rows", select = mask)
+  pipeline = removeEmpty(target = pipeline, margin = "rows", select = mask)
+ 
   rowIndex = min(nrow(hyperparam), k)
   # select the top k
-  bestPipeline = pipeline[1:rowIndex,]
-  bestHyperparams = hyperparam[1:rowIndex,]  
+  bestAccuracy = pipeline[1:rowIndex, 1]
+  bestHyperparams = hyperparam[1:rowIndex, 2:ncol(hyperparam)] 
+  pipeline = pipeline[1:rowIndex]
+  # # # lookup for the pipelines
+  pipCode = pipeline[, ncol(pipeline)]
+  
+  bestPipeline = frame(data="0", rows=nrow(pipeline), cols=ncol(mainLookup))
+  parfor(i in 1: nrow(pipeline)) {
+    index = as.scalar(pipCode[i])
+    bestPipeline[i] = mainLookup[index]
+  }
+  
+  bestPipeline = bestPipeline[, 2:ncol(bestPipeline)]
+
 }
 
 # extract the top k pipelines for each bracket, the intermediate results
-extractBracketWinners = function(Matrix[Double] pipeline, Matrix[Double] hyperparam, 
-  Integer k, Frame[String] conf)
-  return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams)
+extractBracketWinners = function(Matrix[Double] pipeline, Matrix[Double] hyperparam, Integer k)
+  return (Matrix[Double] bestPipeline, Matrix[Double] bestHyperparams)
 {
   # bestPipeline = frameSort(bestPipeline)
   hyperparam = order(target = hyperparam, by = 1, decreasing=TRUE, index.return=FALSE)
@@ -427,12 +456,7 @@ extractBracketWinners = function(Matrix[Double] pipeline, Matrix[Double] hyperpa
 
   pipeline = pipeline[1:rowIndex,]
   bestHyperparams = hyperparam[1:rowIndex,]
-  bestPipeline = frame(data="0", rows=nrow(pipeline), cols=ncol(conf))
-  parfor(i in 1: nrow(pipeline)) {
-    index = as.scalar(pipeline[i, 3])
-    bestPipeline[i] = conf[index]
-    bestPipeline[i, 1] = as.frame(pipeline[i, 1])
-  }
+  bestPipeline = pipeline[1:rowIndex]
 }
 
 ###########################################################################
diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml
index 7226b761b49..a606df9a465 100644
--- a/scripts/builtin/executePipeline.dml
+++ b/scripts/builtin/executePipeline.dml
@@ -375,8 +375,8 @@ SMOTE  = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, Integ
 return (Matrix[Double] X, Matrix[Double] Y)
 {
   # get the class count 
-  for(k in 1:max(Y)) {
-    classes = table(Y, 1)
+  classes = table(Y, 1)
+  for(k in 1:nrow(classes) - 1) {
     minClass = min(classes)
     maxClass = max(classes)
     diff = (maxClass - minClass)/sum(classes)
diff --git a/scripts/builtin/msvm.dml b/scripts/builtin/msvm.dml
index 477528866d7..921d694e447 100644
--- a/scripts/builtin/msvm.dml
+++ b/scripts/builtin/msvm.dml
@@ -53,7 +53,12 @@ m_msvm = function(Matrix[Double] X, Matrix[Double] Y, Boolean intercept = FALSE,
     stop("MSVM: Invalid Y input, containing negative values")
   if(verbose)
     print("Running Multiclass-SVM")
-
+  # Robustness for datasets with missing values (causing NaN gradients)
+  numNaNs = sum(isNaN(X))
+  if( numNaNs > 0 ) {
+    print("msvm: matrix X contains "+numNaNs+" missing values, replacing with 0.")
+    X = replace(target=X, pattern=NaN, replacement=0);
+  }
   num_rows_in_w = ncol(X)
   if(intercept) {
     # append once, and call l2svm always with intercept=FALSE 
diff --git a/scripts/builtin/msvmPredict.dml b/scripts/builtin/msvmPredict.dml
index 4c7460ffdb7..d34d086eddf 100644
--- a/scripts/builtin/msvmPredict.dml
+++ b/scripts/builtin/msvmPredict.dml
@@ -41,6 +41,12 @@
 m_msvmPredict = function(Matrix[Double] X, Matrix[Double] W)
   return(Matrix[Double] YRaw, Matrix[Double] Y)
 {
+  # Robustness for datasets with missing values 
+  numNaNs = sum(isNaN(X))
+  if( numNaNs > 0 ) {
+    print("msvm: matrix X contains "+numNaNs+" missing values, replacing with 0.")
+    X = replace(target=X, pattern=NaN, replacement=0);
+  }
   if(ncol(X) != nrow(W)){
     if(ncol(X) + 1 != nrow(W)){
       stop("MSVM Predict: Invalid shape of W ["+ncol(W)+","+nrow(W)+"] or X ["+ncol(X)+","+nrow(X)+"]")
diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml
index e6739d8bca4..1b983faa0bf 100644
--- a/scripts/builtin/topk_cleaning.dml
+++ b/scripts/builtin/topk_cleaning.dml
@@ -57,8 +57,9 @@ source("scripts/pipelines/scripts/enumerateLogical.dml") as lg;
 source("scripts/builtin/bandit.dml") as bandit;
 
 s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = as.frame("NULL"), Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown] primitives,
-  Frame[Unknown] parameters, String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, Integer resource_val = 20, Integer max_iter = 10, Double sample = 1.0,
-  Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean enablePruning = FALSE, String output)
+  Frame[Unknown] parameters, Frame[String] refSol = as.frame("NaN"), String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, Integer resource_val = 20, Integer max_iter = 10,
+  Double sample = 1.0, Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean enablePruning = FALSE,
+  String output)
   return(Boolean perf)
   # return (Frame[Unknown] topKPipelines, Matrix[Double] topKHyperParams, Matrix[Double] topKScores, Frame[Unknown] bestLogical,
   # Frame[Unknown] features, Double dirtyScore, Matrix[Double] evalFunHp)
@@ -138,18 +139,17 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a
   metaList['distY'] = dist
 
   print("-- Cleaning - Enum Logical Pipelines: ");
-  [bestLogical, con, refChanges] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest,
-  initial_population=logical, seed = seed,  max_iter=max_iter, metaList = metaList,
+  [bestLogical, con, refChanges, acc] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest,
+  initial_population=logical, refSol=refSol, seed = seed,  max_iter=max_iter, metaList = metaList,
   evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives, param=parameters,
   dirtyScore = (dirtyScore + expectedIncrease), cv=cv, cvk=cvk, verbose=TRUE, ctx=ctx)
   t6 = time(); print("---- finalized in: "+(t6-t5)/1e9+"s");
-
   topKPipelines = as.frame("NULL"); topKHyperParams = matrix(0,0,0); topKScores = matrix(0,0,0); features = as.frame("NULL")
   
   # # [topKPipelines, topKHyperParams, topKScores, features] = 
   perf = bandit(X_train=eXtrain, Y_train=eYtrain, X_test=eXtest, Y_test=eYtest,  metaList=metaList,
     evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, lp=bestLogical, primitives=primitives, param=parameters, baseLineScore=dirtyScore,
-    k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, enablePruning = enablePruning, output=output, verbose=TRUE);  
+    k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed, enablePruning = enablePruning, output=output, verbose=TRUE);  
   t7 = time(); print("-- Cleaning - Enum Physical Pipelines: "+(t7-t6)/1e9+"s");
 }
 
diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml
index 35e5def4b04..4e4d436f4ce 100644
--- a/scripts/pipelines/scripts/enumerateLogical.dml
+++ b/scripts/pipelines/scripts/enumerateLogical.dml
@@ -52,32 +52,30 @@
 source("scripts/builtin/bandit.dml") as bandit;
 
 enumerateLogical = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] Xtest, Matrix[Double] ytest,
-  Frame[Unknown] initial_population, Integer seed = -1, Integer max_iter=10, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp,
+  Frame[Unknown] initial_population, Frame[String] refSol = as.frame("NaN"), Integer seed = -1, Integer max_iter=10, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp,
   Frame[Unknown] primitives, Frame[Unknown] param, Double dirtyScore = 79, Boolean cv=FALSE, Boolean cvk=3,
   Boolean verbose, List[Unknown] ctx=list(prefix="----"))
-return (Frame[Unknown] output, boolean converged, Double refChanges)
+return (Frame[Unknown] output, boolean converged, Double refChanges, Frame[Unknown] acc)
 {
 
   finalOutput = list()
   mask = as.matrix(metaList['mask'])
-  num_exec = 1
   prefix = as.scalar(ctx["prefix"]);  
   iter = 1
   populationLength = 0
   converged = FALSE
   start = 1; 
   end = 0;
-  [allOps, ref] = getOps(param[, 2], as.scalar(metaList['distY']), nrow(y), min(y))
+  [allOps, ref] = getOps(param[, 2], refSol, as.scalar(metaList['distY']), nrow(y), min(y))
 
   # unrolled by physical pipelines
   pipelines = frame(0, rows=nrow(primitives)^ncol(primitives), cols=max(ncol(initial_population), ncol(ref)))
-  parfor(i in 1:nrow(initial_population), check = 0) { 
+  for(i in 1:nrow(initial_population)) { 
     pconf = bandit::get_physical_configurations(initial_population[i], 0, primitives)
     end = end + nrow(pconf)
     pipelines[start:end, 1:ncol(pconf)] = pconf
     start = end + 1
   }
-
   pipelines = removeEmpty(target = pipelines, margin="rows") 
   if(sum(mask) > 0)
   {
@@ -94,11 +92,11 @@ return (Frame[Unknown] output, boolean converged, Double refChanges)
   while(!converged & iter <= max_iter)
   {
     populationLength = max(populationLength, ncol(population))
-    id = seq(1, nrow(population))
+    id = matrix(seq(1, nrow(population)*2), rows=nrow(population), cols=2)
     print(prefix+" EnumLP iteration "+iter+"/"+as.integer(max_iter)+":" );
     # # # execute the physical pipelines
-    [outPip, outHp, p, refChanges] = bandit::run_with_hyperparam(cbind(as.frame(id), population), 
-      num_exec, X, y, Xtest, ytest, metaList, evaluationFunc, evalFunHp, param, cv, cvk, 0, FALSE, TRUE)
+    [outPip, outHp, p, refChanges] = bandit::run_with_hyperparam(ph_pip=cbind(as.frame(id), population),
+      X=X, Y=y, Xtest=Xtest, Ytest=ytest, metaList=metaList, evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, cv=cv, cvk=cvk, seed=seed, default=TRUE)
     # # sort the configurations score-wise
     actPip = cbind(as.frame(outPip[, 1]), as.frame(refChanges))
     actPip = cbind(actPip, population)
@@ -115,7 +113,7 @@ return (Frame[Unknown] output, boolean converged, Double refChanges)
     children = frame(0, rows=populationSize, cols=ncol(sortedPipelines))
     sortedPipelines = sortedPipelines[, 3:ncol(sortedPipelines)]
     # # randomly pick the pipelines for transitions
-    pipRand = sample(nrow(sortedPipelines), populationSize, TRUE)
+    pipRand = sample(nrow(sortedPipelines), populationSize, TRUE, seed)
     if(!converged) {
       parfor(i in 1:nrow(children), check=0) {
         idxR = (nrow(children) * (iter - 1)) + i
@@ -129,7 +127,7 @@ return (Frame[Unknown] output, boolean converged, Double refChanges)
         if(random == 1)
           c1 = addition(top, allOps[as.scalar(opToAdd[idxR])]) 
         else if(random == 2)
-          c1 = mutation(top) 
+          c1 = mutation(top, seed) 
         else if(random == 3)
           c1 = removal(top, as.scalar(opToRemove[idxR])) 
         
@@ -159,22 +157,30 @@ return (Frame[Unknown] output, boolean converged, Double refChanges)
   output = removeEmpty(target=output, margin="rows")
   output = frameSort(output, sort_mask, FALSE)
   refChanges = as.double(as.scalar(output[nrow(output), 2]))
-  halfing = max(floor(nrow(output)/2), 1)
-  output = output[halfing:nrow(output), 3:ncol(output)]
+  # halfing = max(floor(nrow(output)/2), 1)
+  acc = output[, 1]
+  # print("# of logical pipelines: "+nrow(output))
+  # print("max accuracy: "+toString(acc))
+  output = output[,3:ncol(output)]
+  # print("logical pipelines: "+toString(output))
 }
 
-addition = function(Frame[Unknown] top, Frame[Unknown] allOps)
-return (Frame [Unknown] child)
+addition = function(Frame[Unknown] top, Frame[Unknown] opToAdd)
+return (Frame[Unknown] child)
 {
-  child = cbind(allOps, top)
+  # # # never add same operation adjacent to each other
+  if(as.scalar(top[1,1]) != as.scalar(opToAdd[1,1]))
+    child = cbind(opToAdd, top)
+  else 
+    child = cbind(top, opToAdd)
 }
 
-mutation = function(Frame[Unknown] child)
+mutation = function(Frame[Unknown] child, Integer seed)
 return (Frame [Unknown] mChild)
 {
   if(ncol(child) >= 2)
   {
-    r = sample(ncol(child), 2)
+    r = sample(ncol(child), 2, FALSE, seed)
     r1 = as.scalar(r[1,1])
     r2 = as.scalar(r[2,1])
     temp = child[1, r1]
@@ -195,8 +201,8 @@ return (Frame[Unknown] child)
   }
 }
 
-getOps = function( Frame[string] allOps, Integer dist, Integer n, Integer minValue)
- return (Frame[String] allOps, Frame[String] ref) {
+getOps = function( Frame[string] allOps, Frame[String] refSol, Integer dist, Integer n, Integer minValue)
+ return (Frame[String] allOps, Frame[String] refSol) {
  
   # # # TODO fix the following hard-coded condition by taking a file input
   # # allOps are the operation which are randomly added to a population, for now I am reusing param file
@@ -214,5 +220,7 @@ getOps = function( Frame[string] allOps, Integer dist, Integer n, Integer minVal
     # & !x.equals(\"mice\") & !x.equals(\"dbscan\")
     ref = frame(["imputeByMean", "winsorize", "scale"], rows=1, cols=3)
   }
+  if(as.scalar(refSol[1,1]) == "NaN")
+    refSol = ref
   allOps = removeEmpty(target=allOps, margin="rows")
 }
\ No newline at end of file
diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml
index 4a9df756458..f6060e6a0f5 100644
--- a/scripts/pipelines/scripts/utils.dml
+++ b/scripts/pipelines/scripts/utils.dml
@@ -202,14 +202,12 @@ return(Frame[Unknown] data)
   # step 2 fix invalid lengths
   # q0 = 0.05
   # q1 = 0.95
-  # print(prefix+" fixing invalid lengths between "+q0+" and "+q1+" quantile");
 
-  # [train, mask, qlow, qup] = fixInvalidLengths(train, mask, q0, q1)
+  # [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1)
 
   
   # step 3 fix swap values
-  # print(prefix+" value swap fixing");
-  # train = valueSwap(train, schema)
+  # data = valueSwap(data, schema)
 
   # step 3 drop invalid types
   data = dropInvalidType(data, schema)
@@ -236,17 +234,4 @@ return(Frame[Unknown] data)
       data[, idx] = map(data[, idx], "x -> UtilFunctions.getTimestamp(x)", margin=2)
     }
   }
-}
-
-
-
-
-
-
-
-
-
-
-
-
-
+}
\ No newline at end of file
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
index 714d5357b5d..d3961daae3c 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv
@@ -1,3 +1,3 @@
-imputeByMeanApply,winsorizeApply,scaleApply,dummycodingApply
-imputeByMeanApply,winsorizeApply,scaleApply,dummycodingApply
-imputeByMeanApply,winsorizeApply,scaleApply,dummycodingApply
+winsorizeApply,NA,imputeByMedianApply,dummycodingApply,0
+imputeByFdApply,winsorizeApply,imputeByMedianApply,NA,dummycodingApply
+NA,imputeByMedianApply,dummycodingApply,0,0
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
index ba11a229c21..cb6a8ae3001 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv
@@ -1,3 +1,3 @@
-68.84057971014492
-68.84057971014492
-68.65942028985508
+88.28828828828829
+88.28828828828829
+87.38738738738738
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv
index 4e5b1a5042c..d70d1d19535 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv
@@ -1 +1 @@
-61.050724637681164
\ No newline at end of file
+71.17117117117117
\ No newline at end of file
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
index 5dba366c69e..5e02cbb7d5f 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv
@@ -1,3 +1,3 @@
-32.0,0,0,0,1.0,0,0,0,2.0,2.0,0.017866171174338655,0.9722754538748367,0,0,0,1.0,0,2.0,0,1.0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-32.0,0,0,0,1.0,0,0,0,2.0,2.0,0.04096822615526508,0.9724536097500497,0,0,0,1.0,0,2.0,0,1.0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-32.0,0,0,0,1.0,0,0,0,2.0,2.0,0.04922407814925073,0.973233625102309,0,0,0,1.0,0,2.0,1.0,1.0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+32.0,2.0,0.04153685307086108,0.9894210663385763,0,0,0,1.0,0,1.0,0.49421066338576347,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+40.0,1.0,0.8365263980314581,0,0,1.0,0,0,1.0,2.0,0.04153685307086108,0.9894210663385763,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,1.0,0.49421066338576347,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+21.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
index 3738bc56fd1..f1492ae0577 100644
--- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
+++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv
@@ -1,3 +1,3 @@
-imputeByMean,winsorize,scale,dummycoding
-imputeByMean,winsorize,scale,dummycoding
-imputeByMean,winsorize,scale,dummycoding
+winsorize,underSampling,imputeByMedian,dummycoding,0
+imputeByFd,winsorize,imputeByMedian,underSampling,dummycoding
+underSampling,imputeByMedian,dummycoding,0,0
diff --git a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
index 56a82c8bec9..3ccb2c0f7b9 100644
--- a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
+++ b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml
@@ -57,9 +57,9 @@ metaInfo = metaInfo[, 2:ncol(metaInfo)]
 # # # split in train/test 70/30
 
 # [topKPipelines, topKHyperParams, topKScores, bestLogical, features, dirtyScore, evalHp] = 
-result = topk_cleaning(dataTrain=trainData, dataTest=testData, metaData=metaInfo, primitives=primitives, parameters=param,
+result = topk_cleaning(dataTrain=trainData, dataTest=testData, metaData=metaInfo, primitives=primitives, parameters=param, refSol = frame(["imputeByMean", "scale", "dummycoding"], rows=1, cols=3),
   evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),topK=topK, resource_val=resources,
-  expectedIncrease=expectedIncrease, seed = 42, max_iter=max_iter, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output) 
+  expectedIncrease=expectedIncrease, seed = 23, max_iter=max_iter, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output) 
 
 write(result, $O)
 
@@ -101,4 +101,45 @@ return(Matrix[Double] output, Matrix[Double] error)
 accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) {
   [M,yhat,acc] = multiLogRegPredict(X=X, B=B, Y=y, verbose=FALSE);
   err = as.matrix(1-(acc/100));
+}
+
+
+# Trains msvm on (X, Y) and scores it on (Xtest, Ytest); Xorig is accepted but not used in this body.
+# evalFunHp is read as one row [intercept, lambda, epsilon]; a NaN first cell triggers a 3-fold CV gridSearch.
+# Returns output = cbind(accuracy in percent, evalFunHp) and error = 0/1 flag per misclassified test row.
+evalClassificationMSVM = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0),
+  Matrix[Double] evalFunHp)
+return(Matrix[Double] output, Matrix[Double] error)
+{
+  if(is.na(as.scalar(evalFunHp[1,1]))) {
+    nc = max(Y);
+    params = list("intercept", "lambda", "epsilon")
+    paramRanges = list(seq(0, 1), 10^seq(1,-3), 10^seq(1,-5));
+    trainArgs = list(X=X, Y=Y, intercept=-1, lambda=-1, epsilon=-1, maxIterations=1000,  verbose=FALSE);
+    dataArgs = list("X", "Y");
+    [B1,opt] = gridSearch(X=X, y=Y, train="msvm", predict="accuracyMSVM", numB=(ncol(X)+1)*(nc),
+      params=params, paramValues=paramRanges, dataArgs=dataArgs, trainArgs=trainArgs, cv=TRUE, cvk=3, verbose=TRUE);
+    evalFunHp = as.matrix(opt)
+  }
+  if(min(Y) == max(Y)) {
+    # single-class training labels: nothing is trained, so score 0 and flag every test row as misclassified
+    accuracy = as.matrix(0)
+    error = matrix(1, rows=nrow(Ytest), cols=1)
+  }
+  else {
+    beta = msvm(X=X, Y=Y, intercept=as.scalar(evalFunHp[1,1]), lambda=as.scalar(evalFunHp[1,2]), epsilon=as.scalar(evalFunHp[1,3]),
+      maxIterations=1000, verbose=FALSE);
+    yhat = msvmPredict(X=Xtest, W=beta);
+    yhat = rowIndexMax(yhat)
+    accuracy = mean(yhat == Ytest) * 100
+    error = yhat != Ytest
+    accuracy = as.matrix(accuracy)
+  }
+  output = cbind(accuracy, evalFunHp)
+}
+accuracyMSVM = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) {
+  scores = msvmPredict(X=X, W=B);
+  labels = rowIndexMax(scores)
+  # err = 1 - fraction of rows whose arg-max column index equals y, returned as a 1x1 matrix
+  err = as.matrix(1 - mean(labels == y));
 }
\ No newline at end of file