From fb343b3844022618dd77e823370a2e86f7b49b2a Mon Sep 17 00:00:00 2001 From: Shafaq Siddiqi Date: Mon, 21 Mar 2022 13:42:49 +0100 Subject: [PATCH] [Cleaning Pipelines] MINOR improvements in allocation of resources - This commit uses weights to compute the number of resources for each bucket. The pipelines in the buckets are sorted in order of accuracy, so the last bucket keeps the better-performing pipelines. If the initial resource value is 50, then instead of assigning 50 resources to each bucket, this commit computes a weight matrix and assigns the resources accordingly, ensuring the last bucket gets the maximum resource value. - Add MSVM as evaluation function - Various cleanups and seed in bandit --- scripts/builtin/bandit.dml | 142 ++++++++++-------- scripts/builtin/executePipeline.dml | 4 +- scripts/builtin/msvm.dml | 7 +- scripts/builtin/msvmPredict.dml | 6 + scripts/builtin/topk_cleaning.dml | 12 +- .../pipelines/scripts/enumerateLogical.dml | 48 +++--- scripts/pipelines/scripts/utils.dml | 21 +-- .../classification/applyFunc.csv | 6 +- .../intermediates/classification/bestAcc.csv | 6 +- .../classification/dirtyScore.csv | 2 +- .../intermediates/classification/hp.csv | 6 +- .../intermediates/classification/pip.csv | 6 +- .../topkcleaningClassificationTest.dml | 45 +++++- 13 files changed, 190 insertions(+), 121 deletions(-) diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml index d01755603e2..299b404e3e4 100644 --- a/scripts/builtin/bandit.dml +++ b/scripts/builtin/bandit.dml @@ -53,7 +53,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Double] X_test, Matrix[Double] Y_test, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp, Frame[Unknown] lp, Frame[Unknown] primitives, Frame[Unknown] param, Integer k = 3, - Integer R=50, Double baseLineScore, Boolean 
cv, Integer cvk = 2, Double ref = 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean verbose = TRUE, String output="") return(Boolean perf) # return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams, Matrix[Double] bestAccuracy, Frame[String] feaFrameOuter) { @@ -61,38 +61,47 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl totalPruneCount = 0 FLAG_VARIABLE = 5 pipelines_executed = 0 - HYPERPARAM_LENGTH = (ncol(lp) * FLAG_VARIABLE * 3) + 1 ## num of col in logical * 5 meat flag vars * max hyperparam per op + 1 accuracy col + HYPERPARAM_LENGTH = ((ncol(lp) + 2) * FLAG_VARIABLE * 3) + 1 ## num of col in logical * 5 meat flag vars * max hyperparam per op + 1 accuracy col bestPipeline = frame("", rows=1, cols=1) bestHyperparams = as.matrix(0) bestAccuracy = as.matrix(0) # initialize bandit variables # variable names follow publication where algorithm is introduced eta = 2 # the halving ratio is fixed to 2 - s_max = floor(log(R,eta)) - 1; + s_max = floor(log(R,eta)); + # # compute weights for R and then increase/decrease R with respect to importance of configurations + + weight = matrix(1/s_max , rows=s_max, cols=1) + weight = cumsum(weight) + # weight = matrix(1, rows=s_max, cols=1) + # print("weight matrix: "+toString(weight)) # initialize output variables - hparam = matrix(0, rows=k*(s_max+1), cols=HYPERPARAM_LENGTH) - pipeline = frame(0, rows=k*(s_max+1), cols=ncol(lp)+1) - endIdx = matrix(k, rows=(s_max+1), cols=1) + hparam = matrix(0, rows=k*(s_max), cols=HYPERPARAM_LENGTH) + pipeline = matrix(0, rows=k*(s_max), cols=4) + endIdx = matrix(k, rows=(s_max), cols=1) endIdx = cumsum(endIdx) startIdx = (endIdx - k) + 1 - n = ifelse(s_max >= nrow(lp), nrow(lp), n = ceil(nrow(lp)/(s_max + 1));) - - for(s in s_max:0) { # TODO convert to parfor + n = ifelse(s_max >= nrow(lp), nrow(lp), n = ceil(nrow(lp)/s_max);) + pipelineId = as.frame(seq(1, nrow(lp))) + lp = cbind(pipelineId, lp) + mainLookup = lp + B = (s_max + 1) * 
R; + s_max = s_max - 1 + idx = 1 + for(s in s_max:0, check = 0) { # TODO convert to parfor # result variables bracket_hp = matrix(0, rows=k*(s+1)+k, cols=HYPERPARAM_LENGTH) - bracket_pipel = matrix(0, rows=k*(s+1)+k, cols=3) + bracket_pipel = matrix(0, rows=k*(s+1)+k, cols=4) start=1; end=0; - # # compute the number of initial pipelines n - r = R * eta^(-s); + r = max(R * as.scalar(weight[((s_max - s) + 1)]) * eta^(-s), 1); configurations = lp[1:(min(n, nrow(lp)))] + # append configuration keys for extracting the pipeline later on id = seq(1, nrow(configurations)) configurations = cbind(as.frame(id), configurations) - # save the original configuration as a lookup table - lookup = configurations for(i in 0:s) { # successive halving @@ -107,11 +116,13 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl configurations = configurations[1:n_i, ] pipelines_executed = pipelines_executed + (n_i * r_i) [outPip,outHp, pruneCount] = run_with_hyperparam(ph_pip=configurations, r_i=r_i, X=X_train, Y=Y_train, Xtest=X_test, Ytest=Y_test, metaList=metaList, - evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, cv=cv, cvk=cvk, ref=ref, enablePruning=enablePruning) + evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, cv=cv, cvk=cvk, ref=ref, seed = seed, enablePruning=enablePruning) totalPruneCount = totalPruneCount + pruneCount # sort the pipelines by order of accuracy decreasing - a = order(target = outPip, by = 1, decreasing=TRUE, index.return=FALSE) - b = order(target = outHp, by = 1, decreasing=TRUE, index.return=FALSE) + IX = order(target = outPip, by = 1, decreasing=TRUE, index.return=TRUE) + P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(outPip)); + a = P %*% outPip + b = P %*% outHp rowIndex = min(k, nrow(a)) # maintain the brackets results @@ -122,8 +133,9 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl # sort the configurations for successive halving avergae_perf = 
getMaxPerConf(outPip, nrow(configurations)) - sortMask = matrix(1, rows=1, cols=ncol(configurations)) - configurations = frameSort(cbind(avergae_perf, configurations), cbind(as.matrix(0), sortMask), TRUE) + sortMask = matrix(1, rows=1, cols=ncol(configurations) + 1) + sortMask[1,1] = 0 + configurations = frameSort(cbind(avergae_perf, configurations), sortMask, TRUE) configurations = configurations[, 2:ncol(configurations)] } if(n < nrow(lp)) @@ -131,17 +143,17 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl bracket_pipel = removeEmpty(target=bracket_pipel, margin="rows") bracket_hp = removeEmpty(target=bracket_hp, margin="rows") # keep the best k results for each bracket - [bracket_bestPipeline, bracket_bestHyperparams] = extractBracketWinners(bracket_pipel, bracket_hp, k, lookup) + [bracket_bestPipeline, bracket_bestHyperparams] = extractBracketWinners(bracket_pipel, bracket_hp, k) # optimize by the features - startOut = as.scalar(startIdx[s+1]) - endOut = min(as.scalar(endIdx[s+1]), (startOut + nrow(bracket_bestPipeline) - 1)) + startOut = as.scalar(startIdx[idx]) + endOut = min(as.scalar(endIdx[idx]), (startOut + nrow(bracket_bestPipeline) - 1)) pipeline[startOut:endOut, ] = bracket_bestPipeline hparam[startOut:endOut, 1:ncol(bracket_bestHyperparams)] = bracket_bestHyperparams + idx = idx + 1 + # print("bracket best: \n"+toString(bracket_bestPipeline)) } - [bestPipeline, bestHyperparams] = extractTopK(pipeline, hparam, baseLineScore, k) - bestAccuracy = as.matrix(bestPipeline[,1]) - bestHyperparams = bestHyperparams[,2:ncol(bestHyperparams)] - bestPipeline = bestPipeline[, 2:ncol(bestPipeline)] + [bestPipeline, bestHyperparams, bestAccuracy] = extractTopK(pipeline, hparam, baseLineScore, k, mainLookup) + imp = as.double(as.scalar(bestAccuracy[1, 1])) - as.double(baseLineScore) perf = imp > 0 applyFunc = bestPipeline @@ -212,31 +224,32 @@ get_physical_configurations = function(Frame[String] logical, Scalar[int] numCon } # this 
method will call the execute pipelines with their hyper-parameters -run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double] X, Matrix[Double] Y, +run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp, - Frame[Unknown] param, Boolean cv, Integer cvk = 2, Double ref = 0, Boolean enablePruning = FALSE, Boolean default = FALSE) + Frame[Unknown] param, Boolean cv = FALSE, Integer cvk = 2, Double ref = 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean default = FALSE) return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam, Integer pruneCount, Matrix[Double] changesByPipMatrix) { + # # # TODO there is a partial overlap but it is negligible so we will not rewrite the scripts but lineage based reuse will get rid of it changesByPipMatrix = matrix(0, rows=nrow(ph_pip) * r_i, cols=1) pruneCount = 0 - output_hp = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)-1) * 5 * 3) + output_hp = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)) * 5 * 3) output_accuracy = matrix(0, nrow(ph_pip)*r_i, 1) - output_pipelines = matrix(0, nrow(ph_pip)*r_i, 2) + output_pipelines = matrix(0, nrow(ph_pip)*r_i, 3) # rows in validation set clone_X = X clone_Y = Y clone_Xtest = Xtest clone_Ytest = Ytest index = 1 - id = as.matrix(ph_pip[, 1]) - ph_pip = ph_pip[, 2:ncol(ph_pip)] + ids = as.matrix(ph_pip[, 1:2]) + ph_pip = ph_pip[, 3:ncol(ph_pip)] parfor(i in 1:nrow(ph_pip), check = 0) { # execute configurations with r resources op = removeEmpty(target=ph_pip[i], margin="cols") - print("PIPELINE EXECUTION START ... "+toString(op)) - [hp, applyFunctions, no_of_res, no_of_flag_vars] = getHyperparam(op, param, r_i, default, enablePruning) + # print("PIPELINE EXECUTION START ... 
"+toString(op)) + [hp, applyFunctions, no_of_res, no_of_flag_vars] = getHyperparam(op, param, r_i, default, seed, enablePruning) hpForPruning = matrix(0, rows=1, cols=ncol(op)) changesByOp = matrix(0, rows=1, cols=ncol(op)) metaList2 = metaList; #ensure metaList is no result var @@ -279,13 +292,12 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double } # evalFunOutput = eval(evaluationFunc, argList) - accT = floor((time() - t1) / 1e+6) matrix_width = as.matrix(nrow(hp_matrix) * ncol(hp_matrix)) hp_vec = cbind(matrix_width, matrix(hp_matrix, rows=1, cols=nrow(hp_matrix)*ncol(hp_matrix), byrow=TRUE)) index = (i - 1) * no_of_res + r output_accuracy[index, 1] = accuracy output_hp[index, 1:ncol(hp_vec)] = hp_vec - output_pipelines[index, ] = cbind(as.matrix(index), id[i,1]) + output_pipelines[index, ] = cbind(as.matrix(index), ids[i,1:2]) } else { @@ -303,10 +315,9 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double } # extract the hyper-parameters for pipelines -getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList, Integer no_of_res, Boolean default, Boolean enablePruning) +getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList, Integer no_of_res, Boolean default, Integer seed = -1, Boolean enablePruning) return (Matrix[Double] paramMatrix, Frame[Unknown] applyFunc, Integer no_of_res, Integer NUM_META_FLAGS) { - allParam = 0; NUM_META_FLAGS = 5 NUM_DEFAULT_VALUES = 4 @@ -330,7 +341,7 @@ getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList, Intege # this matrix stores no. 
of hps, values of hps, and flags paramMatrix = matrix(0, rows=ncol(pipeline)*no_of_res, cols=max(paramCount)+NUM_META_FLAGS+1) - for(i in 1:ncol(pipeline)) { + parfor(i in 1:ncol(pipeline), check=0) { op = as.scalar(pipeline[1, i]) index = as.scalar(indexes[i]) no_of_param = as.integer(as.scalar(paramCount[i])) @@ -354,21 +365,21 @@ getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList, Intege minVal = as.scalar(hpList[index, paramValIndex]) maxVal = as.scalar(hpList[index, paramValIndex + 1]) if(type == "FP") { - val = rand(rows=no_of_res, cols=1, min=minVal, max=maxVal, pdf="uniform"); + val = rand(rows=no_of_res, cols=1, min=minVal, max=maxVal, pdf="uniform", seed=seed); OpParam[, j] = val; } else if(type == "INT") { if(as.integer(maxVal) > no_of_res) - val = sample(as.integer(maxVal), no_of_res, FALSE) + val = sample(as.integer(maxVal), no_of_res, FALSE, seed) else - val = sample(as.integer(maxVal), no_of_res, TRUE) + val = sample(as.integer(maxVal), no_of_res, TRUE, seed) less_than_min = val < as.integer(minVal); val = (less_than_min * minVal) + val; OpParam[, j] = val; } else if(type == "BOOL") { if(maxVal == 1) { - s = sample(2, no_of_res, TRUE); + s = sample(2, no_of_res, TRUE, seed); b = s - 1; OpParam[, j] = b; } @@ -400,25 +411,43 @@ getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList, Intege # extract the top k pipelines as a final result after deduplication and sorting -extractTopK = function(Frame[Unknown] pipeline, Matrix[Double] hyperparam, - Double baseLineScore, Integer k) - return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams) +extractTopK = function(Matrix[Double] pipeline, Matrix[Double] hyperparam, + Double baseLineScore, Integer k, Frame[Unknown] mainLookup) + return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams, Matrix[Double] bestAccuracy) { - hyperparam = order(target = hyperparam, by = 1, decreasing=TRUE, index.return=FALSE) - pipeline = frameSort(pipeline, 
cbind(as.matrix(0), matrix(1, rows=1, cols=ncol(pipeline) - 1)), TRUE) + IX = order(target = hyperparam, by = 1, decreasing=TRUE, index.return=TRUE) + P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(hyperparam)); + hyperparam = P %*% hyperparam + pipeline = P %*% pipeline + # remove the row with accuracy less than test accuracy mask = (hyperparam[, 1] < baseLineScore) == 0 + if(sum(mask) == 0) + mask[1, 1] = 1 hyperparam = removeEmpty(target = hyperparam, margin = "rows", select = mask) + pipeline = removeEmpty(target = pipeline, margin = "rows", select = mask) + rowIndex = min(nrow(hyperparam), k) # select the top k - bestPipeline = pipeline[1:rowIndex,] - bestHyperparams = hyperparam[1:rowIndex,] + bestAccuracy = pipeline[1:rowIndex, 1] + bestHyperparams = hyperparam[1:rowIndex, 2:ncol(hyperparam)] + pipeline = pipeline[1:rowIndex] + # # # lookup for the pipelines + pipCode = pipeline[, ncol(pipeline)] + + bestPipeline = frame(data="0", rows=nrow(pipeline), cols=ncol(mainLookup)) + parfor(i in 1: nrow(pipeline)) { + index = as.scalar(pipCode[i]) + bestPipeline[i] = mainLookup[index] + } + + bestPipeline = bestPipeline[, 2:ncol(bestPipeline)] + } # extract the top k pipelines for each bracket, the intermediate results -extractBracketWinners = function(Matrix[Double] pipeline, Matrix[Double] hyperparam, - Integer k, Frame[String] conf) - return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams) +extractBracketWinners = function(Matrix[Double] pipeline, Matrix[Double] hyperparam, Integer k) + return (Matrix[Double] bestPipeline, Matrix[Double] bestHyperparams) { # bestPipeline = frameSort(bestPipeline) hyperparam = order(target = hyperparam, by = 1, decreasing=TRUE, index.return=FALSE) @@ -427,12 +456,7 @@ extractBracketWinners = function(Matrix[Double] pipeline, Matrix[Double] hyperpa pipeline = pipeline[1:rowIndex,] bestHyperparams = hyperparam[1:rowIndex,] - bestPipeline = frame(data="0", rows=nrow(pipeline), cols=ncol(conf)) - parfor(i in 1: 
nrow(pipeline)) { - index = as.scalar(pipeline[i, 3]) - bestPipeline[i] = conf[index] - bestPipeline[i, 1] = as.frame(pipeline[i, 1]) - } + bestPipeline = pipeline[1:rowIndex] } ########################################################################### diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml index 7226b761b49..a606df9a465 100644 --- a/scripts/builtin/executePipeline.dml +++ b/scripts/builtin/executePipeline.dml @@ -375,8 +375,8 @@ SMOTE = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, Integ return (Matrix[Double] X, Matrix[Double] Y) { # get the class count - for(k in 1:max(Y)) { - classes = table(Y, 1) + classes = table(Y, 1) + for(k in 1:nrow(classes) - 1) { minClass = min(classes) maxClass = max(classes) diff = (maxClass - minClass)/sum(classes) diff --git a/scripts/builtin/msvm.dml b/scripts/builtin/msvm.dml index 477528866d7..921d694e447 100644 --- a/scripts/builtin/msvm.dml +++ b/scripts/builtin/msvm.dml @@ -53,7 +53,12 @@ m_msvm = function(Matrix[Double] X, Matrix[Double] Y, Boolean intercept = FALSE, stop("MSVM: Invalid Y input, containing negative values") if(verbose) print("Running Multiclass-SVM") - + # Robustness for datasets with missing values (causing NaN gradients) + numNaNs = sum(isNaN(X)) + if( numNaNs > 0 ) { + print("msvm: matrix X contains "+numNaNs+" missing values, replacing with 0.") + X = replace(target=X, pattern=NaN, replacement=0); + } num_rows_in_w = ncol(X) if(intercept) { # append once, and call l2svm always with intercept=FALSE diff --git a/scripts/builtin/msvmPredict.dml b/scripts/builtin/msvmPredict.dml index 4c7460ffdb7..d34d086eddf 100644 --- a/scripts/builtin/msvmPredict.dml +++ b/scripts/builtin/msvmPredict.dml @@ -41,6 +41,12 @@ m_msvmPredict = function(Matrix[Double] X, Matrix[Double] W) return(Matrix[Double] YRaw, Matrix[Double] Y) { + # Robustness for datasets with missing values + numNaNs = sum(isNaN(X)) + if( numNaNs > 0 ) { + print("msvm: matrix X 
contains "+numNaNs+" missing values, replacing with 0.") + X = replace(target=X, pattern=NaN, replacement=0); + } if(ncol(X) != nrow(W)){ if(ncol(X) + 1 != nrow(W)){ stop("MSVM Predict: Invalid shape of W ["+ncol(W)+","+nrow(W)+"] or X ["+ncol(X)+","+nrow(X)+"]") diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml index e6739d8bca4..1b983faa0bf 100644 --- a/scripts/builtin/topk_cleaning.dml +++ b/scripts/builtin/topk_cleaning.dml @@ -57,8 +57,9 @@ source("scripts/pipelines/scripts/enumerateLogical.dml") as lg; source("scripts/builtin/bandit.dml") as bandit; s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = as.frame("NULL"), Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown] primitives, - Frame[Unknown] parameters, String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, Integer resource_val = 20, Integer max_iter = 10, Double sample = 1.0, - Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean enablePruning = FALSE, String output) + Frame[Unknown] parameters, Frame[String] refSol = as.frame("NaN"), String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, Integer resource_val = 20, Integer max_iter = 10, + Double sample = 1.0, Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean enablePruning = FALSE, + String output) return(Boolean perf) # return (Frame[Unknown] topKPipelines, Matrix[Double] topKHyperParams, Matrix[Double] topKScores, Frame[Unknown] bestLogical, # Frame[Unknown] features, Double dirtyScore, Matrix[Double] evalFunHp) @@ -138,18 +139,17 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a metaList['distY'] = dist print("-- Cleaning - Enum Logical Pipelines: "); - [bestLogical, con, refChanges] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, 
ytest=eYtest, - initial_population=logical, seed = seed, max_iter=max_iter, metaList = metaList, + [bestLogical, con, refChanges, acc] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest, + initial_population=logical, refSol=refSol, seed = seed, max_iter=max_iter, metaList = metaList, evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives, param=parameters, dirtyScore = (dirtyScore + expectedIncrease), cv=cv, cvk=cvk, verbose=TRUE, ctx=ctx) t6 = time(); print("---- finalized in: "+(t6-t5)/1e9+"s"); - topKPipelines = as.frame("NULL"); topKHyperParams = matrix(0,0,0); topKScores = matrix(0,0,0); features = as.frame("NULL") # # [topKPipelines, topKHyperParams, topKScores, features] = perf = bandit(X_train=eXtrain, Y_train=eYtrain, X_test=eXtest, Y_test=eYtest, metaList=metaList, evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, lp=bestLogical, primitives=primitives, param=parameters, baseLineScore=dirtyScore, - k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, enablePruning = enablePruning, output=output, verbose=TRUE); + k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed, enablePruning = enablePruning, output=output, verbose=TRUE); t7 = time(); print("-- Cleaning - Enum Physical Pipelines: "+(t7-t6)/1e9+"s"); } diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml index 35e5def4b04..4e4d436f4ce 100644 --- a/scripts/pipelines/scripts/enumerateLogical.dml +++ b/scripts/pipelines/scripts/enumerateLogical.dml @@ -52,32 +52,30 @@ source("scripts/builtin/bandit.dml") as bandit; enumerateLogical = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] Xtest, Matrix[Double] ytest, - Frame[Unknown] initial_population, Integer seed = -1, Integer max_iter=10, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp, + Frame[Unknown] initial_population, Frame[String] refSol = as.frame("NaN"), Integer seed = -1, Integer max_iter=10, 
List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp, Frame[Unknown] primitives, Frame[Unknown] param, Double dirtyScore = 79, Boolean cv=FALSE, Boolean cvk=3, Boolean verbose, List[Unknown] ctx=list(prefix="----")) -return (Frame[Unknown] output, boolean converged, Double refChanges) +return (Frame[Unknown] output, boolean converged, Double refChanges, Frame[Unknown] acc) { finalOutput = list() mask = as.matrix(metaList['mask']) - num_exec = 1 prefix = as.scalar(ctx["prefix"]); iter = 1 populationLength = 0 converged = FALSE start = 1; end = 0; - [allOps, ref] = getOps(param[, 2], as.scalar(metaList['distY']), nrow(y), min(y)) + [allOps, ref] = getOps(param[, 2], refSol, as.scalar(metaList['distY']), nrow(y), min(y)) # unrolled by physical pipelines pipelines = frame(0, rows=nrow(primitives)^ncol(primitives), cols=max(ncol(initial_population), ncol(ref))) - parfor(i in 1:nrow(initial_population), check = 0) { + for(i in 1:nrow(initial_population)) { pconf = bandit::get_physical_configurations(initial_population[i], 0, primitives) end = end + nrow(pconf) pipelines[start:end, 1:ncol(pconf)] = pconf start = end + 1 } - pipelines = removeEmpty(target = pipelines, margin="rows") if(sum(mask) > 0) { @@ -94,11 +92,11 @@ return (Frame[Unknown] output, boolean converged, Double refChanges) while(!converged & iter <= max_iter) { populationLength = max(populationLength, ncol(population)) - id = seq(1, nrow(population)) + id = matrix(seq(1, nrow(population)*2), rows=nrow(population), cols=2) print(prefix+" EnumLP iteration "+iter+"/"+as.integer(max_iter)+":" ); # # # execute the physical pipelines - [outPip, outHp, p, refChanges] = bandit::run_with_hyperparam(cbind(as.frame(id), population), - num_exec, X, y, Xtest, ytest, metaList, evaluationFunc, evalFunHp, param, cv, cvk, 0, FALSE, TRUE) + [outPip, outHp, p, refChanges] = bandit::run_with_hyperparam(ph_pip=cbind(as.frame(id), population), + X=X, Y=y, Xtest=Xtest, Ytest=ytest, metaList=metaList, 
evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, cv=cv, cvk=cvk, seed=seed, default=TRUE) # # sort the configurations score-wise actPip = cbind(as.frame(outPip[, 1]), as.frame(refChanges)) actPip = cbind(actPip, population) @@ -115,7 +113,7 @@ return (Frame[Unknown] output, boolean converged, Double refChanges) children = frame(0, rows=populationSize, cols=ncol(sortedPipelines)) sortedPipelines = sortedPipelines[, 3:ncol(sortedPipelines)] # # randomly pick the pipelines for transitions - pipRand = sample(nrow(sortedPipelines), populationSize, TRUE) + pipRand = sample(nrow(sortedPipelines), populationSize, TRUE, seed) if(!converged) { parfor(i in 1:nrow(children), check=0) { idxR = (nrow(children) * (iter - 1)) + i @@ -129,7 +127,7 @@ return (Frame[Unknown] output, boolean converged, Double refChanges) if(random == 1) c1 = addition(top, allOps[as.scalar(opToAdd[idxR])]) else if(random == 2) - c1 = mutation(top) + c1 = mutation(top, seed) else if(random == 3) c1 = removal(top, as.scalar(opToRemove[idxR])) @@ -159,22 +157,30 @@ return (Frame[Unknown] output, boolean converged, Double refChanges) output = removeEmpty(target=output, margin="rows") output = frameSort(output, sort_mask, FALSE) refChanges = as.double(as.scalar(output[nrow(output), 2])) - halfing = max(floor(nrow(output)/2), 1) - output = output[halfing:nrow(output), 3:ncol(output)] + # halfing = max(floor(nrow(output)/2), 1) + acc = output[, 1] + # print("# of logical pipelines: "+nrow(output)) + # print("max accuracy: "+toString(acc)) + output = output[,3:ncol(output)] + # print("logical pipelines: "+toString(output)) } -addition = function(Frame[Unknown] top, Frame[Unknown] allOps) -return (Frame [Unknown] child) +addition = function(Frame[Unknown] top, Frame[Unknown] opToAdd) +return (Frame[Unknown] child) { - child = cbind(allOps, top) + # # # never add same operation adjacent to each other + if(as.scalar(top[1,1]) != as.scalar(opToAdd[1,1])) + child = cbind(opToAdd, top) + else + child 
= cbind(top, opToAdd) } -mutation = function(Frame[Unknown] child) +mutation = function(Frame[Unknown] child, Integer seed) return (Frame [Unknown] mChild) { if(ncol(child) >= 2) { - r = sample(ncol(child), 2) + r = sample(ncol(child), 2, FALSE, seed) r1 = as.scalar(r[1,1]) r2 = as.scalar(r[2,1]) temp = child[1, r1] @@ -195,8 +201,8 @@ return (Frame[Unknown] child) } } -getOps = function( Frame[string] allOps, Integer dist, Integer n, Integer minValue) - return (Frame[String] allOps, Frame[String] ref) { +getOps = function( Frame[string] allOps, Frame[String] refSol, Integer dist, Integer n, Integer minValue) + return (Frame[String] allOps, Frame[String] refSol) { # # # TODO fix the following hard-coded condition by taking a file input # # allOps are the operation which are randomly added to a population, for now I am reusing param file @@ -214,5 +220,7 @@ getOps = function( Frame[string] allOps, Integer dist, Integer n, Integer minVal # & !x.equals(\"mice\") & !x.equals(\"dbscan\") ref = frame(["imputeByMean", "winsorize", "scale"], rows=1, cols=3) } + if(as.scalar(refSol[1,1]) == "NaN") + refSol = ref allOps = removeEmpty(target=allOps, margin="rows") } \ No newline at end of file diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml index 4a9df756458..f6060e6a0f5 100644 --- a/scripts/pipelines/scripts/utils.dml +++ b/scripts/pipelines/scripts/utils.dml @@ -202,14 +202,12 @@ return(Frame[Unknown] data) # step 2 fix invalid lengths # q0 = 0.05 # q1 = 0.95 - # print(prefix+" fixing invalid lengths between "+q0+" and "+q1+" quantile"); - # [train, mask, qlow, qup] = fixInvalidLengths(train, mask, q0, q1) + # [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1) # step 3 fix swap values - # print(prefix+" value swap fixing"); - # train = valueSwap(train, schema) + # data = valueSwap(data, schema) # step 3 drop invalid types data = dropInvalidType(data, schema) @@ -236,17 +234,4 @@ return(Frame[Unknown] data) data[, idx] = 
map(data[, idx], "x -> UtilFunctions.getTimestamp(x)", margin=2) } } -} - - - - - - - - - - - - - +} \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv index 714d5357b5d..d3961daae3c 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv @@ -1,3 +1,3 @@ -imputeByMeanApply,winsorizeApply,scaleApply,dummycodingApply -imputeByMeanApply,winsorizeApply,scaleApply,dummycodingApply -imputeByMeanApply,winsorizeApply,scaleApply,dummycodingApply +winsorizeApply,NA,imputeByMedianApply,dummycodingApply,0 +imputeByFdApply,winsorizeApply,imputeByMedianApply,NA,dummycodingApply +NA,imputeByMedianApply,dummycodingApply,0,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv index ba11a229c21..cb6a8ae3001 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv @@ -1,3 +1,3 @@ -68.84057971014492 -68.84057971014492 -68.65942028985508 +88.28828828828829 +88.28828828828829 +87.38738738738738 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv index 4e5b1a5042c..d70d1d19535 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv @@ -1 +1 @@ -61.050724637681164 \ No newline at end of file +71.17117117117117 \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv 
b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv index 5dba366c69e..5e02cbb7d5f 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv @@ -1,3 +1,3 @@ -32.0,0,0,0,1.0,0,0,0,2.0,2.0,0.017866171174338655,0.9722754538748367,0,0,0,1.0,0,2.0,0,1.0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -32.0,0,0,0,1.0,0,0,0,2.0,2.0,0.04096822615526508,0.9724536097500497,0,0,0,1.0,0,2.0,0,1.0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -32.0,0,0,0,1.0,0,0,0,2.0,2.0,0.04922407814925073,0.973233625102309,0,0,0,1.0,0,2.0,1.0,1.0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +32.0,2.0,0.04153685307086108,0.9894210663385763,0,0,0,1.0,0,1.0,0.49421066338576347,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +40.0,1.0,0.8365263980314581,0,0,1.0,0,0,1.0,2.0,0.04153685307086108,0.9894210663385763,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,1.0,0.49421066338576347,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +21.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv index 3738bc56fd1..f1492ae0577 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv +++ 
b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv @@ -1,3 +1,3 @@ -imputeByMean,winsorize,scale,dummycoding -imputeByMean,winsorize,scale,dummycoding -imputeByMean,winsorize,scale,dummycoding +winsorize,underSampling,imputeByMedian,dummycoding,0 +imputeByFd,winsorize,imputeByMedian,underSampling,dummycoding +underSampling,imputeByMedian,dummycoding,0,0 diff --git a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml index 56a82c8bec9..3ccb2c0f7b9 100644 --- a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml +++ b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml @@ -57,9 +57,9 @@ metaInfo = metaInfo[, 2:ncol(metaInfo)] # # # split in train/test 70/30 # [topKPipelines, topKHyperParams, topKScores, bestLogical, features, dirtyScore, evalHp] = -result = topk_cleaning(dataTrain=trainData, dataTest=testData, metaData=metaInfo, primitives=primitives, parameters=param, +result = topk_cleaning(dataTrain=trainData, dataTest=testData, metaData=metaInfo, primitives=primitives, parameters=param, refSol = frame(["imputeByMean", "scale", "dummycoding"], rows=1, cols=3), evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),topK=topK, resource_val=resources, - expectedIncrease=expectedIncrease, seed = 42, max_iter=max_iter, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output) + expectedIncrease=expectedIncrease, seed = 23, max_iter=max_iter, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output) write(result, $O) @@ -101,4 +101,45 @@ return(Matrix[Double] output, Matrix[Double] error) accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) { [M,yhat,acc] = multiLogRegPredict(X=X, B=B, Y=y, verbose=FALSE); err = as.matrix(1-(acc/100)); +} + + +evalClassificationMSVM = function(Matrix[Double] X, Matrix[Double] 
Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0), + Matrix[Double] evalFunHp) +return(Matrix[Double] output, Matrix[Double] error) +{ + if(is.na(as.scalar(evalFunHp[1,1]))) + { + nc = max(Y); + params = list("intercept", "lambda", "epsilon") + paramRanges = list(seq(0, 1), 10^seq(1,-3), 10^seq(1,-5)); + trainArgs = list(X=X, Y=Y, intercept=-1, lambda=-1, epsilon=-1, maxIterations=1000, verbose=FALSE); + dataArgs = list("X", "Y"); + [B1,opt] = gridSearch(X=X, y=Y, train="msvm", predict="accuracyMSVM", numB=(ncol(X)+1)*(nc), + params=params, paramValues=paramRanges, dataArgs=dataArgs, trainArgs=trainArgs, cv=TRUE, cvk=3, verbose=TRUE); + evalFunHp = as.matrix(opt) # opt # + # opt = matrix("2 10 0.001", rows=1, cols=3) + # evalFunHp = opt + } + if(min(Y) == max(Y)) + { + accuracy = as.matrix(0) + a = 0 + } + else { + beta = msvm(X=X, Y=Y, intercept=as.scalar(evalFunHp[1,1]), lambda=as.scalar(evalFunHp[1,2]), epsilon=as.scalar(evalFunHp[1,3]), + maxIterations=1000, verbose=FALSE); + yhat = msvmPredict(X=Xtest, W=beta); + yhat = rowIndexMax(yhat) + accuracy = mean(yhat == Ytest) * 100 + error = yhat != Ytest + accuracy = as.matrix(accuracy) + } + output = cbind(accuracy, evalFunHp) +} +accuracyMSVM = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) { + yhat = msvmPredict(X=X, W=B); + yhat = rowIndexMax(yhat) + acc = mean(yhat == y) + err = as.matrix(1-(acc)); } \ No newline at end of file