diff --git a/scripts/builtin/bandit.dml b/scripts/builtin/bandit.dml index d01755603e2..299b404e3e4 100644 --- a/scripts/builtin/bandit.dml +++ b/scripts/builtin/bandit.dml @@ -53,7 +53,7 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Double] X_test, Matrix[Double] Y_test, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp, Frame[Unknown] lp, Frame[Unknown] primitives, Frame[Unknown] param, Integer k = 3, - Integer R=50, Double baseLineScore, Boolean cv, Integer cvk = 2, Double ref = 0, Boolean enablePruning = FALSE, Boolean verbose = TRUE, String output="") + Integer R=50, Double baseLineScore, Boolean cv, Integer cvk = 2, Double ref = 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean verbose = TRUE, String output="") return(Boolean perf) # return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams, Matrix[Double] bestAccuracy, Frame[String] feaFrameOuter) { @@ -61,38 +61,47 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl totalPruneCount = 0 FLAG_VARIABLE = 5 pipelines_executed = 0 - HYPERPARAM_LENGTH = (ncol(lp) * FLAG_VARIABLE * 3) + 1 ## num of col in logical * 5 meat flag vars * max hyperparam per op + 1 accuracy col + HYPERPARAM_LENGTH = ((ncol(lp) + 2) * FLAG_VARIABLE * 3) + 1 ## num of col in logical * 5 meat flag vars * max hyperparam per op + 1 accuracy col bestPipeline = frame("", rows=1, cols=1) bestHyperparams = as.matrix(0) bestAccuracy = as.matrix(0) # initialize bandit variables # variable names follow publication where algorithm is introduced eta = 2 # the halving ratio is fixed to 2 - s_max = floor(log(R,eta)) - 1; + s_max = floor(log(R,eta)); + # # compute weights for R and then increase/decrease R with respect to importance of configurations + + weight = matrix(1/s_max , rows=s_max, cols=1) + weight = cumsum(weight) + # weight = matrix(1, rows=s_max, cols=1) + # print("weight matrix: "+toString(weight)) # initialize output variables - hparam = matrix(0, rows=k*(s_max+1), cols=HYPERPARAM_LENGTH) - pipeline = frame(0, rows=k*(s_max+1), cols=ncol(lp)+1) - endIdx = matrix(k, rows=(s_max+1), cols=1) + hparam = matrix(0, rows=k*(s_max), cols=HYPERPARAM_LENGTH) + pipeline = matrix(0, rows=k*(s_max), cols=4) + endIdx = matrix(k, rows=(s_max), cols=1) endIdx = cumsum(endIdx) startIdx = (endIdx - k) + 1 - n = ifelse(s_max >= nrow(lp), nrow(lp), n = ceil(nrow(lp)/(s_max + 1));) - - for(s in s_max:0) { # TODO convert to parfor + n = ifelse(s_max >= nrow(lp), nrow(lp), n = ceil(nrow(lp)/s_max);) + pipelineId = as.frame(seq(1, nrow(lp))) + lp = cbind(pipelineId, lp) + mainLookup = lp + B = (s_max + 1) * R; + s_max = s_max - 1 + idx = 1 + for(s in s_max:0, check = 0) { # TODO convert to parfor # result variables bracket_hp = matrix(0, rows=k*(s+1)+k, cols=HYPERPARAM_LENGTH) - bracket_pipel = matrix(0, rows=k*(s+1)+k, cols=3) + bracket_pipel = matrix(0, rows=k*(s+1)+k, cols=4) start=1; end=0; - # # compute the number of initial pipelines n - r = R * eta^(-s); + r = max(R * as.scalar(weight[((s_max - s) + 1)]) * eta^(-s), 1); configurations = lp[1:(min(n, nrow(lp)))] + # append configuration keys for extracting the pipeline later on id = seq(1, nrow(configurations)) configurations = cbind(as.frame(id), configurations) - # save the original configuration as a lookup table - lookup = configurations for(i in 0:s) { # successive halving @@ -107,11 +116,13 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl configurations = configurations[1:n_i, ] pipelines_executed = pipelines_executed + (n_i * r_i) [outPip,outHp, pruneCount] = run_with_hyperparam(ph_pip=configurations, r_i=r_i, X=X_train, Y=Y_train, Xtest=X_test, Ytest=Y_test, metaList=metaList, - evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, cv=cv, cvk=cvk, ref=ref, enablePruning=enablePruning) + evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, cv=cv, cvk=cvk, ref=ref, seed = seed, enablePruning=enablePruning) totalPruneCount = totalPruneCount + pruneCount # sort the pipelines by order of accuracy decreasing - a = order(target = outPip, by = 1, decreasing=TRUE, index.return=FALSE) - b = order(target = outHp, by = 1, decreasing=TRUE, index.return=FALSE) + IX = order(target = outPip, by = 1, decreasing=TRUE, index.return=TRUE) + P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(outPip)); + a = P %*% outPip + b = P %*% outHp rowIndex = min(k, nrow(a)) # maintain the brackets results @@ -122,8 +133,9 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl # sort the configurations for successive halving avergae_perf = getMaxPerConf(outPip, nrow(configurations)) - sortMask = matrix(1, rows=1, cols=ncol(configurations)) - configurations = frameSort(cbind(avergae_perf, configurations), cbind(as.matrix(0), sortMask), TRUE) + sortMask = matrix(1, rows=1, cols=ncol(configurations) + 1) + sortMask[1,1] = 0 + configurations = frameSort(cbind(avergae_perf, configurations), sortMask, TRUE) configurations = configurations[, 2:ncol(configurations)] } if(n < nrow(lp)) @@ -131,17 +143,17 @@ m_bandit = function(Matrix[Double] X_train, Matrix[Double] Y_train, Matrix[Doubl bracket_pipel = removeEmpty(target=bracket_pipel, margin="rows") bracket_hp = removeEmpty(target=bracket_hp, margin="rows") # keep the best k results for each bracket - [bracket_bestPipeline, bracket_bestHyperparams] = extractBracketWinners(bracket_pipel, bracket_hp, k, lookup) + [bracket_bestPipeline, bracket_bestHyperparams] = extractBracketWinners(bracket_pipel, bracket_hp, k) # optimize by the features - startOut = as.scalar(startIdx[s+1]) - endOut = min(as.scalar(endIdx[s+1]), (startOut + nrow(bracket_bestPipeline) - 1)) + startOut = as.scalar(startIdx[idx]) + endOut = min(as.scalar(endIdx[idx]), (startOut + nrow(bracket_bestPipeline) - 1)) pipeline[startOut:endOut, ] = bracket_bestPipeline hparam[startOut:endOut, 1:ncol(bracket_bestHyperparams)] = bracket_bestHyperparams + idx = idx + 1 + # print("bracket best: \n"+toString(bracket_bestPipeline)) } - [bestPipeline, bestHyperparams] = extractTopK(pipeline, hparam, baseLineScore, k) - bestAccuracy = as.matrix(bestPipeline[,1]) - bestHyperparams = bestHyperparams[,2:ncol(bestHyperparams)] - bestPipeline = bestPipeline[, 2:ncol(bestPipeline)] + [bestPipeline, bestHyperparams, bestAccuracy] = extractTopK(pipeline, hparam, baseLineScore, k, mainLookup) + imp = as.double(as.scalar(bestAccuracy[1, 1])) - as.double(baseLineScore) perf = imp > 0 applyFunc = bestPipeline @@ -212,31 +224,32 @@ get_physical_configurations = function(Frame[String] logical, Scalar[int] numCon } # this method will call the execute pipelines with their hyper-parameters -run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double] X, Matrix[Double] Y, +run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i = 1, Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp, - Frame[Unknown] param, Boolean cv, Integer cvk = 2, Double ref = 0, Boolean enablePruning = FALSE, Boolean default = FALSE) + Frame[Unknown] param, Boolean cv = FALSE, Integer cvk = 2, Double ref = 0, Integer seed = -1, Boolean enablePruning = FALSE, Boolean default = FALSE) return (Matrix[Double] output_operator, Matrix[Double] output_hyperparam, Integer pruneCount, Matrix[Double] changesByPipMatrix) { + # # # TODO there is a partial overlap but it is negligible so we will not rewrite the scripts but lineage based reuse will get rid of it changesByPipMatrix = matrix(0, rows=nrow(ph_pip) * r_i, cols=1) pruneCount = 0 - output_hp = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)-1) * 5 * 3) + output_hp = matrix(0, nrow(ph_pip)*r_i, (ncol(ph_pip)) * 5 * 3) output_accuracy = matrix(0, nrow(ph_pip)*r_i, 1) - output_pipelines = matrix(0, nrow(ph_pip)*r_i, 2) + output_pipelines = matrix(0, nrow(ph_pip)*r_i, 3) # rows in validation set clone_X = X clone_Y = Y clone_Xtest = Xtest clone_Ytest = Ytest index = 1 - id = as.matrix(ph_pip[, 1]) - ph_pip = ph_pip[, 2:ncol(ph_pip)] + ids = as.matrix(ph_pip[, 1:2]) + ph_pip = ph_pip[, 3:ncol(ph_pip)] parfor(i in 1:nrow(ph_pip), check = 0) { # execute configurations with r resources op = removeEmpty(target=ph_pip[i], margin="cols") - print("PIPELINE EXECUTION START ... "+toString(op)) - [hp, applyFunctions, no_of_res, no_of_flag_vars] = getHyperparam(op, param, r_i, default, enablePruning) + # print("PIPELINE EXECUTION START ... "+toString(op)) + [hp, applyFunctions, no_of_res, no_of_flag_vars] = getHyperparam(op, param, r_i, default, seed, enablePruning) hpForPruning = matrix(0, rows=1, cols=ncol(op)) changesByOp = matrix(0, rows=1, cols=ncol(op)) metaList2 = metaList; #ensure metaList is no result var @@ -279,13 +292,12 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double } # evalFunOutput = eval(evaluationFunc, argList) - accT = floor((time() - t1) / 1e+6) matrix_width = as.matrix(nrow(hp_matrix) * ncol(hp_matrix)) hp_vec = cbind(matrix_width, matrix(hp_matrix, rows=1, cols=nrow(hp_matrix)*ncol(hp_matrix), byrow=TRUE)) index = (i - 1) * no_of_res + r output_accuracy[index, 1] = accuracy output_hp[index, 1:ncol(hp_vec)] = hp_vec - output_pipelines[index, ] = cbind(as.matrix(index), id[i,1]) + output_pipelines[index, ] = cbind(as.matrix(index), ids[i,1:2]) } else { @@ -303,10 +315,9 @@ run_with_hyperparam = function(Frame[Unknown] ph_pip, Integer r_i, Matrix[Double } # extract the hyper-parameters for pipelines -getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList, Integer no_of_res, Boolean default, Boolean enablePruning) +getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList, Integer no_of_res, Boolean default, Integer seed = -1, Boolean enablePruning) return (Matrix[Double] paramMatrix, Frame[Unknown] applyFunc, Integer no_of_res, Integer NUM_META_FLAGS) { - allParam = 0; NUM_META_FLAGS = 5 NUM_DEFAULT_VALUES = 4 @@ -330,7 +341,7 @@ getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList, Intege # this matrix stores no. of hps, values of hps, and flags paramMatrix = matrix(0, rows=ncol(pipeline)*no_of_res, cols=max(paramCount)+NUM_META_FLAGS+1) - for(i in 1:ncol(pipeline)) { + parfor(i in 1:ncol(pipeline), check=0) { op = as.scalar(pipeline[1, i]) index = as.scalar(indexes[i]) no_of_param = as.integer(as.scalar(paramCount[i])) @@ -354,21 +365,21 @@ getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList, Intege minVal = as.scalar(hpList[index, paramValIndex]) maxVal = as.scalar(hpList[index, paramValIndex + 1]) if(type == "FP") { - val = rand(rows=no_of_res, cols=1, min=minVal, max=maxVal, pdf="uniform"); + val = rand(rows=no_of_res, cols=1, min=minVal, max=maxVal, pdf="uniform", seed=seed); OpParam[, j] = val; } else if(type == "INT") { if(as.integer(maxVal) > no_of_res) - val = sample(as.integer(maxVal), no_of_res, FALSE) + val = sample(as.integer(maxVal), no_of_res, FALSE, seed) else - val = sample(as.integer(maxVal), no_of_res, TRUE) + val = sample(as.integer(maxVal), no_of_res, TRUE, seed) less_than_min = val < as.integer(minVal); val = (less_than_min * minVal) + val; OpParam[, j] = val; } else if(type == "BOOL") { if(maxVal == 1) { - s = sample(2, no_of_res, TRUE); + s = sample(2, no_of_res, TRUE, seed); b = s - 1; OpParam[, j] = b; } @@ -400,25 +411,43 @@ getHyperparam = function(Frame[Unknown] pipeline, Frame[Unknown] hpList, Intege # extract the top k pipelines as a final result after deduplication and sorting -extractTopK = function(Frame[Unknown] pipeline, Matrix[Double] hyperparam, - Double baseLineScore, Integer k) - return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams) +extractTopK = function(Matrix[Double] pipeline, Matrix[Double] hyperparam, + Double baseLineScore, Integer k, Frame[Unknown] mainLookup) + return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams, Matrix[Double] bestAccuracy) { - hyperparam = order(target = hyperparam, by = 1, decreasing=TRUE, index.return=FALSE) - pipeline = frameSort(pipeline, cbind(as.matrix(0), matrix(1, rows=1, cols=ncol(pipeline) - 1)), TRUE) + IX = order(target = hyperparam, by = 1, decreasing=TRUE, index.return=TRUE) + P = table(seq(1,nrow(IX)), IX, nrow(IX), nrow(hyperparam)); + hyperparam = P %*% hyperparam + pipeline = P %*% pipeline + # remove the row with accuracy less than test accuracy mask = (hyperparam[, 1] < baseLineScore) == 0 + if(sum(mask) == 0) + mask[1, 1] = 1 hyperparam = removeEmpty(target = hyperparam, margin = "rows", select = mask) + pipeline = removeEmpty(target = pipeline, margin = "rows", select = mask) + rowIndex = min(nrow(hyperparam), k) # select the top k - bestPipeline = pipeline[1:rowIndex,] - bestHyperparams = hyperparam[1:rowIndex,] + bestAccuracy = pipeline[1:rowIndex, 1] + bestHyperparams = hyperparam[1:rowIndex, 2:ncol(hyperparam)] + pipeline = pipeline[1:rowIndex] + # # # lookup for the pipelines + pipCode = pipeline[, ncol(pipeline)] + + bestPipeline = frame(data="0", rows=nrow(pipeline), cols=ncol(mainLookup)) + parfor(i in 1: nrow(pipeline)) { + index = as.scalar(pipCode[i]) + bestPipeline[i] = mainLookup[index] + } + + bestPipeline = bestPipeline[, 2:ncol(bestPipeline)] + } # extract the top k pipelines for each bracket, the intermediate results -extractBracketWinners = function(Matrix[Double] pipeline, Matrix[Double] hyperparam, - Integer k, Frame[String] conf) - return (Frame[Unknown] bestPipeline, Matrix[Double] bestHyperparams) +extractBracketWinners = function(Matrix[Double] pipeline, Matrix[Double] hyperparam, Integer k) + return (Matrix[Double] bestPipeline, Matrix[Double] bestHyperparams) { # bestPipeline = frameSort(bestPipeline) hyperparam = order(target = hyperparam, by = 1, decreasing=TRUE, index.return=FALSE) @@ -427,12 +456,7 @@ extractBracketWinners = function(Matrix[Double] pipeline, Matrix[Double] hyperpa pipeline = pipeline[1:rowIndex,] bestHyperparams = hyperparam[1:rowIndex,] - bestPipeline = frame(data="0", rows=nrow(pipeline), cols=ncol(conf)) - parfor(i in 1: nrow(pipeline)) { - index = as.scalar(pipeline[i, 3]) - bestPipeline[i] = conf[index] - bestPipeline[i, 1] = as.frame(pipeline[i, 1]) - } + bestPipeline = pipeline[1:rowIndex] } ########################################################################### diff --git a/scripts/builtin/executePipeline.dml b/scripts/builtin/executePipeline.dml index 7226b761b49..a606df9a465 100644 --- a/scripts/builtin/executePipeline.dml +++ b/scripts/builtin/executePipeline.dml @@ -375,8 +375,8 @@ SMOTE = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] mask, Integ return (Matrix[Double] X, Matrix[Double] Y) { # get the class count - for(k in 1:max(Y)) { - classes = table(Y, 1) + classes = table(Y, 1) + for(k in 1:nrow(classes) - 1) { minClass = min(classes) maxClass = max(classes) diff = (maxClass - minClass)/sum(classes) diff --git a/scripts/builtin/msvm.dml b/scripts/builtin/msvm.dml index 477528866d7..921d694e447 100644 --- a/scripts/builtin/msvm.dml +++ b/scripts/builtin/msvm.dml @@ -53,7 +53,12 @@ m_msvm = function(Matrix[Double] X, Matrix[Double] Y, Boolean intercept = FALSE, stop("MSVM: Invalid Y input, containing negative values") if(verbose) print("Running Multiclass-SVM") - + # Robustness for datasets with missing values (causing NaN gradients) + numNaNs = sum(isNaN(X)) + if( numNaNs > 0 ) { + print("msvm: matrix X contains "+numNaNs+" missing values, replacing with 0.") + X = replace(target=X, pattern=NaN, replacement=0); + } num_rows_in_w = ncol(X) if(intercept) { # append once, and call l2svm always with intercept=FALSE diff --git a/scripts/builtin/msvmPredict.dml b/scripts/builtin/msvmPredict.dml index 4c7460ffdb7..d34d086eddf 100644 --- a/scripts/builtin/msvmPredict.dml +++ b/scripts/builtin/msvmPredict.dml @@ -41,6 +41,12 @@ m_msvmPredict = function(Matrix[Double] X, Matrix[Double] W) return(Matrix[Double] YRaw, Matrix[Double] Y) { + # Robustness for datasets with missing values + numNaNs = sum(isNaN(X)) + if( numNaNs > 0 ) { + print("msvm: matrix X contains "+numNaNs+" missing values, replacing with 0.") + X = replace(target=X, pattern=NaN, replacement=0); + } if(ncol(X) != nrow(W)){ if(ncol(X) + 1 != nrow(W)){ stop("MSVM Predict: Invalid shape of W ["+ncol(W)+","+nrow(W)+"] or X ["+ncol(X)+","+nrow(X)+"]") diff --git a/scripts/builtin/topk_cleaning.dml b/scripts/builtin/topk_cleaning.dml index e6739d8bca4..1b983faa0bf 100644 --- a/scripts/builtin/topk_cleaning.dml +++ b/scripts/builtin/topk_cleaning.dml @@ -57,8 +57,9 @@ source("scripts/pipelines/scripts/enumerateLogical.dml") as lg; source("scripts/builtin/bandit.dml") as bandit; s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = as.frame("NULL"), Frame[Unknown] metaData = as.frame("NULL"), Frame[Unknown] primitives, - Frame[Unknown] parameters, String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, Integer resource_val = 20, Integer max_iter = 10, Double sample = 1.0, - Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean enablePruning = FALSE, String output) + Frame[Unknown] parameters, Frame[String] refSol = as.frame("NaN"), String evaluationFunc, Matrix[Double] evalFunHp, Integer topK = 5, Integer resource_val = 20, Integer max_iter = 10, + Double sample = 1.0, Double expectedIncrease=1.0, Integer seed = -1, Boolean cv=TRUE, Integer cvk = 2, Boolean isLastLabel = TRUE, Boolean correctTypos=FALSE, Boolean enablePruning = FALSE, + String output) return(Boolean perf) # return (Frame[Unknown] topKPipelines, Matrix[Double] topKHyperParams, Matrix[Double] topKScores, Frame[Unknown] bestLogical, # Frame[Unknown] features, Double dirtyScore, Matrix[Double] evalFunHp) @@ -138,18 +139,17 @@ s_topk_cleaning = function(Frame[Unknown] dataTrain, Frame[Unknown] dataTest = a metaList['distY'] = dist print("-- Cleaning - Enum Logical Pipelines: "); - [bestLogical, con, refChanges] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest, - initial_population=logical, seed = seed, max_iter=max_iter, metaList = metaList, + [bestLogical, con, refChanges, acc] = lg::enumerateLogical(X=eXtrain, y=eYtrain, Xtest=eXtest, ytest=eYtest, + initial_population=logical, refSol=refSol, seed = seed, max_iter=max_iter, metaList = metaList, evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, primitives=primitives, param=parameters, dirtyScore = (dirtyScore + expectedIncrease), cv=cv, cvk=cvk, verbose=TRUE, ctx=ctx) t6 = time(); print("---- finalized in: "+(t6-t5)/1e9+"s"); - topKPipelines = as.frame("NULL"); topKHyperParams = matrix(0,0,0); topKScores = matrix(0,0,0); features = as.frame("NULL") # # [topKPipelines, topKHyperParams, topKScores, features] = perf = bandit(X_train=eXtrain, Y_train=eYtrain, X_test=eXtest, Y_test=eYtest, metaList=metaList, evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, lp=bestLogical, primitives=primitives, param=parameters, baseLineScore=dirtyScore, - k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, enablePruning = enablePruning, output=output, verbose=TRUE); + k=topK, R=resource_val, cv=cv, cvk=cvk, ref=refChanges, seed=seed, enablePruning = enablePruning, output=output, verbose=TRUE); t7 = time(); print("-- Cleaning - Enum Physical Pipelines: "+(t7-t6)/1e9+"s"); } diff --git a/scripts/pipelines/scripts/enumerateLogical.dml b/scripts/pipelines/scripts/enumerateLogical.dml index 35e5def4b04..4e4d436f4ce 100644 --- a/scripts/pipelines/scripts/enumerateLogical.dml +++ b/scripts/pipelines/scripts/enumerateLogical.dml @@ -52,32 +52,30 @@ source("scripts/builtin/bandit.dml") as bandit; enumerateLogical = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] Xtest, Matrix[Double] ytest, - Frame[Unknown] initial_population, Integer seed = -1, Integer max_iter=10, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp, + Frame[Unknown] initial_population, Frame[String] refSol = as.frame("NaN"), Integer seed = -1, Integer max_iter=10, List[Unknown] metaList, String evaluationFunc, Matrix[Double] evalFunHp, Frame[Unknown] primitives, Frame[Unknown] param, Double dirtyScore = 79, Boolean cv=FALSE, Boolean cvk=3, Boolean verbose, List[Unknown] ctx=list(prefix="----")) -return (Frame[Unknown] output, boolean converged, Double refChanges) +return (Frame[Unknown] output, boolean converged, Double refChanges, Frame[Unknown] acc) { finalOutput = list() mask = as.matrix(metaList['mask']) - num_exec = 1 prefix = as.scalar(ctx["prefix"]); iter = 1 populationLength = 0 converged = FALSE start = 1; end = 0; - [allOps, ref] = getOps(param[, 2], as.scalar(metaList['distY']), nrow(y), min(y)) + [allOps, ref] = getOps(param[, 2], refSol, as.scalar(metaList['distY']), nrow(y), min(y)) # unrolled by physical pipelines pipelines = frame(0, rows=nrow(primitives)^ncol(primitives), cols=max(ncol(initial_population), ncol(ref))) - parfor(i in 1:nrow(initial_population), check = 0) { + for(i in 1:nrow(initial_population)) { pconf = bandit::get_physical_configurations(initial_population[i], 0, primitives) end = end + nrow(pconf) pipelines[start:end, 1:ncol(pconf)] = pconf start = end + 1 } - pipelines = removeEmpty(target = pipelines, margin="rows") if(sum(mask) > 0) { @@ -94,11 +92,11 @@ return (Frame[Unknown] output, boolean converged, Double refChanges) while(!converged & iter <= max_iter) { populationLength = max(populationLength, ncol(population)) - id = seq(1, nrow(population)) + id = matrix(seq(1, nrow(population)*2), rows=nrow(population), cols=2) print(prefix+" EnumLP iteration "+iter+"/"+as.integer(max_iter)+":" ); # # # execute the physical pipelines - [outPip, outHp, p, refChanges] = bandit::run_with_hyperparam(cbind(as.frame(id), population), - num_exec, X, y, Xtest, ytest, metaList, evaluationFunc, evalFunHp, param, cv, cvk, 0, FALSE, TRUE) + [outPip, outHp, p, refChanges] = bandit::run_with_hyperparam(ph_pip=cbind(as.frame(id), population), + X=X, Y=y, Xtest=Xtest, Ytest=ytest, metaList=metaList, evaluationFunc=evaluationFunc, evalFunHp=evalFunHp, param=param, cv=cv, cvk=cvk, seed=seed, default=TRUE) # # sort the configurations score-wise actPip = cbind(as.frame(outPip[, 1]), as.frame(refChanges)) actPip = cbind(actPip, population) @@ -115,7 +113,7 @@ return (Frame[Unknown] output, boolean converged, Double refChanges) children = frame(0, rows=populationSize, cols=ncol(sortedPipelines)) sortedPipelines = sortedPipelines[, 3:ncol(sortedPipelines)] # # randomly pick the pipelines for transitions - pipRand = sample(nrow(sortedPipelines), populationSize, TRUE) + pipRand = sample(nrow(sortedPipelines), populationSize, TRUE, seed) if(!converged) { parfor(i in 1:nrow(children), check=0) { idxR = (nrow(children) * (iter - 1)) + i @@ -129,7 +127,7 @@ return (Frame[Unknown] output, boolean converged, Double refChanges) if(random == 1) c1 = addition(top, allOps[as.scalar(opToAdd[idxR])]) else if(random == 2) - c1 = mutation(top) + c1 = mutation(top, seed) else if(random == 3) c1 = removal(top, as.scalar(opToRemove[idxR])) @@ -159,22 +157,30 @@ return (Frame[Unknown] output, boolean converged, Double refChanges) output = removeEmpty(target=output, margin="rows") output = frameSort(output, sort_mask, FALSE) refChanges = as.double(as.scalar(output[nrow(output), 2])) - halfing = max(floor(nrow(output)/2), 1) - output = output[halfing:nrow(output), 3:ncol(output)] + # halfing = max(floor(nrow(output)/2), 1) + acc = output[, 1] + # print("# of logical pipelines: "+nrow(output)) + # print("max accuracy: "+toString(acc)) + output = output[,3:ncol(output)] + # print("logical pipelines: "+toString(output)) } -addition = function(Frame[Unknown] top, Frame[Unknown] allOps) -return (Frame [Unknown] child) +addition = function(Frame[Unknown] top, Frame[Unknown] opToAdd) +return (Frame[Unknown] child) { - child = cbind(allOps, top) + # # # never add same operation adjacent to each other + if(as.scalar(top[1,1]) != as.scalar(opToAdd[1,1])) + child = cbind(opToAdd, top) + else + child = cbind(top, opToAdd) } -mutation = function(Frame[Unknown] child) +mutation = function(Frame[Unknown] child, Integer seed) return (Frame [Unknown] mChild) { if(ncol(child) >= 2) { - r = sample(ncol(child), 2) + r = sample(ncol(child), 2, FALSE, seed) r1 = as.scalar(r[1,1]) r2 = as.scalar(r[2,1]) temp = child[1, r1] @@ -195,8 +201,8 @@ return (Frame[Unknown] child) } } -getOps = function( Frame[string] allOps, Integer dist, Integer n, Integer minValue) - return (Frame[String] allOps, Frame[String] ref) { +getOps = function( Frame[string] allOps, Frame[String] refSol, Integer dist, Integer n, Integer minValue) + return (Frame[String] allOps, Frame[String] refSol) { # # # TODO fix the following hard-coded condition by taking a file input # # allOps are the operation which are randomly added to a population, for now I am reusing param file @@ -214,5 +220,7 @@ getOps = function( Frame[string] allOps, Integer dist, Integer n, Integer minVal # & !x.equals(\"mice\") & !x.equals(\"dbscan\") ref = frame(["imputeByMean", "winsorize", "scale"], rows=1, cols=3) } + if(as.scalar(refSol[1,1]) == "NaN") + refSol = ref allOps = removeEmpty(target=allOps, margin="rows") } \ No newline at end of file diff --git a/scripts/pipelines/scripts/utils.dml b/scripts/pipelines/scripts/utils.dml index 4a9df756458..f6060e6a0f5 100644 --- a/scripts/pipelines/scripts/utils.dml +++ b/scripts/pipelines/scripts/utils.dml @@ -202,14 +202,12 @@ return(Frame[Unknown] data) # step 2 fix invalid lengths # q0 = 0.05 # q1 = 0.95 - # print(prefix+" fixing invalid lengths between "+q0+" and "+q1+" quantile"); - # [train, mask, qlow, qup] = fixInvalidLengths(train, mask, q0, q1) + # [data, mask, qlow, qup] = fixInvalidLengths(data, mask, q0, q1) # step 3 fix swap values - # print(prefix+" value swap fixing"); - # train = valueSwap(train, schema) + # data = valueSwap(data, schema) # step 3 drop invalid types data = dropInvalidType(data, schema) @@ -236,17 +234,4 @@ return(Frame[Unknown] data) data[, idx] = map(data[, idx], "x -> UtilFunctions.getTimestamp(x)", margin=2) } } -} - - - - - - - - - - - - - +} \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv index 714d5357b5d..d3961daae3c 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv @@ -1,3 +1,3 @@ -imputeByMeanApply,winsorizeApply,scaleApply,dummycodingApply -imputeByMeanApply,winsorizeApply,scaleApply,dummycodingApply -imputeByMeanApply,winsorizeApply,scaleApply,dummycodingApply +winsorizeApply,NA,imputeByMedianApply,dummycodingApply,0 +imputeByFdApply,winsorizeApply,imputeByMedianApply,NA,dummycodingApply +NA,imputeByMedianApply,dummycodingApply,0,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv index ba11a229c21..cb6a8ae3001 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv @@ -1,3 +1,3 @@ -68.84057971014492 -68.84057971014492 -68.65942028985508 +88.28828828828829 +88.28828828828829 +87.38738738738738 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv index 4e5b1a5042c..d70d1d19535 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv @@ -1 +1 @@ -61.050724637681164 \ No newline at end of file +71.17117117117117 \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv index 5dba366c69e..5e02cbb7d5f 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv @@ -1,3 +1,3 @@ -32.0,0,0,0,1.0,0,0,0,2.0,2.0,0.017866171174338655,0.9722754538748367,0,0,0,1.0,0,2.0,0,1.0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -32.0,0,0,0,1.0,0,0,0,2.0,2.0,0.04096822615526508,0.9724536097500497,0,0,0,1.0,0,2.0,0,1.0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -32.0,0,0,0,1.0,0,0,0,2.0,2.0,0.04922407814925073,0.973233625102309,0,0,0,1.0,0,2.0,1.0,1.0,0,0,0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +32.0,2.0,0.04153685307086108,0.9894210663385763,0,0,0,1.0,0,1.0,0.49421066338576347,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +40.0,1.0,0.8365263980314581,0,0,1.0,0,0,1.0,2.0,0.04153685307086108,0.9894210663385763,0,0,0,1.0,0,0,0,0,1.0,0,0,0,2.0,1.0,0.49421066338576347,0,0,0,1.0,0,2.0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +21.0,1.0,0.49421066338576347,0,0,1.0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv index 3738bc56fd1..f1492ae0577 100644 --- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv +++ b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv @@ -1,3 +1,3 @@ -imputeByMean,winsorize,scale,dummycoding -imputeByMean,winsorize,scale,dummycoding -imputeByMean,winsorize,scale,dummycoding +winsorize,underSampling,imputeByMedian,dummycoding,0 +imputeByFd,winsorize,imputeByMedian,underSampling,dummycoding +underSampling,imputeByMedian,dummycoding,0,0 diff --git a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml index 56a82c8bec9..3ccb2c0f7b9 100644 --- a/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml +++ b/src/test/scripts/functions/pipelines/topkcleaningClassificationTest.dml @@ -57,9 +57,9 @@ metaInfo = metaInfo[, 2:ncol(metaInfo)] # # # split in train/test 70/30 # [topKPipelines, topKHyperParams, topKScores, bestLogical, features, dirtyScore, evalHp] = -result = topk_cleaning(dataTrain=trainData, dataTest=testData, metaData=metaInfo, primitives=primitives, parameters=param, +result = topk_cleaning(dataTrain=trainData, dataTest=testData, metaData=metaInfo, primitives=primitives, parameters=param, refSol = frame(["imputeByMean", "scale", "dummycoding"], rows=1, cols=3), evaluationFunc=evalFunc, evalFunHp=as.matrix(NaN),topK=topK, resource_val=resources, - expectedIncrease=expectedIncrease, seed = 42, max_iter=max_iter, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output) + expectedIncrease=expectedIncrease, seed = 23, max_iter=max_iter, cv=testCV, cvk=cvk, sample=sample, isLastLabel=TRUE, correctTypos=FALSE, output=output) write(result, $O) @@ -101,4 +101,45 @@ return(Matrix[Double] output, Matrix[Double] error) accuracy = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) { [M,yhat,acc] = multiLogRegPredict(X=X, B=B, Y=y, verbose=FALSE); err = as.matrix(1-(acc/100)); +} + + +evalClassificationMSVM = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] Xtest, Matrix[Double] Ytest, Matrix[Double] Xorig=as.matrix(0), + Matrix[Double] evalFunHp) +return(Matrix[Double] output, Matrix[Double] error) +{ + if(is.na(as.scalar(evalFunHp[1,1]))) + { + nc = max(Y); + params = list("intercept", "lambda", "epsilon") + paramRanges = list(seq(0, 1), 10^seq(1,-3), 10^seq(1,-5)); + trainArgs = list(X=X, Y=Y, intercept=-1, lambda=-1, epsilon=-1, maxIterations=1000, verbose=FALSE); + dataArgs = list("X", "Y"); + [B1,opt] = gridSearch(X=X, y=Y, train="msvm", predict="accuracyMSVM", numB=(ncol(X)+1)*(nc), + params=params, paramValues=paramRanges, dataArgs=dataArgs, trainArgs=trainArgs, cv=TRUE, cvk=3, verbose=TRUE); + evalFunHp = as.matrix(opt) # opt # + # opt = matrix("2 10 0.001", rows=1, cols=3) + # evalFunHp = opt + } + if(min(Y) == max(Y)) + { + accuracy = as.matrix(0) + a = 0 + } + else { + beta = msvm(X=X, Y=Y, intercept=as.scalar(evalFunHp[1,1]), lambda=as.scalar(evalFunHp[1,2]), epsilon=as.scalar(evalFunHp[1,3]), + maxIterations=1000, verbose=FALSE); + yhat = msvmPredict(X=Xtest, W=beta); + yhat = rowIndexMax(yhat) + accuracy = mean(yhat == Ytest) * 100 + error = yhat != Ytest + accuracy = as.matrix(accuracy) + } + output = cbind(accuracy, evalFunHp) +} +accuracyMSVM = function(Matrix[Double] X, Matrix[Double] y, Matrix[Double] B) return (Matrix[Double] err) { + yhat = msvmPredict(X=X, W=B); + yhat = rowIndexMax(yhat) + acc = mean(yhat == y) + err = as.matrix(1-(acc)); } \ No newline at end of file