From cc9ff3bdc5a0a6bd0cbcf495959590319532ab5c Mon Sep 17 00:00:00 2001 From: baunsgaard Date: Thu, 11 May 2023 13:43:52 +0200 Subject: [PATCH] [SYSTEMDS-3490] Compressed Transform Encode Binning This commit adds more compresed transform encode primitives. Among others the Bin to Dummy is included. Also added are some more generic tests of the transform encode primitive. Closes #1824 --- .gitignore | 1 + .../sysds/lops/OperatorOrderingUtils.java | 9 +- .../compress/CompressedMatrixBlock.java | 24 +-- .../compress/colgroup/ADictBasedColGroup.java | 25 +++- .../compress/colgroup/ColGroupConst.java | 10 +- .../colgroup/ColGroupSDCSingleZeros.java | 5 +- .../compress/colgroup/ColGroupSDCZeros.java | 4 +- .../dictionary/IdentityDictionary.java | 25 ++-- .../dictionary/IdentityDictionarySlice.java | 5 - .../compress/colgroup/offset/AOffset.java | 10 +- .../runtime/compress/io/WriterCompressed.java | 4 +- .../compress/lib/CLALibDecompress.java | 8 +- .../runtime/compress/lib/CLALibSlice.java | 63 +++++++- .../runtime/data/SparseBlockFactory.java | 16 ++ .../frame/data/columns/StringArray.java | 30 +++- .../transform/encode/ColumnEncoderBin.java | 92 +++++++----- .../encode/ColumnEncoderComposite.java | 11 ++ .../transform/encode/CompressedEncode.java | 78 ++++++++++ .../transform/encode/MultiColumnEncoder.java | 9 +- ...ava => TransformCompressedTestLogger.java} | 97 ++++++------ .../TransformCompressedTestMultiCol.java | 139 ++++++++++++++++++ .../TransformCompressedTestSingleCol.java | 138 +++++++++++++++++ ...ormCompressedTestSingleColBinSpecific.java | 130 ++++++++++++++++ ...stomTest.java => TransformCustomTest.java} | 6 +- ...BuiltinTopkCleaningClassificationTest.java | 2 +- .../TransformFrameEncodeApplyTest.java | 25 ++-- .../classification/applyFunc.csv | 3 - .../intermediates/classification/bestAcc.csv | 3 - .../classification/dirtyScore.csv | 1 - .../intermediates/classification/evalHp.csv | 1 - .../classification/featureFrame.csv | 1 - .../intermediates/classification/hp.csv | 3 - .../intermediates/classification/lp.csv | 37 ----- .../intermediates/classification/pip.csv | 3 - 34 files changed, 812 insertions(+), 206 deletions(-) rename src/test/java/org/apache/sysds/test/component/frame/transform/{transformCompressed.java => TransformCompressedTestLogger.java} (60%) create mode 100644 src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestMultiCol.java create mode 100644 src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestSingleCol.java create mode 100644 src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestSingleColBinSpecific.java rename src/test/java/org/apache/sysds/test/component/frame/transform/{transformCustomTest.java => TransformCustomTest.java} (95%) delete mode 100644 src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv delete mode 100644 src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv delete mode 100644 src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv delete mode 100644 src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv delete mode 100644 src/test/scripts/functions/pipelines/intermediates/classification/featureFrame.csv delete mode 100644 src/test/scripts/functions/pipelines/intermediates/classification/hp.csv delete mode 100644 src/test/scripts/functions/pipelines/intermediates/classification/lp.csv delete mode 100644 src/test/scripts/functions/pipelines/intermediates/classification/pip.csv diff --git a/.gitignore b/.gitignore index db5c0057338..5bb5d8a11a6 100644 --- a/.gitignore +++ b/.gitignore @@ -139,3 +139,4 @@ scripts/perftest/fed/temp src/test/scripts/functions/iogen/*.raw src/test/scripts/functions/pipelines/intermediates/regression/*.csv src/test/scripts/functions/pipelines/intermediates/regression/*.csv.mtd +src/test/scripts/functions/pipelines/intermediates/classification/* diff --git a/src/main/java/org/apache/sysds/lops/OperatorOrderingUtils.java b/src/main/java/org/apache/sysds/lops/OperatorOrderingUtils.java index add21f160f1..01acd6dfe47 100644 --- a/src/main/java/org/apache/sysds/lops/OperatorOrderingUtils.java +++ b/src/main/java/org/apache/sysds/lops/OperatorOrderingUtils.java @@ -19,16 +19,15 @@ package org.apache.sysds.lops; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Map; + import org.apache.sysds.common.Types; import org.apache.sysds.hops.AggBinaryOp; import org.apache.sysds.parser.DMLProgram; import org.apache.sysds.parser.StatementBlock; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Map; - public class OperatorOrderingUtils { // Return a list representation of all the lops in a SB diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java index 230019f68be..d1d4af99b0f 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java @@ -404,16 +404,18 @@ public void write(DataOutput out) throws IOException { if(nonZeros > 0 && estDisk > estimateUncompressed) { // If the size of this matrixBlock is smaller in uncompressed format, then // decompress and save inside an uncompressed column group. - MatrixBlock uncompressed = getUncompressed( - "smaller serialization size: compressed: " + estDisk + " vs uncompressed: " + estimateUncompressed); - ColGroupUncompressed cg = (ColGroupUncompressed) ColGroupUncompressed.create(uncompressed); - - if( estDisk / 10 > estimateUncompressed){ - LOG.error(this); - } + final String message = "smaller serialization size: compressed: " + estDisk + " vs uncompressed: " + + estimateUncompressed; + final MatrixBlock uncompressed = getUncompressed(message); + uncompressed.examSparsity(true); + // Here only Empty or Uncompressed should be returned. + AColGroup cg = ColGroupUncompressed.create(uncompressed); allocateColGroup(cg); - nonZeros = cg.getNumberNonZeros(rlen); - // clear the soft reference to the decompressed version, since the one column group is perfectly, + // update non zeros, if not fully correct in compressed block + nonZeros = cg.getNumberNonZeros(rlen); + + // Clear the soft reference to the decompressed version, + // since the one column group is perfectly, // representing the decompressed version. clearSoftReferenceToDecompressed(); } @@ -618,7 +620,7 @@ public MatrixBlock unaryOperations(UnaryOperator op, MatrixValue result) { @Override public boolean containsValue(double pattern) { // Only if pattern is a finite value and overlapping then decompress. - if(isOverlapping() && Double.isFinite(pattern)) + if(isOverlapping() && Double.isFinite(pattern)) return getUncompressed("ContainsValue").containsValue(pattern); else { for(AColGroup g : _colGroups) @@ -1072,7 +1074,7 @@ public void clearSoftReferenceToDecompressed() { decompressedVersion = null; } - public void clearCounts(){ + public void clearCounts() { for(AColGroup a : _colGroups) a.clear(); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ADictBasedColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ADictBasedColGroup.java index f6367661880..e27ffcd9c7e 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ADictBasedColGroup.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ADictBasedColGroup.java @@ -26,6 +26,7 @@ import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary; +import org.apache.sysds.runtime.compress.colgroup.dictionary.IdentityDictionary; import org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary; import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory; import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; @@ -58,7 +59,17 @@ public ADictionary getDictionary() { @Override public final void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) { - if(_dict instanceof MatrixBlockDictionary) { + if(_dict instanceof IdentityDictionary){ + + final MatrixBlockDictionary md = ((IdentityDictionary)_dict).getMBDict(); + final MatrixBlock mb = md.getMatrixBlock(); + // The dictionary is never empty. + if(mb.isInSparseFormat()) + decompressToDenseBlockSparseDictionary(db, rl, ru, offR, offC, mb.getSparseBlock()); + else + decompressToDenseBlockDenseDictionary(db, rl, ru, offR, offC, mb.getDenseBlockValues()); + } + else if(_dict instanceof MatrixBlockDictionary) { final MatrixBlockDictionary md = (MatrixBlockDictionary) _dict; final MatrixBlock mb = md.getMatrixBlock(); // The dictionary is never empty. @@ -73,7 +84,17 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR @Override public final void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC) { - if(_dict instanceof MatrixBlockDictionary) { + if(_dict instanceof IdentityDictionary){ + + final MatrixBlockDictionary md = ((IdentityDictionary)_dict).getMBDict(); + final MatrixBlock mb = md.getMatrixBlock(); + // The dictionary is never empty. + if(mb.isInSparseFormat()) + decompressToSparseBlockSparseDictionary(sb, rl, ru, offR, offC, mb.getSparseBlock()); + else + decompressToSparseBlockDenseDictionary(sb, rl, ru, offR, offC, mb.getDenseBlockValues()); + } + else if(_dict instanceof MatrixBlockDictionary) { final MatrixBlockDictionary md = (MatrixBlockDictionary) _dict; final MatrixBlock mb = md.getMatrixBlock(); // The dictionary is never empty. diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java index b0b2484ca25..2f94c3186a5 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java @@ -26,6 +26,7 @@ import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary; import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory; +import org.apache.sysds.runtime.compress.colgroup.dictionary.IdentityDictionary; import org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary; import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory; import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; @@ -305,7 +306,14 @@ public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSa * @param constV The output columns. */ public final void addToCommon(double[] constV) { - if(_dict instanceof MatrixBlockDictionary) { + if(_dict instanceof IdentityDictionary){ + MatrixBlock mb = ((IdentityDictionary) _dict).getMBDict().getMatrixBlock(); + if(mb.isInSparseFormat()) + addToCommonSparse(constV, mb.getSparseBlock()); + else + addToCommonDense(constV, mb.getDenseBlockValues()); + } + else if(_dict instanceof MatrixBlockDictionary) { MatrixBlock mb = ((MatrixBlockDictionary) _dict).getMatrixBlock(); if(mb.isInSparseFormat()) addToCommonSparse(constV, mb.getSparseBlock()); diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java index eacb71f4efb..954122620cf 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java @@ -96,10 +96,11 @@ protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int if(it == null) return; else if(it.value() >= ru) - _indexes.cacheIterator(it, ru); + return; + // _indexes.cacheIterator(it, ru); else { decompressToDenseBlockDenseDictionaryWithProvidedIterator(db, rl, ru, offR, offC, values, it); - _indexes.cacheIterator(it, ru); + // _indexes.cacheIterator(it, ru); } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java index a1700b16fe7..2e99179eaf4 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java @@ -113,7 +113,7 @@ else if(it.value() >= ru) _indexes.cacheIterator(it, ru); else { decompressToDenseBlockDenseDictionaryWithProvidedIterator(db, rl, ru, offR, offC, values, it); - _indexes.cacheIterator(it, ru); + // _indexes.cacheIterator(it, ru); } } @@ -322,7 +322,7 @@ private final void decompressToDenseBlockSparseDictionaryPre(DenseBlock db, int it.next(); } - _indexes.cacheIterator(it, ru); + // _indexes.cacheIterator(it, ru); } @Override diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IdentityDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IdentityDictionary.java index 43afc6ebd81..d80a5cca623 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IdentityDictionary.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IdentityDictionary.java @@ -26,10 +26,10 @@ import java.util.Arrays; import org.apache.commons.lang.NotImplementedException; -import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; import org.apache.sysds.runtime.data.SparseBlock; +import org.apache.sysds.runtime.data.SparseBlockFactory; import org.apache.sysds.runtime.functionobjects.Builtin; import org.apache.sysds.runtime.functionobjects.Builtin.BuiltinCode; import org.apache.sysds.runtime.functionobjects.ValueFunction; @@ -61,13 +61,13 @@ public IdentityDictionary(int nRowCol) { @Override public double[] getValues() { - LOG.warn("Should not call getValues on Identity Dictionary"); - - double[] ret = new double[nRowCol * nRowCol]; - for(int i = 0; i < nRowCol; i++) { - ret[(i * nRowCol) + i] = 1; - } - return ret; + throw new DMLCompressionException("Invalid to materialize identity Matrix Please Implement alternative"); + // LOG.warn("Should not call getValues on Identity Dictionary"); + // double[] ret = new double[nRowCol * nRowCol]; + // for(int i = 0; i < nRowCol; i++) { + // ret[(i * nRowCol) + i] = 1; + // } + // return ret; } @Override @@ -383,8 +383,7 @@ public ADictionary subtractTuple(double[] tuple) { } public MatrixBlockDictionary getMBDict() { - throw new DMLRuntimeException("Do not make MB Dict"); - // return getMBDict(nRowCol); + return getMBDict(nRowCol); } @Override @@ -400,10 +399,8 @@ public MatrixBlockDictionary getMBDict(int nCol) { } private MatrixBlockDictionary createMBDict() { - MatrixBlock identity = new MatrixBlock(nRowCol, nRowCol, true); - for(int i = 0; i < nRowCol; i++) - identity.quickSetValue(i, i, 1.0); - + final SparseBlock sb = SparseBlockFactory.createIdentityMatrix(nRowCol); + final MatrixBlock identity = new MatrixBlock(nRowCol, nRowCol, nRowCol, sb); return new MatrixBlockDictionary(identity); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IdentityDictionarySlice.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IdentityDictionarySlice.java index 53dfc3227b7..55315b34516 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IdentityDictionarySlice.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IdentityDictionarySlice.java @@ -217,11 +217,6 @@ public long getNumberNonZeros(int[] counts, int nCol) { return (long) sum(counts, nCol); } - - public MatrixBlockDictionary getMBDict() { - return getMBDict(nRowCol); - } - @Override public MatrixBlockDictionary getMBDict(int nCol) { if(cache != null) { diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java index 1e6767a649f..c17c761cccc 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java @@ -95,7 +95,7 @@ public AIterator getIterator(int row) { return getIterator(); else if(row > getOffsetToLast()) return null; - final OffsetCache c = cacheRow.get(); + final OffsetCache c = getLength() < skipStride ? null : cacheRow.get(); if(c != null && c.row == row) return c.it.clone(); else if(getLength() < skipStride) @@ -169,7 +169,7 @@ public AOffsetIterator getOffsetIterator(int row) { * @param row The row index to cache the iterator as. */ public void cacheIterator(AIterator it, int row) { - if(it == null) + if(it == null || getLength() < skipStride) return; cacheRow.set(new OffsetCache(it, row)); } @@ -446,6 +446,12 @@ public boolean equals(AOffset b) { protected abstract AOffset moveIndex(int m); + /** + * Get the length of the underlying array. This does not reflect the number of contained elements, since some of the + * elements can be skips. + * + * @return The length of the underlying arrays + */ protected abstract int getLength(); public OffsetSliceInfo slice(int l, int u) { diff --git a/src/main/java/org/apache/sysds/runtime/compress/io/WriterCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/io/WriterCompressed.java index 8c692fe1581..31852dde5bc 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/io/WriterCompressed.java +++ b/src/main/java/org/apache/sysds/runtime/compress/io/WriterCompressed.java @@ -180,8 +180,10 @@ private void writeMultiBlockCompressed(String fname, MatrixBlock b, final int rl for(int bc = 0; bc * blen < clen; bc++) {// column blocks final int sC = bc * blen; final int mC = Math.min(sC + blen, clen) - 1; + // slice out the current columns final CompressedMatrixBlock mc = CLALibSlice.sliceColumns((CompressedMatrixBlock) b, sC, mC); - final List blocks = CLALibSlice.sliceBlocks(mc, blen); // Slice compressed blocks + // slice out row blocks in this. + final List blocks = CLALibSlice.sliceBlocks(mc, blen, k); // Slice compressed blocks final int blocksPerThread = Math.max(1, blocks.size() / k); for(int block = 0; block < blocks.size(); block += blocksPerThread, i++) { final Path newPath = new Path(fname, IOUtilFunctions.getPartFileName(i)); diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java index f6bb86c30b7..f92533466b0 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java @@ -177,20 +177,18 @@ private static MatrixBlock decompressExecute(CompressedMatrixBlock cmb, int k) { if(k == 1) { if(ret.isInSparseFormat()) { decompressSparseSingleThread(ret, filteredGroups, nRows, blklen); - ret.setNonZeros(nonZeros); } else { decompressDenseSingleThread(ret, filteredGroups, nRows, blklen, constV, eps, nonZeros, overlapping); - ret.recomputeNonZeros(); - // ret.setNonZeros(nonZeros == -1 || overlapping ? ret.recomputeNonZeros() : nonZeros); } } else if(ret.isInSparseFormat()) { decompressSparseMultiThread(ret, filteredGroups, nRows, blklen, k); - ret.setNonZeros(nonZeros); } - else + else{ decompressDenseMultiThread(ret, filteredGroups, nRows, blklen, constV, eps, k, overlapping); + } + ret.recomputeNonZeros(); ret.examSparsity(); return ret; diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibSlice.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibSlice.java index 32fa6c1b595..4b4ab0e4298 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibSlice.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibSlice.java @@ -21,14 +21,19 @@ import java.util.ArrayList; import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.compress.CompressedMatrixBlock; import org.apache.sysds.runtime.compress.colgroup.AColGroup; import org.apache.sysds.runtime.compress.colgroup.ColGroupConst; import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.util.CommonThreadPool; public class CLALibSlice { @@ -39,13 +44,16 @@ public class CLALibSlice { * * @param cmb The input block to slice. * @param blen The length of the blocks. + * @param k The parallelization degree used * @return A list containing CompressedMatrixBlocks or MatrixBlocks */ - public static List sliceBlocks(CompressedMatrixBlock cmb, int blen) { - final List mbs = new ArrayList<>(); - for(int b = 0; b < cmb.getNumRows(); b += blen) - mbs.add(sliceRowsCompressed(cmb, b, Math.min(b + blen, cmb.getNumRows()) - 1)); - return mbs; + public static List sliceBlocks(CompressedMatrixBlock cmb, int blen, int k) { + + if(k <= 1) + return sliceBlocksSingleThread(cmb, blen); + else + return sliceBlocksMultiThread(cmb, blen, k); + } public static MatrixBlock slice(CompressedMatrixBlock cmb, int rl, int ru, int cl, int cu, boolean deep) { @@ -59,6 +67,34 @@ else if(cl == 0 && cu == cmb.getNumColumns() - 1) return sliceInternal(cmb, rl, ru, cl, cu, deep); } + private static List sliceBlocksSingleThread(CompressedMatrixBlock cmb, int blen) { + final List mbs = new ArrayList<>(); + for(int b = 0; b < cmb.getNumRows(); b += blen) + mbs.add(sliceRowsCompressed(cmb, b, Math.min(b + blen, cmb.getNumRows()) - 1)); + return mbs; + } + + private static List sliceBlocksMultiThread(CompressedMatrixBlock cmb, int blen, int k) { + // final List mbs = new ArrayList<>(); + + final ExecutorService pool = CommonThreadPool.get(k); + try { + final ArrayList tasks = new ArrayList<>(); + + for(int b = 0; b < cmb.getNumRows(); b += blen) + tasks.add(new SliceTask(cmb, b, blen)); + final List mbs = new ArrayList<>(tasks.size()); + for(Future f : pool.invokeAll(tasks)) + mbs.add(f.get()); + pool.shutdown(); + return mbs; + } + catch(Exception e) { + pool.shutdown(); + throw new DMLRuntimeException("Failed slicing compressed matrix block", e); + } + } + private static MatrixBlock sliceInternal(CompressedMatrixBlock cmb, int rl, int ru, int cl, int cu, boolean deep) { /** * In the case where an internal matrix is sliced out, then first slice out the columns to an compressed @@ -148,4 +184,21 @@ public static CompressedMatrixBlock sliceColumns(CompressedMatrixBlock cmb, int ret.setOverlapping(cmb.isOverlapping()); return ret; } + + private static class SliceTask implements Callable { + private final CompressedMatrixBlock cmb; + private final int b; + private final int blen; + + private SliceTask(CompressedMatrixBlock cmb, int b, int blen) { + this.cmb = cmb; + this.b = b; + this.blen = blen; + } + + @Override + public MatrixBlock call() throws Exception { + return sliceRowsCompressed(cmb, b, Math.min(b + blen, cmb.getNumRows()) - 1); + } + } } diff --git a/src/main/java/org/apache/sysds/runtime/data/SparseBlockFactory.java b/src/main/java/org/apache/sysds/runtime/data/SparseBlockFactory.java index fdaf3d7460e..18bd68489e0 100644 --- a/src/main/java/org/apache/sysds/runtime/data/SparseBlockFactory.java +++ b/src/main/java/org/apache/sysds/runtime/data/SparseBlockFactory.java @@ -84,4 +84,20 @@ public static long estimateSizeSparseInMemory(SparseBlock.Type type, long nrows, throw new RuntimeException("Unexpected sparse block type: "+type.toString()); } } + + public static SparseBlock createIdentityMatrix(int nRowCol){ + final int[] rowPtr = new int[nRowCol+1]; + final int[] colIdx = new int[nRowCol]; + final double[] vals = new double[nRowCol]; + int nnz = nRowCol; + + for(int i = 0; i < nRowCol; i++){ + rowPtr[i] = i; + colIdx[i] = i; + vals[i] = 1; + } + rowPtr[nRowCol] = nRowCol; // add last index for row pointers. + + return new SparseBlockCSR(rowPtr, colIdx, vals, nnz); + } } diff --git a/src/main/java/org/apache/sysds/runtime/frame/data/columns/StringArray.java b/src/main/java/org/apache/sysds/runtime/frame/data/columns/StringArray.java index 862014b39eb..4e89dbf548a 100644 --- a/src/main/java/org/apache/sysds/runtime/frame/data/columns/StringArray.java +++ b/src/main/java/org/apache/sysds/runtime/frame/data/columns/StringArray.java @@ -561,12 +561,38 @@ public void fill(String value) { @Override public double getAsDouble(int i) { - return _data[i] != null && !_data[i].isEmpty() ? DoubleArray.parseDouble(_data[i]) : 0.0; + if(_data[i] != null && !_data[i].isEmpty()){ + return getAsDouble(_data[i]); + } + else{ + return 0.0; + } } @Override public double getAsNaNDouble(int i) { - return _data[i] != null && !_data[i].isEmpty() ? DoubleArray.parseDouble(_data[i]) : Double.NaN; + if(_data[i] != null && !_data[i].isEmpty()){ + return getAsDouble(_data[i]); + } + else{ + return Double.NaN; + } + } + + private static double getAsDouble(String s){ + try{ + + return DoubleArray.parseDouble(s); + } + catch(Exception e){ + String ls = s.toLowerCase(); + if(ls.equals("true") || ls.equals("t")) + return 1; + else if (ls.equals("false") || ls.equals("f")) + return 0; + else + throw new DMLRuntimeException("Unable to change to double: " + s, e); + } } @Override diff --git a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java index b532dc04be4..7051d3f3015 100644 --- a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java +++ b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java @@ -19,6 +19,8 @@ package org.apache.sysds.runtime.transform.encode; +import static org.apache.sysds.runtime.util.UtilFunctions.getEndIndex; + import java.io.IOException; import java.io.ObjectInput; import java.io.ObjectOutput; @@ -27,7 +29,6 @@ import java.util.PriorityQueue; import java.util.concurrent.Callable; -import static org.apache.sysds.runtime.util.UtilFunctions.getEndIndex; import org.apache.commons.lang3.tuple.MutableTriple; import org.apache.sysds.api.DMLScript; import org.apache.sysds.lops.Lop; @@ -137,58 +138,67 @@ else if(_binMethod == BinMethod.EQUI_HEIGHT) { protected double getCode(CacheBlock in, int row){ // find the right bucket for a single row - double bin = 0; if( _binMins.length == 0 || _binMaxs.length == 0 ) { LOG.warn("ColumnEncoderBin: applyValue without bucket boundaries, assign 1"); return 1; //robustness in case of missing bins } // Returns NaN if value is missing, so can't be assigned a Bin double inVal = in.getDoubleNaN(row, _colID - 1); - if (Double.isNaN(inVal) || inVal < _binMins[0] || inVal > _binMaxs[_binMaxs.length-1]) - return Double.NaN; - if (_binMethod == BinMethod.EQUI_HEIGHT) { - int ix = Arrays.binarySearch(_binMaxs, inVal); - bin = ((ix < 0) ? Math.abs(ix + 1) : ix) + 1; - } - if (_binMethod == BinMethod.EQUI_WIDTH) { - //TODO: Skip computing bin boundaries for equi-width - double binWidth = (_binMaxs[_binMaxs.length - 1] - _binMins[0]) / _numBin; - double code = Math.ceil((inVal - _binMins[0]) / binWidth); - bin = (code == 0) ? code + 1 : code; - } - return bin; + return getCodeIndex(inVal); } @Override protected double[] getCodeCol(CacheBlock in, int startInd, int blkSize) { // find the right bucket for a block of rows - int endInd = getEndIndex(in.getNumRows(), startInd, blkSize); - double[] codes = new double[endInd-startInd]; + final int endInd = getEndIndex(in.getNumRows(), startInd, blkSize); + final double[] codes = new double[endInd - startInd]; + if (_binMins == null || _binMins.length == 0 || _binMaxs.length == 0) { + LOG.warn("ColumnEncoderBin: applyValue without bucket boundaries, assign 1"); + Arrays.fill(codes, startInd, endInd, 1.0); + return codes; + } for (int i=startInd; i _binMaxs[_binMaxs.length-1]) { - codes[i-startInd] = Double.NaN; - continue; - } - if (_binMethod == BinMethod.EQUI_HEIGHT) { - int ix = Arrays.binarySearch(_binMaxs, inVal); - codes[i-startInd] = ((ix < 0) ? Math.abs(ix + 1) : ix) + 1; - } - if (_binMethod == BinMethod.EQUI_WIDTH) { - //TODO: Skip computing bin boundaries for equi-width - double binWidth = (_binMaxs[_binMaxs.length - 1] - _binMins[0]) / _numBin; - double bin = Math.ceil((inVal - _binMins[0]) / binWidth); - codes[i - startInd] = bin == 0 ? bin + 1 : bin; - } + codes[i- startInd] = getCodeIndex(inVal); } return codes; } + protected double getCodeIndex(double inVal){ + // throw new NotImplementedException("Intensional"); + if (Double.isNaN(inVal) || inVal < _binMins[0] || inVal > _binMaxs[_binMaxs.length-1]){ + return Double.NaN; + } + else if (_binMethod == BinMethod.EQUI_HEIGHT) { + final int ix = Arrays.binarySearch(_binMaxs, inVal); + + if(ix < 0) // somewhere in between values + // +2 because negative values are found from binary search. + // plus 2 to correct for the absolute value of that. + return Math.abs(ix + 1) + 1; + else if (ix == 0) // If first bucket boundary add it there. + return 1; + else + // precisely at boundaries default to lower bucket + // This is done to avoid using an extra bucket for max value. + return Math.min(ix + 1, _binMaxs.length); + } + else { //if (_binMethod == BinMethod.EQUI_WIDTH) { + final double max = _binMaxs[_binMaxs.length-1]; + final double min = _binMins[0]; + + if(max == min){ + return 1; + } + + //TODO: Skip computing bin boundaries for equi-width + double binWidth = (max - min) / _numBin; + double code = Math.ceil((inVal - min) / binWidth); + return (code == 0) ? code + 1 : code; + } + } + + @Override protected TransformType getTransformType() { return TransformType.BIN; @@ -393,6 +403,16 @@ public String toString() { sb.append(getClass().getSimpleName()); sb.append(": "); sb.append(_colID); + sb.append(" --- Method: " + _binMethod + " num Bin: " + _numBin); + if(_binMethod == BinMethod.EQUI_WIDTH){ + + sb.append("\n---- BinMin: "+ Arrays.toString(_binMins)); + sb.append("\n---- BinMax: "+ Arrays.toString(_binMaxs)); + } + else{ + + sb.append(" --- MinMax: "+ _colMins + " " + _colMaxs); + } return sb.toString(); } diff --git a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderComposite.java b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderComposite.java index a033bfa30fc..8b9710f71d6 100644 --- a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderComposite.java +++ b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderComposite.java @@ -435,6 +435,17 @@ public boolean isPassThrough(){ && _columnEncoders.get(0) instanceof ColumnEncoderPassThrough; } + public boolean isBin(){ + return _columnEncoders.size() == 1// + && _columnEncoders.get(0) instanceof ColumnEncoderBin; + } + + public boolean isBinToDummy(){ + return _columnEncoders.size() == 2// + && _columnEncoders.get(0) instanceof ColumnEncoderBin// + && _columnEncoders.get(1) instanceof ColumnEncoderDummycode; + } + private static class ColumnCompositeUpdateDCTask implements Callable { private final ColumnEncoderComposite _encoder; diff --git a/src/main/java/org/apache/sysds/runtime/transform/encode/CompressedEncode.java b/src/main/java/org/apache/sysds/runtime/transform/encode/CompressedEncode.java index 1605e08e7dc..8a60aae6ff7 100644 --- a/src/main/java/org/apache/sysds/runtime/transform/encode/CompressedEncode.java +++ b/src/main/java/org/apache/sysds/runtime/transform/encode/CompressedEncode.java @@ -36,6 +36,7 @@ import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.compress.CompressedMatrixBlock; import org.apache.sysds.runtime.compress.colgroup.AColGroup; +import org.apache.sysds.runtime.compress.colgroup.ColGroupConst; import org.apache.sysds.runtime.compress.colgroup.ColGroupDDC; import org.apache.sysds.runtime.compress.colgroup.ColGroupUncompressed; import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; @@ -136,6 +137,10 @@ else if(c.isRecode()) return recode(c); else if(c.isPassThrough()) return passThrough(c); + else if(c.isBin()) + return bin(c); + else if(c.isBinToDummy()) + return binToDummy(c); else throw new NotImplementedException("Not supporting : " + c); } @@ -147,6 +152,8 @@ private AColGroup recodeToDummy(ColumnEncoderComposite c) { HashMap map = a.getRecodeMap(); int domain = map.size(); IColIndex colIndexes = ColIndexFactory.create(0, domain); + if(domain == 1) + return ColGroupConst.create(colIndexes, new double[] {1}); ADictionary d = new IdentityDictionary(colIndexes.size()); AMapToData m = createMappingAMapToData(a, map); List r = c.getEncoders(); @@ -154,6 +161,75 @@ private AColGroup recodeToDummy(ColumnEncoderComposite c) { return ColGroupDDC.create(colIndexes, d, m, null); } + private AColGroup bin(ColumnEncoderComposite c) { + final int colId = c._colID; + final Array a = in.getColumn(colId - 1); + final boolean containsNull = a.containsNull(); + final List r = c.getEncoders(); + final ColumnEncoderBin b = (ColumnEncoderBin) r.get(0); + b.build(in); + final IColIndex colIndexes = ColIndexFactory.create(1); + + ADictionary d = createIncrementingVector(b._numBin, containsNull); + AMapToData m = binEncode(a, b, containsNull); + + AColGroup ret = ColGroupDDC.create(colIndexes, d, m, null); + try{ + + ret.getNumberNonZeros(a.size()); + } + catch(Exception e){ + throw new DMLRuntimeException("Failed binning \n\n" + a + "\n" + b + "\n" + d + "\n" + m,e); + } + return ret; + } + + private AMapToData binEncode(Array a, ColumnEncoderBin b, boolean containsNull) { + AMapToData m = MapToFactory.create(a.size(), b._numBin + (containsNull ? 1 : 0)); + if(containsNull) { + for(int i = 0; i < a.size(); i++) { + double v = a.getAsNaNDouble(i); + if(Double.isNaN(v)) + m.set(i, b._numBin); + else + m.set(i, (int) b.getCodeIndex(v) - 1); + } + } + else { + for(int i = 0; i < a.size(); i++) + m.set(i, (int) b.getCodeIndex(a.getAsDouble(i)) - 1); + } + return m; + } + + private MatrixBlockDictionary createIncrementingVector(int nVals, boolean NaN) { + + MatrixBlock bins = new MatrixBlock(nVals + (NaN ? 1 : 0), 1, false); + for(int i = 0; i < nVals; i++) + bins.quickSetValue(i, 0, i + 1); + if(NaN) + bins.quickSetValue(nVals, 0, Double.NaN); + + return MatrixBlockDictionary.create(bins); + + } + + private AColGroup binToDummy(ColumnEncoderComposite c) { + final int colId = c._colID; + final Array a = in.getColumn(colId - 1); + final boolean containsNull = a.containsNull(); + final List r = c.getEncoders(); + final ColumnEncoderBin b = (ColumnEncoderBin) r.get(0); + b.build(in); + + IColIndex colIndexes = ColIndexFactory.create(0, b._numBin + (containsNull ? 1 : 0)); + ADictionary d = new IdentityDictionary(colIndexes.size()); + AMapToData m = binEncode(a, b, containsNull); + AColGroup ret = ColGroupDDC.create(colIndexes, d, m, null); + ret.getNumberNonZeros(a.size()); + return ret; + } + @SuppressWarnings("unchecked") private AColGroup recode(ColumnEncoderComposite c) { int colId = c._colID; @@ -163,6 +239,8 @@ private AColGroup recode(ColumnEncoderComposite c) { // int domain = c.getDomainSize(); IColIndex colIndexes = ColIndexFactory.create(1); + if(domain == 1) + return ColGroupConst.create(colIndexes, new double[] {1}); MatrixBlock incrementing = new MatrixBlock(domain, 1, false); for(int i = 0; i < domain; i++) incrementing.quickSetValue(i, 0, i + 1); diff --git a/src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java b/src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java index 140a5e18b37..6838cdd1e29 100644 --- a/src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java +++ b/src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java @@ -111,11 +111,9 @@ else if(k > 1 && !MULTI_THREADED_STAGES && !hasLegacyEncoder()) { try { pool.submitAllAndWait(getEncodeTasks(in, out, pool)); } - catch(ExecutionException | InterruptedException e) { - LOG.error("MT Column encode failed"); - e.printStackTrace(); + finally{ + pool.shutdown(); } - pool.shutdown(); outputMatrixPostProcessing(out); return out; } @@ -142,8 +140,7 @@ else if(k > 1 && !MULTI_THREADED_STAGES && !hasLegacyEncoder()) { } } catch(Exception ex) { - LOG.error("Failed transform-encode frame with \n" + this); - throw ex; + throw new DMLRuntimeException("Failed transform-encode frame with encoder:\n" + this, ex); } } diff --git a/src/test/java/org/apache/sysds/test/component/frame/transform/transformCompressed.java b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestLogger.java similarity index 60% rename from src/test/java/org/apache/sysds/test/component/frame/transform/transformCompressed.java rename to src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestLogger.java index 08a15c9aa2a..59b79bfe5e6 100644 --- a/src/test/java/org/apache/sysds/test/component/frame/transform/transformCompressed.java +++ b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestLogger.java @@ -19,24 +19,34 @@ package org.apache.sysds.test.component.frame.transform; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +import java.util.ArrayList; +import java.util.List; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.log4j.Appender; +import org.apache.log4j.AppenderSkeleton; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.spi.LoggingEvent; import org.apache.sysds.common.Types.ValueType; import org.apache.sysds.runtime.frame.data.FrameBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.transform.encode.CompressedEncode; import org.apache.sysds.runtime.transform.encode.EncoderFactory; import org.apache.sysds.runtime.transform.encode.MultiColumnEncoder; import org.apache.sysds.test.TestUtils; import org.junit.Test; -public class transformCompressed { - protected static final Log LOG = LogFactory.getLog(transformCompressed.class.getName()); +public class TransformCompressedTestLogger { + protected static final Log LOG = LogFactory.getLog(TransformCompressedTestLogger.class.getName()); private final FrameBlock data; - public transformCompressed() { + public TransformCompressedTestLogger() { try { data = TestUtils.generateRandomFrameBlock(100, new ValueType[] {ValueType.UINT4}, 231); @@ -49,60 +59,35 @@ public transformCompressed() { } } - @Test - public void testRecode() { - test("{recode:[C1]}"); - } - @Test public void testDummyCode() { test("{dummycode:[C1]}"); } - // @Test - // public void testBin() { - // test("{ids:true, bin:[{id:1, method:equi-width, numbins:4}]}"); - // } - - // @Test - // public void testBin2() { - // test("{ids:true, bin:[{id:1, method:equi-width, numbins:100}]}"); - // } - - // @Test - // public void testBin3() { - // test("{ids:true, bin:[{id:1, method:equi-width, numbins:2}]}"); - // } - - // @Test - // public void testBin4() { - // test("{ids:true, bin:[{id:1, method:equi-height, numbins:2}]}"); - // } - - // @Test - // public void testBin5() { - // test("{ids:true, bin:[{id:1, method:equi-height, numbins:10}]}"); - // } - public void test(String spec) { + final TestAppender appender = new TestAppender(); + final Logger logger = Logger.getRootLogger(); + Appender consoleLogger = (Appender) logger.getAllAppenders().nextElement(); try { - + Logger.getLogger(CompressedEncode.class).setLevel(Level.DEBUG); + + FrameBlock meta = null; + logger.removeAppender(consoleLogger); + logger.addAppender(appender); MultiColumnEncoder encoderCompressed = EncoderFactory.createEncoder(spec, data.getColumnNames(), - data.getNumColumns(), meta); + data.getNumColumns(), meta); MatrixBlock outCompressed = encoderCompressed.encode(data, true); FrameBlock outCompressedMD = encoderCompressed.getMetaData(null); MultiColumnEncoder encoderNormal = EncoderFactory.createEncoder(spec, data.getColumnNames(), - data.getNumColumns(), meta); + data.getNumColumns(), meta); MatrixBlock outNormal = encoderNormal.encode(data); FrameBlock outNormalMD = encoderNormal.getMetaData(null); - - - // LOG.error(outNormal); - // LOG.error(outCompressed); - // LOG.error(outCompressedMD); - // LOG.error(outNormalMD); + logger.removeAppender(appender); + logger.addAppender(consoleLogger); + final List log = appender.getLog(); + assertTrue(log.get(3).getMessage().toString().contains("Compression ratio")); TestUtils.compareMatrices(outNormal, outCompressed, 0, "Not Equal after apply"); TestUtils.compareFrames(outNormalMD, outCompressedMD, true); } @@ -110,5 +95,33 @@ public void test(String spec) { e.printStackTrace(); fail(e.getMessage()); } + finally { + logger.removeAppender(appender); + logger.addAppender(consoleLogger); + } + + } + + class TestAppender extends AppenderSkeleton { + private final List log = new ArrayList(); + + @Override + public boolean requiresLayout() { + return false; + } + + @Override + protected void append(final LoggingEvent loggingEvent) { + log.add(loggingEvent); + } + + @Override + public void close() { + } + + public List getLog() { + return new ArrayList(log); + } + } } diff --git a/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestMultiCol.java b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestMultiCol.java new file mode 100644 index 00000000000..e945ad5b269 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestMultiCol.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.component.frame.transform; + +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.Collection; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysds.common.Types.ValueType; +import org.apache.sysds.runtime.frame.data.FrameBlock; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.transform.encode.EncoderFactory; +import org.apache.sysds.runtime.transform.encode.MultiColumnEncoder; +import org.apache.sysds.test.TestUtils; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +@RunWith(value = Parameterized.class) +public class TransformCompressedTestMultiCol { + protected static final Log LOG = LogFactory.getLog(TransformCompressedTestMultiCol.class.getName()); + + private final FrameBlock data; + private final int k; + + public TransformCompressedTestMultiCol(FrameBlock data, int k) { + this.data = data; + this.k = k; + } + + @Parameters + public static Collection data() { + final ArrayList tests = new ArrayList<>(); + final int[] threads = new int[] {1, 4}; + try { + + FrameBlock data = TestUtils.generateRandomFrameBlock(100, new ValueType[] {ValueType.UINT4, ValueType.UINT8, ValueType.UINT4}, 231); + data.setSchema(new ValueType[] {ValueType.INT32, ValueType.INT32, ValueType.INT32}); + for(int k : threads) { + tests.add(new Object[] {data, k}); + } + + FrameBlock data2 = TestUtils.generateRandomFrameBlock(100, new ValueType[] {ValueType.BOOLEAN, ValueType.UINT8, ValueType.UINT4}, 231); + data2.setSchema(new ValueType[] {ValueType.BOOLEAN, ValueType.INT32, ValueType.INT32}); + for(int k : threads) { + tests.add(new Object[] {data2, k}); + } + + FrameBlock data3 = new FrameBlock( + new ValueType[] {ValueType.BOOLEAN, ValueType.INT32, ValueType.INT32}, 100) ; + data3.ensureAllocatedColumns(100); + for(int k : threads) + tests.add(new Object[] {data3, k}); + + } + catch(Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + return tests; + } + + @Test + public void testRecode() { + test("{recode:[C1, C2, C3]}"); + } + + @Test + public void testDummyCode() { + test("{dummycode:[C1,C2,C3]}"); + } + + @Test + public void testBin() { + test( + "{ids:true, bin:[{id:1, method:equi-width, numbins:4},{id:2, method:equi-width, numbins:16},{id:3, method:equi-width, numbins:32}]}"); + } + + @Test + public void testBin3() { + test("{ids:true, bin:[{id:2, method:equi-width, numbins:2},{id:3, method:equi-width, numbins:200}]}"); + } + + @Test + public void passThrough() { + test("{ids:true}"); + } + + @Test + public void testBinToDummy() { + test( + "{ids:true, bin:[{id:1, method:equi-height, numbins:10},{id:2, method:equi-height, numbins:10},{id:3, method:equi-height, numbins:40}], dummycode:[1,2,3] }"); + } + + public void test(String spec) { + try { + + FrameBlock meta = null; + MultiColumnEncoder encoderCompressed = EncoderFactory.createEncoder(spec, data.getColumnNames(), + data.getNumColumns(), meta); + + MatrixBlock outCompressed = encoderCompressed.encode(data, k, true); + FrameBlock outCompressedMD = encoderCompressed.getMetaData(null); + MultiColumnEncoder encoderNormal = EncoderFactory.createEncoder(spec, data.getColumnNames(), + data.getNumColumns(), meta); + MatrixBlock outNormal = encoderNormal.encode(data, k); + FrameBlock outNormalMD = encoderNormal.getMetaData(null); + + + TestUtils.compareMatrices(outNormal, outCompressed, 0, "Not Equal after apply"); + TestUtils.compareFrames(outNormalMD, outCompressedMD, true); + } + catch(Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } +} diff --git a/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestSingleCol.java b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestSingleCol.java new file mode 100644 index 00000000000..9855b359fa6 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestSingleCol.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.component.frame.transform; + +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.Collection; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysds.common.Types.ValueType; +import org.apache.sysds.runtime.frame.data.FrameBlock; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.transform.encode.EncoderFactory; +import org.apache.sysds.runtime.transform.encode.MultiColumnEncoder; +import org.apache.sysds.test.TestUtils; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +@RunWith(value = Parameterized.class) +public class TransformCompressedTestSingleCol { + protected static final Log LOG = LogFactory.getLog(TransformCompressedTestSingleCol.class.getName()); + + private final FrameBlock data; + private final int k; + + public TransformCompressedTestSingleCol(FrameBlock data, int k) { + this.data = data; + this.k = k; + } + + @Parameters + public static Collection data() { + final ArrayList tests = new ArrayList<>(); + final int[] threads = new int[] {1, 4}; + try { + + FrameBlock data = TestUtils.generateRandomFrameBlock(100, new ValueType[] {ValueType.UINT4}, 231); + data.setSchema(new ValueType[] {ValueType.INT32}); + for(int k : threads) { + tests.add(new Object[] {data, k}); + } + } + catch(Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + return tests; + } + + @Test + public void testRecode() { + test("{recode:[C1]}"); + } + + @Test + public void testDummyCode() { + test("{dummycode:[C1]}"); + } + + @Test + public void testBin() { + test("{ids:true, bin:[{id:1, method:equi-width, numbins:4}]}"); + } + + @Test + public void testBin2() { + test("{ids:true, bin:[{id:1, method:equi-width, numbins:100}]}"); + } + + @Test + public void testBin3() { + test("{ids:true, bin:[{id:1, method:equi-width, numbins:2}]}"); + } + + @Test + public void testBin4() { + test("{ids:true, bin:[{id:1, method:equi-height, numbins:2}]}"); + } + + @Test + public void testBin5() { + test("{ids:true, bin:[{id:1, method:equi-height, numbins:10}]}"); + } + + @Test + public void passThrough() { + test("{ids:true}"); + } + + @Test + public void testBinToDummy() { + test("{ids:true, bin:[{id:1, method:equi-height, numbins:10}], dummycode:[1] }"); + } + + public void test(String spec) { + try { + + FrameBlock meta = null; + MultiColumnEncoder encoderCompressed = EncoderFactory.createEncoder(spec, data.getColumnNames(), + data.getNumColumns(), meta); + + MatrixBlock outCompressed = encoderCompressed.encode(data, k, true); + FrameBlock outCompressedMD = encoderCompressed.getMetaData(null); + MultiColumnEncoder encoderNormal = EncoderFactory.createEncoder(spec, data.getColumnNames(), + data.getNumColumns(), meta); + MatrixBlock outNormal = encoderNormal.encode(data, k); + FrameBlock outNormalMD = encoderNormal.getMetaData(null); + + TestUtils.compareMatrices(outNormal, outCompressed, 0, "Not Equal after apply"); + TestUtils.compareFrames(outNormalMD, outCompressedMD, true); + } + catch(Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } +} diff --git a/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestSingleColBinSpecific.java b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestSingleColBinSpecific.java new file mode 100644 index 00000000000..fffe66ab959 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestSingleColBinSpecific.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.component.frame.transform; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.Collection; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysds.common.Types.ValueType; +import org.apache.sysds.runtime.frame.data.FrameBlock; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.transform.encode.EncoderFactory; +import org.apache.sysds.runtime.transform.encode.MultiColumnEncoder; +import org.apache.sysds.test.TestUtils; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +@RunWith(value = Parameterized.class) +public class TransformCompressedTestSingleColBinSpecific { + protected static final Log LOG = LogFactory.getLog(TransformCompressedTestSingleColBinSpecific.class.getName()); + + private final FrameBlock data; + private final int k; + + public TransformCompressedTestSingleColBinSpecific(FrameBlock data, int k) { + this.data = data; + this.k = k; + } + + @Parameters + public static Collection data() { + final ArrayList tests = new ArrayList<>(); + final int[] threads = new int[] {1, 4}; + try { + + FrameBlock data = TestUtils.generateRandomFrameBlock(120, new ValueType[] {ValueType.FP64}, 231); + data.setSchema(new ValueType[] {ValueType.FP64}); + for(int k : threads) { + tests.add(new Object[] {data, k}); + } + + FrameBlock data2 = TestUtils.generateRandomFrameBlock(1200, new ValueType[] {ValueType.FP64}, 231); + data2.setSchema(new ValueType[] {ValueType.FP64}); + for(int k : threads) { + tests.add(new Object[] {data2, k}); + } + } + catch(Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + return tests; + } + + @Test + public void test1() { + test(1); + } + + @Test + public void test2() { + test(2); + } + + @Test + public void test3() { + test(3); + } + + @Test + public void test4() { + test(4); + } + + public void test(int bin) { + test("{ids:true, bin:[{id:1, method:equi-height, numbins:" + bin + "}], dummycode:[1] }"); + } + + public void test(String spec) { + try { + + FrameBlock meta = null; + MultiColumnEncoder encoderCompressed = EncoderFactory.createEncoder(spec, data.getColumnNames(), + data.getNumColumns(), meta); + + MatrixBlock outCompressed = encoderCompressed.encode(data, k, true); + FrameBlock outCompressedMD = encoderCompressed.getMetaData(null); + MultiColumnEncoder encoderNormal = EncoderFactory.createEncoder(spec, data.getColumnNames(), + data.getNumColumns(), meta); + MatrixBlock outNormal = encoderNormal.encode(data, k); + FrameBlock outNormalMD = encoderNormal.getMetaData(null); + + TestUtils.compareMatrices(outNormal, outCompressed, 0, "Not Equal after apply"); + TestUtils.compareFrames(outNormalMD, outCompressedMD, true); + + // Assert that each bucket has the same number of elements + MatrixBlock colSum = outNormal.colSum(); + for(int i = 0; i < colSum.getNumColumns(); i++) + assertEquals(colSum.quickGetValue(0, 0), colSum.quickGetValue(0, i), 0.001); + } + + catch(Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } +} diff --git a/src/test/java/org/apache/sysds/test/component/frame/transform/transformCustomTest.java b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCustomTest.java similarity index 95% rename from src/test/java/org/apache/sysds/test/component/frame/transform/transformCustomTest.java rename to src/test/java/org/apache/sysds/test/component/frame/transform/TransformCustomTest.java index 67b23dd32a0..ce7c5d17d94 100644 --- a/src/test/java/org/apache/sysds/test/component/frame/transform/transformCustomTest.java +++ b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCustomTest.java @@ -31,12 +31,12 @@ import org.apache.sysds.test.TestUtils; import org.junit.Test; -public class transformCustomTest { - protected static final Log LOG = LogFactory.getLog(transformCustomTest.class.getName()); +public class TransformCustomTest { + protected static final Log LOG = LogFactory.getLog(TransformCustomTest.class.getName()); final FrameBlock data; - public transformCustomTest() { + public TransformCustomTest() { data = TestUtils.generateRandomFrameBlock(100, new ValueType[] {ValueType.UINT8}, 231); data.setSchema(new ValueType[] {ValueType.INT32}); } diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java index c91f3ddfc5c..77b6078c215 100644 --- a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java +++ b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java @@ -83,7 +83,7 @@ private void runtopkCleaning(String data, String meta, Double sample, int topk, "metaData="+meta, "primitives="+PRIMITIVES, "parameters="+PARAM, "topk="+ topk, "rv="+ resources, "expectedIncrease="+inc, "max_iter="+10, "sample="+sample, "testCV="+cv, "cvk="+cvk, "split="+split, "output="+OUTPUT, "O="+output("O")}; - runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + runTest(null); //expected loss smaller than default invocation Assert.assertTrue(TestUtils.readDMLBoolean(output("O"))); diff --git a/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeApplyTest.java b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeApplyTest.java index 5aa586df366..ecf29502ab2 100644 --- a/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeApplyTest.java +++ b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeApplyTest.java @@ -19,8 +19,8 @@ package org.apache.sysds.test.functions.transform; -import org.junit.Assert; -import org.junit.Test; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.sysds.api.DMLScript; import org.apache.sysds.common.Types.ExecMode; import org.apache.sysds.common.Types.FileFormat; @@ -30,8 +30,12 @@ import org.apache.sysds.test.TestConfiguration; import org.apache.sysds.test.TestUtils; import org.apache.sysds.utils.Statistics; +import org.junit.Assert; +import org.junit.Test; public class TransformFrameEncodeApplyTest extends AutomatedTestBase { + protected static final Log LOG = LogFactory.getLog(TransformFrameEncodeApplyTest.class.getName()); + private final static String TEST_NAME1 = "TransformFrameEncodeApply"; private final static String TEST_DIR = "functions/transform/"; private final static String TEST_CLASS_DIR = TEST_DIR + TransformFrameEncodeApplyTest.class.getSimpleName() + "/"; @@ -462,14 +466,17 @@ private void runTransformTest( ExecMode rt, String ofmt, TransformType type, boo } } else if (type == TransformType.BIN_HEIGHT_DUMMY) { Assert.assertEquals(14, R1[0].length); - for(int i=0; i<7; i++) { - for(int j=0; j<4; j++) { //check dummy coded - Assert.assertEquals((j==BIN_HEIGHT_col3[i]-1)? - 1:0, R1[i][2+j], 1e-8); + // LOG.error(DataConverter.convertToMatrixBlock(R1).slice(0,6)); + for(int i = 0; i < 7; i++) {// first 7 rows. + // LOG.error(Arrays.toString(BIN_HEIGHT_col3)); + // Check the columns specifically that were dummy coded. + for(int j = 0; j < 4; j++) { // check dummy coded + double val = j == BIN_HEIGHT_col3[i] - 1 ? 1 : 0; + Assert.assertEquals(val, R1[i][2 + j], 1e-8); } - for(int j=0; j<3; j++) { //check dummy coded - Assert.assertEquals((j==BIN_HEIGHT_col8[i]-1)? - 1:0, R1[i][10+j], 1e-8); + for(int j = 0; j < 3; j++) { // check dummy coded + double val = j == BIN_HEIGHT_col8[i] - 1 ? 1 : 0; + Assert.assertEquals(val, R1[i][10 + j], 1e-8); } } } else if (type == TransformType.IMPUTE){ diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv deleted file mode 100644 index 5db997f9a8b..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv +++ /dev/null @@ -1,3 +0,0 @@ -outlierBySdApply,winsorizeApply,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -normalizeApply,NA,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -imputeByMeanApply,outlierBySdApply,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv deleted file mode 100644 index 39fcddd2e11..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv +++ /dev/null @@ -1,3 +0,0 @@ -74.87179487179488 -74.87179487179488 -74.87179487179488 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv deleted file mode 100644 index fae86940b1f..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv +++ /dev/null @@ -1 +0,0 @@ -74.87179487179488 \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv deleted file mode 100644 index ec2047209de..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv +++ /dev/null @@ -1 +0,0 @@ -2.0,10.0,0.001 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/featureFrame.csv b/src/test/scripts/functions/pipelines/intermediates/classification/featureFrame.csv deleted file mode 100644 index e9aacae6a5a..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/featureFrame.csv +++ /dev/null @@ -1 +0,0 @@ -#MissingValues,MinVla,MaxVal,AverageMin,AverageMax,#CategoricalFeatures,#NumericFeatures,Mean,#Outliers,#OHEfeatures,#Classes,Imbalance,#rows,#cols,pipelines,accuracy,execution time in ms,CV time in ms diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv deleted file mode 100644 index 49cc34605f8..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv +++ /dev/null @@ -1,3 +0,0 @@ -27.0,3.0,3.0,2.0,1.0,0,0,0,1.0,0,2.0,0.05,0.95,0,0,0,0,1.0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -21.0,0,0,0,0,0,0,0,1.0,0.75,0,0,1.0,1.0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -27.0,0,0,0,0,1.0,0,0,0,2.0,3.0,3.0,2.0,1.0,0,0,0,1.0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv deleted file mode 100644 index a7151853e90..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv +++ /dev/null @@ -1,37 +0,0 @@ -outlierBySd,imputeByMedian,flipLabels,0 -imputeByMedian,outlierBySd,flipLabels,0 -abstain,0,0,0 -imputeByFd,abstain,0,0 -abstain,forward_fill,0,0 -flipLabels,0,0,0 -imputeByMedian,flipLabels,0,0 -flipLabels,forward_fill,0,0 -imputeByFd,flipLabels,forward_fill,0 -imputeByFd,forward_fill,flipLabels,0 -imputeByFd,flipLabels,forward_fill,0 -imputeByFd,flipLabels,0,0 -imputeByFd,imputeByFd,flipLabels,0 -imputeByFd,flipLabels,0,0 -imputeByFd,imputeByMean,flipLabels,0 -tomeklink,imputeByFd,abstain,0 -winsorize,0,0,0 -normalize,winsorize,0,0 -abstain,flipLabels,forward_fill,0 -imputeByMedian,0,0,0 -imputeByFd,0,0,0 -imputeByMean,0,0,0 -mice,0,0,0 -forward_fill,0,0,0 -fillDefault,0,0,0 -SMOTE,0,0,0 -scale,0,0,0 -fillDefault,imputeByMedian,0,0 -imputeByFd,imputeByFd,0,0 -imputeByFd,imputeByMean,0,0 -scale,imputeByFd,imputeByMean,0 -imputeByFd,imputeByMean,0,0 -imputeByFd,imputeByFd,imputeByMean,0 -imputeByMean,imputeByFd,imputeByFd,0 -imputeByFd,imputeByFd,0,0 -imputeByFd,forward_fill,imputeByFd,0 -forward_fill,imputeByFd,imputeByFd,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv deleted file mode 100644 index 3fa4259115a..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv +++ /dev/null @@ -1,3 +0,0 @@ -outlierBySd,winsorize,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -normalize,abstain,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -imputeByMean,outlierBySd,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0