diff --git a/.gitignore b/.gitignore index db5c0057338..5bb5d8a11a6 100644 --- a/.gitignore +++ b/.gitignore @@ -139,3 +139,4 @@ scripts/perftest/fed/temp src/test/scripts/functions/iogen/*.raw src/test/scripts/functions/pipelines/intermediates/regression/*.csv src/test/scripts/functions/pipelines/intermediates/regression/*.csv.mtd +src/test/scripts/functions/pipelines/intermediates/classification/* diff --git a/src/main/java/org/apache/sysds/lops/OperatorOrderingUtils.java b/src/main/java/org/apache/sysds/lops/OperatorOrderingUtils.java index add21f160f1..01acd6dfe47 100644 --- a/src/main/java/org/apache/sysds/lops/OperatorOrderingUtils.java +++ b/src/main/java/org/apache/sysds/lops/OperatorOrderingUtils.java @@ -19,16 +19,15 @@ package org.apache.sysds.lops; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Map; + import org.apache.sysds.common.Types; import org.apache.sysds.hops.AggBinaryOp; import org.apache.sysds.parser.DMLProgram; import org.apache.sysds.parser.StatementBlock; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Map; - public class OperatorOrderingUtils { // Return a list representation of all the lops in a SB diff --git a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java index 230019f68be..d1d4af99b0f 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java +++ b/src/main/java/org/apache/sysds/runtime/compress/CompressedMatrixBlock.java @@ -404,16 +404,18 @@ public void write(DataOutput out) throws IOException { if(nonZeros > 0 && estDisk > estimateUncompressed) { // If the size of this matrixBlock is smaller in uncompressed format, then // decompress and save inside an uncompressed column group. - MatrixBlock uncompressed = getUncompressed( - "smaller serialization size: compressed: " + estDisk + " vs uncompressed: " + estimateUncompressed); - ColGroupUncompressed cg = (ColGroupUncompressed) ColGroupUncompressed.create(uncompressed); - - if( estDisk / 10 > estimateUncompressed){ - LOG.error(this); - } + final String message = "smaller serialization size: compressed: " + estDisk + " vs uncompressed: " + + estimateUncompressed; + final MatrixBlock uncompressed = getUncompressed(message); + uncompressed.examSparsity(true); + // Here only Empty or Uncompressed should be returned. + AColGroup cg = ColGroupUncompressed.create(uncompressed); allocateColGroup(cg); - nonZeros = cg.getNumberNonZeros(rlen); - // clear the soft reference to the decompressed version, since the one column group is perfectly, + // update non zeros, if not fully correct in compressed block + nonZeros = cg.getNumberNonZeros(rlen); + + // Clear the soft reference to the decompressed version, + // since the one column group is perfectly, // representing the decompressed version. clearSoftReferenceToDecompressed(); } @@ -618,7 +620,7 @@ public MatrixBlock unaryOperations(UnaryOperator op, MatrixValue result) { @Override public boolean containsValue(double pattern) { // Only if pattern is a finite value and overlapping then decompress. - if(isOverlapping() && Double.isFinite(pattern)) + if(isOverlapping() && Double.isFinite(pattern)) return getUncompressed("ContainsValue").containsValue(pattern); else { for(AColGroup g : _colGroups) @@ -1072,7 +1074,7 @@ public void clearSoftReferenceToDecompressed() { decompressedVersion = null; } - public void clearCounts(){ + public void clearCounts() { for(AColGroup a : _colGroups) a.clear(); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ADictBasedColGroup.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ADictBasedColGroup.java index f6367661880..e27ffcd9c7e 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ADictBasedColGroup.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ADictBasedColGroup.java @@ -26,6 +26,7 @@ import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary; +import org.apache.sysds.runtime.compress.colgroup.dictionary.IdentityDictionary; import org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary; import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory; import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; @@ -58,7 +59,17 @@ public ADictionary getDictionary() { @Override public final void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR, int offC) { - if(_dict instanceof MatrixBlockDictionary) { + if(_dict instanceof IdentityDictionary){ + + final MatrixBlockDictionary md = ((IdentityDictionary)_dict).getMBDict(); + final MatrixBlock mb = md.getMatrixBlock(); + // The dictionary is never empty. + if(mb.isInSparseFormat()) + decompressToDenseBlockSparseDictionary(db, rl, ru, offR, offC, mb.getSparseBlock()); + else + decompressToDenseBlockDenseDictionary(db, rl, ru, offR, offC, mb.getDenseBlockValues()); + } + else if(_dict instanceof MatrixBlockDictionary) { final MatrixBlockDictionary md = (MatrixBlockDictionary) _dict; final MatrixBlock mb = md.getMatrixBlock(); // The dictionary is never empty. @@ -73,7 +84,17 @@ public final void decompressToDenseBlock(DenseBlock db, int rl, int ru, int offR @Override public final void decompressToSparseBlock(SparseBlock sb, int rl, int ru, int offR, int offC) { - if(_dict instanceof MatrixBlockDictionary) { + if(_dict instanceof IdentityDictionary){ + + final MatrixBlockDictionary md = ((IdentityDictionary)_dict).getMBDict(); + final MatrixBlock mb = md.getMatrixBlock(); + // The dictionary is never empty. + if(mb.isInSparseFormat()) + decompressToSparseBlockSparseDictionary(sb, rl, ru, offR, offC, mb.getSparseBlock()); + else + decompressToSparseBlockDenseDictionary(sb, rl, ru, offR, offC, mb.getDenseBlockValues()); + } + else if(_dict instanceof MatrixBlockDictionary) { final MatrixBlockDictionary md = (MatrixBlockDictionary) _dict; final MatrixBlock mb = md.getMatrixBlock(); // The dictionary is never empty. diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java index b0b2484ca25..2f94c3186a5 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupConst.java @@ -26,6 +26,7 @@ import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; import org.apache.sysds.runtime.compress.colgroup.dictionary.Dictionary; import org.apache.sysds.runtime.compress.colgroup.dictionary.DictionaryFactory; +import org.apache.sysds.runtime.compress.colgroup.dictionary.IdentityDictionary; import org.apache.sysds.runtime.compress.colgroup.dictionary.MatrixBlockDictionary; import org.apache.sysds.runtime.compress.colgroup.indexes.ColIndexFactory; import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; @@ -305,7 +306,14 @@ public AColGroup binaryRowOpRight(BinaryOperator op, double[] v, boolean isRowSa * @param constV The output columns. */ public final void addToCommon(double[] constV) { - if(_dict instanceof MatrixBlockDictionary) { + if(_dict instanceof IdentityDictionary){ + MatrixBlock mb = ((IdentityDictionary) _dict).getMBDict().getMatrixBlock(); + if(mb.isInSparseFormat()) + addToCommonSparse(constV, mb.getSparseBlock()); + else + addToCommonDense(constV, mb.getDenseBlockValues()); + } + else if(_dict instanceof MatrixBlockDictionary) { MatrixBlock mb = ((MatrixBlockDictionary) _dict).getMatrixBlock(); if(mb.isInSparseFormat()) addToCommonSparse(constV, mb.getSparseBlock()); diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java index eacb71f4efb..954122620cf 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCSingleZeros.java @@ -96,10 +96,11 @@ protected void decompressToDenseBlockDenseDictionary(DenseBlock db, int rl, int if(it == null) return; else if(it.value() >= ru) - _indexes.cacheIterator(it, ru); + return; + // _indexes.cacheIterator(it, ru); else { decompressToDenseBlockDenseDictionaryWithProvidedIterator(db, rl, ru, offR, offC, values, it); - _indexes.cacheIterator(it, ru); + // _indexes.cacheIterator(it, ru); } } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java index a1700b16fe7..2e99179eaf4 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/ColGroupSDCZeros.java @@ -113,7 +113,7 @@ else if(it.value() >= ru) _indexes.cacheIterator(it, ru); else { decompressToDenseBlockDenseDictionaryWithProvidedIterator(db, rl, ru, offR, offC, values, it); - _indexes.cacheIterator(it, ru); + // _indexes.cacheIterator(it, ru); } } @@ -322,7 +322,7 @@ private final void decompressToDenseBlockSparseDictionaryPre(DenseBlock db, int it.next(); } - _indexes.cacheIterator(it, ru); + // _indexes.cacheIterator(it, ru); } @Override diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IdentityDictionary.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IdentityDictionary.java index 43afc6ebd81..d80a5cca623 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IdentityDictionary.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IdentityDictionary.java @@ -26,10 +26,10 @@ import java.util.Arrays; import org.apache.commons.lang.NotImplementedException; -import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.compress.DMLCompressionException; import org.apache.sysds.runtime.compress.colgroup.indexes.IColIndex; import org.apache.sysds.runtime.data.SparseBlock; +import org.apache.sysds.runtime.data.SparseBlockFactory; import org.apache.sysds.runtime.functionobjects.Builtin; import org.apache.sysds.runtime.functionobjects.Builtin.BuiltinCode; import org.apache.sysds.runtime.functionobjects.ValueFunction; @@ -61,13 +61,13 @@ public IdentityDictionary(int nRowCol) { @Override public double[] getValues() { - LOG.warn("Should not call getValues on Identity Dictionary"); - - double[] ret = new double[nRowCol * nRowCol]; - for(int i = 0; i < nRowCol; i++) { - ret[(i * nRowCol) + i] = 1; - } - return ret; + throw new DMLCompressionException("Invalid to materialize identity Matrix Please Implement alternative"); + // LOG.warn("Should not call getValues on Identity Dictionary"); + // double[] ret = new double[nRowCol * nRowCol]; + // for(int i = 0; i < nRowCol; i++) { + // ret[(i * nRowCol) + i] = 1; + // } + // return ret; } @Override @@ -383,8 +383,7 @@ public ADictionary subtractTuple(double[] tuple) { } public MatrixBlockDictionary getMBDict() { - throw new DMLRuntimeException("Do not make MB Dict"); - // return getMBDict(nRowCol); + return getMBDict(nRowCol); } @Override @@ -400,10 +399,8 @@ public MatrixBlockDictionary getMBDict(int nCol) { } private MatrixBlockDictionary createMBDict() { - MatrixBlock identity = new MatrixBlock(nRowCol, nRowCol, true); - for(int i = 0; i < nRowCol; i++) - identity.quickSetValue(i, i, 1.0); - + final SparseBlock sb = SparseBlockFactory.createIdentityMatrix(nRowCol); + final MatrixBlock identity = new MatrixBlock(nRowCol, nRowCol, nRowCol, sb); return new MatrixBlockDictionary(identity); } diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IdentityDictionarySlice.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IdentityDictionarySlice.java index 53dfc3227b7..55315b34516 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IdentityDictionarySlice.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/dictionary/IdentityDictionarySlice.java @@ -217,11 +217,6 @@ public long getNumberNonZeros(int[] counts, int nCol) { return (long) sum(counts, nCol); } - - public MatrixBlockDictionary getMBDict() { - return getMBDict(nRowCol); - } - @Override public MatrixBlockDictionary getMBDict(int nCol) { if(cache != null) { diff --git a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java index 1e6767a649f..c17c761cccc 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java +++ b/src/main/java/org/apache/sysds/runtime/compress/colgroup/offset/AOffset.java @@ -95,7 +95,7 @@ public AIterator getIterator(int row) { return getIterator(); else if(row > getOffsetToLast()) return null; - final OffsetCache c = cacheRow.get(); + final OffsetCache c = getLength() < skipStride ? null : cacheRow.get(); if(c != null && c.row == row) return c.it.clone(); else if(getLength() < skipStride) @@ -169,7 +169,7 @@ public AOffsetIterator getOffsetIterator(int row) { * @param row The row index to cache the iterator as. */ public void cacheIterator(AIterator it, int row) { - if(it == null) + if(it == null || getLength() < skipStride) return; cacheRow.set(new OffsetCache(it, row)); } @@ -446,6 +446,12 @@ public boolean equals(AOffset b) { protected abstract AOffset moveIndex(int m); + /** + * Get the length of the underlying array. This does not reflect the number of contained elements, since some of the + * elements can be skips. + * + * @return The length of the underlying arrays + */ protected abstract int getLength(); public OffsetSliceInfo slice(int l, int u) { diff --git a/src/main/java/org/apache/sysds/runtime/compress/io/WriterCompressed.java b/src/main/java/org/apache/sysds/runtime/compress/io/WriterCompressed.java index 8c692fe1581..31852dde5bc 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/io/WriterCompressed.java +++ b/src/main/java/org/apache/sysds/runtime/compress/io/WriterCompressed.java @@ -180,8 +180,10 @@ private void writeMultiBlockCompressed(String fname, MatrixBlock b, final int rl for(int bc = 0; bc * blen < clen; bc++) {// column blocks final int sC = bc * blen; final int mC = Math.min(sC + blen, clen) - 1; + // slice out the current columns final CompressedMatrixBlock mc = CLALibSlice.sliceColumns((CompressedMatrixBlock) b, sC, mC); - final List blocks = CLALibSlice.sliceBlocks(mc, blen); // Slice compressed blocks + // slice out row blocks in this. + final List blocks = CLALibSlice.sliceBlocks(mc, blen, k); // Slice compressed blocks final int blocksPerThread = Math.max(1, blocks.size() / k); for(int block = 0; block < blocks.size(); block += blocksPerThread, i++) { final Path newPath = new Path(fname, IOUtilFunctions.getPartFileName(i)); diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java index f6bb86c30b7..f92533466b0 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibDecompress.java @@ -177,20 +177,18 @@ private static MatrixBlock decompressExecute(CompressedMatrixBlock cmb, int k) { if(k == 1) { if(ret.isInSparseFormat()) { decompressSparseSingleThread(ret, filteredGroups, nRows, blklen); - ret.setNonZeros(nonZeros); } else { decompressDenseSingleThread(ret, filteredGroups, nRows, blklen, constV, eps, nonZeros, overlapping); - ret.recomputeNonZeros(); - // ret.setNonZeros(nonZeros == -1 || overlapping ? ret.recomputeNonZeros() : nonZeros); } } else if(ret.isInSparseFormat()) { decompressSparseMultiThread(ret, filteredGroups, nRows, blklen, k); - ret.setNonZeros(nonZeros); } - else + else{ decompressDenseMultiThread(ret, filteredGroups, nRows, blklen, constV, eps, k, overlapping); + } + ret.recomputeNonZeros(); ret.examSparsity(); return ret; diff --git a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibSlice.java b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibSlice.java index 32fa6c1b595..4b4ab0e4298 100644 --- a/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibSlice.java +++ b/src/main/java/org/apache/sysds/runtime/compress/lib/CLALibSlice.java @@ -21,14 +21,19 @@ import java.util.ArrayList; import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Future; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.compress.CompressedMatrixBlock; import org.apache.sysds.runtime.compress.colgroup.AColGroup; import org.apache.sysds.runtime.compress.colgroup.ColGroupConst; import org.apache.sysds.runtime.data.DenseBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.util.CommonThreadPool; public class CLALibSlice { @@ -39,13 +44,16 @@ public class CLALibSlice { * * @param cmb The input block to slice. * @param blen The length of the blocks. + * @param k The parallelization degree used * @return A list containing CompressedMatrixBlocks or MatrixBlocks */ - public static List sliceBlocks(CompressedMatrixBlock cmb, int blen) { - final List mbs = new ArrayList<>(); - for(int b = 0; b < cmb.getNumRows(); b += blen) - mbs.add(sliceRowsCompressed(cmb, b, Math.min(b + blen, cmb.getNumRows()) - 1)); - return mbs; + public static List sliceBlocks(CompressedMatrixBlock cmb, int blen, int k) { + + if(k <= 1) + return sliceBlocksSingleThread(cmb, blen); + else + return sliceBlocksMultiThread(cmb, blen, k); + } public static MatrixBlock slice(CompressedMatrixBlock cmb, int rl, int ru, int cl, int cu, boolean deep) { @@ -59,6 +67,34 @@ else if(cl == 0 && cu == cmb.getNumColumns() - 1) return sliceInternal(cmb, rl, ru, cl, cu, deep); } + private static List sliceBlocksSingleThread(CompressedMatrixBlock cmb, int blen) { + final List mbs = new ArrayList<>(); + for(int b = 0; b < cmb.getNumRows(); b += blen) + mbs.add(sliceRowsCompressed(cmb, b, Math.min(b + blen, cmb.getNumRows()) - 1)); + return mbs; + } + + private static List sliceBlocksMultiThread(CompressedMatrixBlock cmb, int blen, int k) { + // final List mbs = new ArrayList<>(); + + final ExecutorService pool = CommonThreadPool.get(k); + try { + final ArrayList tasks = new ArrayList<>(); + + for(int b = 0; b < cmb.getNumRows(); b += blen) + tasks.add(new SliceTask(cmb, b, blen)); + final List mbs = new ArrayList<>(tasks.size()); + for(Future f : pool.invokeAll(tasks)) + mbs.add(f.get()); + pool.shutdown(); + return mbs; + } + catch(Exception e) { + pool.shutdown(); + throw new DMLRuntimeException("Failed slicing compressed matrix block", e); + } + } + private static MatrixBlock sliceInternal(CompressedMatrixBlock cmb, int rl, int ru, int cl, int cu, boolean deep) { /** * In the case where an internal matrix is sliced out, then first slice out the columns to an compressed @@ -148,4 +184,21 @@ public static CompressedMatrixBlock sliceColumns(CompressedMatrixBlock cmb, int ret.setOverlapping(cmb.isOverlapping()); return ret; } + + private static class SliceTask implements Callable { + private final CompressedMatrixBlock cmb; + private final int b; + private final int blen; + + private SliceTask(CompressedMatrixBlock cmb, int b, int blen) { + this.cmb = cmb; + this.b = b; + this.blen = blen; + } + + @Override + public MatrixBlock call() throws Exception { + return sliceRowsCompressed(cmb, b, Math.min(b + blen, cmb.getNumRows()) - 1); + } + } } diff --git a/src/main/java/org/apache/sysds/runtime/data/SparseBlockFactory.java b/src/main/java/org/apache/sysds/runtime/data/SparseBlockFactory.java index fdaf3d7460e..18bd68489e0 100644 --- a/src/main/java/org/apache/sysds/runtime/data/SparseBlockFactory.java +++ b/src/main/java/org/apache/sysds/runtime/data/SparseBlockFactory.java @@ -84,4 +84,20 @@ public static long estimateSizeSparseInMemory(SparseBlock.Type type, long nrows, throw new RuntimeException("Unexpected sparse block type: "+type.toString()); } } + + public static SparseBlock createIdentityMatrix(int nRowCol){ + final int[] rowPtr = new int[nRowCol+1]; + final int[] colIdx = new int[nRowCol]; + final double[] vals = new double[nRowCol]; + int nnz = nRowCol; + + for(int i = 0; i < nRowCol; i++){ + rowPtr[i] = i; + colIdx[i] = i; + vals[i] = 1; + } + rowPtr[nRowCol] = nRowCol; // add last index for row pointers. + + return new SparseBlockCSR(rowPtr, colIdx, vals, nnz); + } } diff --git a/src/main/java/org/apache/sysds/runtime/frame/data/columns/StringArray.java b/src/main/java/org/apache/sysds/runtime/frame/data/columns/StringArray.java index 862014b39eb..4e89dbf548a 100644 --- a/src/main/java/org/apache/sysds/runtime/frame/data/columns/StringArray.java +++ b/src/main/java/org/apache/sysds/runtime/frame/data/columns/StringArray.java @@ -561,12 +561,38 @@ public void fill(String value) { @Override public double getAsDouble(int i) { - return _data[i] != null && !_data[i].isEmpty() ? DoubleArray.parseDouble(_data[i]) : 0.0; + if(_data[i] != null && !_data[i].isEmpty()){ + return getAsDouble(_data[i]); + } + else{ + return 0.0; + } } @Override public double getAsNaNDouble(int i) { - return _data[i] != null && !_data[i].isEmpty() ? DoubleArray.parseDouble(_data[i]) : Double.NaN; + if(_data[i] != null && !_data[i].isEmpty()){ + return getAsDouble(_data[i]); + } + else{ + return Double.NaN; + } + } + + private static double getAsDouble(String s){ + try{ + + return DoubleArray.parseDouble(s); + } + catch(Exception e){ + String ls = s.toLowerCase(); + if(ls.equals("true") || ls.equals("t")) + return 1; + else if (ls.equals("false") || ls.equals("f")) + return 0; + else + throw new DMLRuntimeException("Unable to change to double: " + s, e); + } } @Override diff --git a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java index b532dc04be4..7051d3f3015 100644 --- a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java +++ b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderBin.java @@ -19,6 +19,8 @@ package org.apache.sysds.runtime.transform.encode; +import static org.apache.sysds.runtime.util.UtilFunctions.getEndIndex; + import java.io.IOException; import java.io.ObjectInput; import java.io.ObjectOutput; @@ -27,7 +29,6 @@ import java.util.PriorityQueue; import java.util.concurrent.Callable; -import static org.apache.sysds.runtime.util.UtilFunctions.getEndIndex; import org.apache.commons.lang3.tuple.MutableTriple; import org.apache.sysds.api.DMLScript; import org.apache.sysds.lops.Lop; @@ -137,58 +138,67 @@ else if(_binMethod == BinMethod.EQUI_HEIGHT) { protected double getCode(CacheBlock in, int row){ // find the right bucket for a single row - double bin = 0; if( _binMins.length == 0 || _binMaxs.length == 0 ) { LOG.warn("ColumnEncoderBin: applyValue without bucket boundaries, assign 1"); return 1; //robustness in case of missing bins } // Returns NaN if value is missing, so can't be assigned a Bin double inVal = in.getDoubleNaN(row, _colID - 1); - if (Double.isNaN(inVal) || inVal < _binMins[0] || inVal > _binMaxs[_binMaxs.length-1]) - return Double.NaN; - if (_binMethod == BinMethod.EQUI_HEIGHT) { - int ix = Arrays.binarySearch(_binMaxs, inVal); - bin = ((ix < 0) ? Math.abs(ix + 1) : ix) + 1; - } - if (_binMethod == BinMethod.EQUI_WIDTH) { - //TODO: Skip computing bin boundaries for equi-width - double binWidth = (_binMaxs[_binMaxs.length - 1] - _binMins[0]) / _numBin; - double code = Math.ceil((inVal - _binMins[0]) / binWidth); - bin = (code == 0) ? code + 1 : code; - } - return bin; + return getCodeIndex(inVal); } @Override protected double[] getCodeCol(CacheBlock in, int startInd, int blkSize) { // find the right bucket for a block of rows - int endInd = getEndIndex(in.getNumRows(), startInd, blkSize); - double[] codes = new double[endInd-startInd]; + final int endInd = getEndIndex(in.getNumRows(), startInd, blkSize); + final double[] codes = new double[endInd - startInd]; + if (_binMins == null || _binMins.length == 0 || _binMaxs.length == 0) { + LOG.warn("ColumnEncoderBin: applyValue without bucket boundaries, assign 1"); + Arrays.fill(codes, startInd, endInd, 1.0); + return codes; + } for (int i=startInd; i _binMaxs[_binMaxs.length-1]) { - codes[i-startInd] = Double.NaN; - continue; - } - if (_binMethod == BinMethod.EQUI_HEIGHT) { - int ix = Arrays.binarySearch(_binMaxs, inVal); - codes[i-startInd] = ((ix < 0) ? Math.abs(ix + 1) : ix) + 1; - } - if (_binMethod == BinMethod.EQUI_WIDTH) { - //TODO: Skip computing bin boundaries for equi-width - double binWidth = (_binMaxs[_binMaxs.length - 1] - _binMins[0]) / _numBin; - double bin = Math.ceil((inVal - _binMins[0]) / binWidth); - codes[i - startInd] = bin == 0 ? bin + 1 : bin; - } + codes[i- startInd] = getCodeIndex(inVal); } return codes; } + protected double getCodeIndex(double inVal){ + // throw new NotImplementedException("Intensional"); + if (Double.isNaN(inVal) || inVal < _binMins[0] || inVal > _binMaxs[_binMaxs.length-1]){ + return Double.NaN; + } + else if (_binMethod == BinMethod.EQUI_HEIGHT) { + final int ix = Arrays.binarySearch(_binMaxs, inVal); + + if(ix < 0) // somewhere in between values + // +2 because negative values are found from binary search. + // plus 2 to correct for the absolute value of that. + return Math.abs(ix + 1) + 1; + else if (ix == 0) // If first bucket boundary add it there. + return 1; + else + // precisely at boundaries default to lower bucket + // This is done to avoid using an extra bucket for max value. + return Math.min(ix + 1, _binMaxs.length); + } + else { //if (_binMethod == BinMethod.EQUI_WIDTH) { + final double max = _binMaxs[_binMaxs.length-1]; + final double min = _binMins[0]; + + if(max == min){ + return 1; + } + + //TODO: Skip computing bin boundaries for equi-width + double binWidth = (max - min) / _numBin; + double code = Math.ceil((inVal - min) / binWidth); + return (code == 0) ? code + 1 : code; + } + } + + @Override protected TransformType getTransformType() { return TransformType.BIN; @@ -393,6 +403,16 @@ public String toString() { sb.append(getClass().getSimpleName()); sb.append(": "); sb.append(_colID); + sb.append(" --- Method: " + _binMethod + " num Bin: " + _numBin); + if(_binMethod == BinMethod.EQUI_WIDTH){ + + sb.append("\n---- BinMin: "+ Arrays.toString(_binMins)); + sb.append("\n---- BinMax: "+ Arrays.toString(_binMaxs)); + } + else{ + + sb.append(" --- MinMax: "+ _colMins + " " + _colMaxs); + } return sb.toString(); } diff --git a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderComposite.java b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderComposite.java index a033bfa30fc..8b9710f71d6 100644 --- a/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderComposite.java +++ b/src/main/java/org/apache/sysds/runtime/transform/encode/ColumnEncoderComposite.java @@ -435,6 +435,17 @@ public boolean isPassThrough(){ && _columnEncoders.get(0) instanceof ColumnEncoderPassThrough; } + public boolean isBin(){ + return _columnEncoders.size() == 1// + && _columnEncoders.get(0) instanceof ColumnEncoderBin; + } + + public boolean isBinToDummy(){ + return _columnEncoders.size() == 2// + && _columnEncoders.get(0) instanceof ColumnEncoderBin// + && _columnEncoders.get(1) instanceof ColumnEncoderDummycode; + } + private static class ColumnCompositeUpdateDCTask implements Callable { private final ColumnEncoderComposite _encoder; diff --git a/src/main/java/org/apache/sysds/runtime/transform/encode/CompressedEncode.java b/src/main/java/org/apache/sysds/runtime/transform/encode/CompressedEncode.java index 1605e08e7dc..8a60aae6ff7 100644 --- a/src/main/java/org/apache/sysds/runtime/transform/encode/CompressedEncode.java +++ b/src/main/java/org/apache/sysds/runtime/transform/encode/CompressedEncode.java @@ -36,6 +36,7 @@ import org.apache.sysds.runtime.DMLRuntimeException; import org.apache.sysds.runtime.compress.CompressedMatrixBlock; import org.apache.sysds.runtime.compress.colgroup.AColGroup; +import org.apache.sysds.runtime.compress.colgroup.ColGroupConst; import org.apache.sysds.runtime.compress.colgroup.ColGroupDDC; import org.apache.sysds.runtime.compress.colgroup.ColGroupUncompressed; import org.apache.sysds.runtime.compress.colgroup.dictionary.ADictionary; @@ -136,6 +137,10 @@ else if(c.isRecode()) return recode(c); else if(c.isPassThrough()) return passThrough(c); + else if(c.isBin()) + return bin(c); + else if(c.isBinToDummy()) + return binToDummy(c); else throw new NotImplementedException("Not supporting : " + c); } @@ -147,6 +152,8 @@ private AColGroup recodeToDummy(ColumnEncoderComposite c) { HashMap map = a.getRecodeMap(); int domain = map.size(); IColIndex colIndexes = ColIndexFactory.create(0, domain); + if(domain == 1) + return ColGroupConst.create(colIndexes, new double[] {1}); ADictionary d = new IdentityDictionary(colIndexes.size()); AMapToData m = createMappingAMapToData(a, map); List r = c.getEncoders(); @@ -154,6 +161,75 @@ private AColGroup recodeToDummy(ColumnEncoderComposite c) { return ColGroupDDC.create(colIndexes, d, m, null); } + private AColGroup bin(ColumnEncoderComposite c) { + final int colId = c._colID; + final Array a = in.getColumn(colId - 1); + final boolean containsNull = a.containsNull(); + final List r = c.getEncoders(); + final ColumnEncoderBin b = (ColumnEncoderBin) r.get(0); + b.build(in); + final IColIndex colIndexes = ColIndexFactory.create(1); + + ADictionary d = createIncrementingVector(b._numBin, containsNull); + AMapToData m = binEncode(a, b, containsNull); + + AColGroup ret = ColGroupDDC.create(colIndexes, d, m, null); + try{ + + ret.getNumberNonZeros(a.size()); + } + catch(Exception e){ + throw new DMLRuntimeException("Failed binning \n\n" + a + "\n" + b + "\n" + d + "\n" + m,e); + } + return ret; + } + + private AMapToData binEncode(Array a, ColumnEncoderBin b, boolean containsNull) { + AMapToData m = MapToFactory.create(a.size(), b._numBin + (containsNull ? 1 : 0)); + if(containsNull) { + for(int i = 0; i < a.size(); i++) { + double v = a.getAsNaNDouble(i); + if(Double.isNaN(v)) + m.set(i, b._numBin); + else + m.set(i, (int) b.getCodeIndex(v) - 1); + } + } + else { + for(int i = 0; i < a.size(); i++) + m.set(i, (int) b.getCodeIndex(a.getAsDouble(i)) - 1); + } + return m; + } + + private MatrixBlockDictionary createIncrementingVector(int nVals, boolean NaN) { + + MatrixBlock bins = new MatrixBlock(nVals + (NaN ? 1 : 0), 1, false); + for(int i = 0; i < nVals; i++) + bins.quickSetValue(i, 0, i + 1); + if(NaN) + bins.quickSetValue(nVals, 0, Double.NaN); + + return MatrixBlockDictionary.create(bins); + + } + + private AColGroup binToDummy(ColumnEncoderComposite c) { + final int colId = c._colID; + final Array a = in.getColumn(colId - 1); + final boolean containsNull = a.containsNull(); + final List r = c.getEncoders(); + final ColumnEncoderBin b = (ColumnEncoderBin) r.get(0); + b.build(in); + + IColIndex colIndexes = ColIndexFactory.create(0, b._numBin + (containsNull ? 1 : 0)); + ADictionary d = new IdentityDictionary(colIndexes.size()); + AMapToData m = binEncode(a, b, containsNull); + AColGroup ret = ColGroupDDC.create(colIndexes, d, m, null); + ret.getNumberNonZeros(a.size()); + return ret; + } + @SuppressWarnings("unchecked") private AColGroup recode(ColumnEncoderComposite c) { int colId = c._colID; @@ -163,6 +239,8 @@ private AColGroup recode(ColumnEncoderComposite c) { // int domain = c.getDomainSize(); IColIndex colIndexes = ColIndexFactory.create(1); + if(domain == 1) + return ColGroupConst.create(colIndexes, new double[] {1}); MatrixBlock incrementing = new MatrixBlock(domain, 1, false); for(int i = 0; i < domain; i++) incrementing.quickSetValue(i, 0, i + 1); diff --git a/src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java b/src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java index 140a5e18b37..6838cdd1e29 100644 --- a/src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java +++ b/src/main/java/org/apache/sysds/runtime/transform/encode/MultiColumnEncoder.java @@ -111,11 +111,9 @@ else if(k > 1 && !MULTI_THREADED_STAGES && !hasLegacyEncoder()) { try { pool.submitAllAndWait(getEncodeTasks(in, out, pool)); } - catch(ExecutionException | InterruptedException e) { - LOG.error("MT Column encode failed"); - e.printStackTrace(); + finally{ + pool.shutdown(); } - pool.shutdown(); outputMatrixPostProcessing(out); return out; } @@ -142,8 +140,7 @@ else if(k > 1 && !MULTI_THREADED_STAGES && !hasLegacyEncoder()) { } } catch(Exception ex) { - LOG.error("Failed transform-encode frame with \n" + this); - throw ex; + throw new DMLRuntimeException("Failed transform-encode frame with encoder:\n" + this, ex); } } diff --git a/src/test/java/org/apache/sysds/test/component/frame/transform/transformCompressed.java b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestLogger.java similarity index 60% rename from src/test/java/org/apache/sysds/test/component/frame/transform/transformCompressed.java rename to src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestLogger.java index 08a15c9aa2a..59b79bfe5e6 100644 --- a/src/test/java/org/apache/sysds/test/component/frame/transform/transformCompressed.java +++ b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestLogger.java @@ -19,24 +19,34 @@ package org.apache.sysds.test.component.frame.transform; +import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +import java.util.ArrayList; +import java.util.List; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.log4j.Appender; +import org.apache.log4j.AppenderSkeleton; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.apache.log4j.spi.LoggingEvent; import org.apache.sysds.common.Types.ValueType; import org.apache.sysds.runtime.frame.data.FrameBlock; import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.transform.encode.CompressedEncode; import org.apache.sysds.runtime.transform.encode.EncoderFactory; import org.apache.sysds.runtime.transform.encode.MultiColumnEncoder; import org.apache.sysds.test.TestUtils; import org.junit.Test; -public class transformCompressed { - protected static final Log LOG = LogFactory.getLog(transformCompressed.class.getName()); +public class TransformCompressedTestLogger { + protected static final Log LOG = LogFactory.getLog(TransformCompressedTestLogger.class.getName()); private final FrameBlock data; - public transformCompressed() { + public TransformCompressedTestLogger() { try { data = TestUtils.generateRandomFrameBlock(100, new ValueType[] {ValueType.UINT4}, 231); @@ -49,60 +59,35 @@ public transformCompressed() { } } - @Test - public void testRecode() { - test("{recode:[C1]}"); - } - @Test public void testDummyCode() { test("{dummycode:[C1]}"); } - // @Test - // public void testBin() { - // test("{ids:true, bin:[{id:1, method:equi-width, numbins:4}]}"); - // } - - // @Test - // public void testBin2() { - // test("{ids:true, bin:[{id:1, method:equi-width, numbins:100}]}"); - // } - - // @Test - // public void testBin3() { - // test("{ids:true, bin:[{id:1, method:equi-width, numbins:2}]}"); - // } - - // @Test - // public void testBin4() { - // test("{ids:true, bin:[{id:1, method:equi-height, numbins:2}]}"); - // } - - // @Test - // public void testBin5() { - // test("{ids:true, bin:[{id:1, method:equi-height, numbins:10}]}"); - // } - public void test(String spec) { + final TestAppender appender = new TestAppender(); + final Logger logger = Logger.getRootLogger(); + Appender consoleLogger = (Appender) logger.getAllAppenders().nextElement(); try { - + Logger.getLogger(CompressedEncode.class).setLevel(Level.DEBUG); + + FrameBlock meta = null; + logger.removeAppender(consoleLogger); + logger.addAppender(appender); MultiColumnEncoder encoderCompressed = EncoderFactory.createEncoder(spec, data.getColumnNames(), - data.getNumColumns(), meta); + data.getNumColumns(), meta); MatrixBlock outCompressed = encoderCompressed.encode(data, true); FrameBlock outCompressedMD = encoderCompressed.getMetaData(null); MultiColumnEncoder encoderNormal = EncoderFactory.createEncoder(spec, data.getColumnNames(), - data.getNumColumns(), meta); + data.getNumColumns(), meta); MatrixBlock outNormal = encoderNormal.encode(data); FrameBlock outNormalMD = encoderNormal.getMetaData(null); - - - // LOG.error(outNormal); - // LOG.error(outCompressed); - // LOG.error(outCompressedMD); - // LOG.error(outNormalMD); + logger.removeAppender(appender); + logger.addAppender(consoleLogger); + final List log = appender.getLog(); + assertTrue(log.get(3).getMessage().toString().contains("Compression ratio")); TestUtils.compareMatrices(outNormal, outCompressed, 0, "Not Equal after apply"); TestUtils.compareFrames(outNormalMD, outCompressedMD, true); } @@ -110,5 +95,33 @@ public void test(String spec) { e.printStackTrace(); fail(e.getMessage()); } + finally { + logger.removeAppender(appender); + logger.addAppender(consoleLogger); + } + + } + + class TestAppender extends AppenderSkeleton { + private final List log = new ArrayList(); + + @Override + public boolean requiresLayout() { + return false; + } + + @Override + protected void append(final LoggingEvent loggingEvent) { + log.add(loggingEvent); + } + + @Override + public void close() { + } + + public List getLog() { + return new ArrayList(log); + } + } } diff --git a/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestMultiCol.java b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestMultiCol.java new file mode 100644 index 00000000000..e945ad5b269 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestMultiCol.java @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.component.frame.transform; + +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.Collection; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysds.common.Types.ValueType; +import org.apache.sysds.runtime.frame.data.FrameBlock; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.transform.encode.EncoderFactory; +import org.apache.sysds.runtime.transform.encode.MultiColumnEncoder; +import org.apache.sysds.test.TestUtils; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +@RunWith(value = Parameterized.class) +public class TransformCompressedTestMultiCol { + protected static final Log LOG = LogFactory.getLog(TransformCompressedTestMultiCol.class.getName()); + + private final FrameBlock data; + private final int k; + + public TransformCompressedTestMultiCol(FrameBlock data, int k) { + this.data = data; + this.k = k; + } + + @Parameters + public static Collection data() { + final ArrayList tests = new ArrayList<>(); + final int[] threads = new int[] {1, 4}; + try { + + FrameBlock data = TestUtils.generateRandomFrameBlock(100, new ValueType[] {ValueType.UINT4, ValueType.UINT8, ValueType.UINT4}, 231); + data.setSchema(new ValueType[] {ValueType.INT32, ValueType.INT32, ValueType.INT32}); + for(int k : threads) { + tests.add(new Object[] {data, k}); + } + + FrameBlock data2 = TestUtils.generateRandomFrameBlock(100, new ValueType[] {ValueType.BOOLEAN, ValueType.UINT8, ValueType.UINT4}, 231); + data2.setSchema(new ValueType[] {ValueType.BOOLEAN, ValueType.INT32, ValueType.INT32}); + for(int k : threads) { + tests.add(new Object[] {data2, k}); + } + + FrameBlock data3 = new FrameBlock( + new ValueType[] {ValueType.BOOLEAN, ValueType.INT32, ValueType.INT32}, 100) ; + data3.ensureAllocatedColumns(100); + for(int k : threads) + tests.add(new Object[] {data3, k}); + + } + catch(Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + return tests; + } + + @Test + public void testRecode() { + test("{recode:[C1, C2, C3]}"); + } + + @Test + public void testDummyCode() { + test("{dummycode:[C1,C2,C3]}"); + } + + @Test + public void testBin() { + test( + "{ids:true, bin:[{id:1, method:equi-width, numbins:4},{id:2, method:equi-width, numbins:16},{id:3, method:equi-width, numbins:32}]}"); + } + + @Test + public void testBin3() { + test("{ids:true, bin:[{id:2, method:equi-width, numbins:2},{id:3, method:equi-width, numbins:200}]}"); + } + + @Test + public void passThrough() { + test("{ids:true}"); + } + + @Test + public void testBinToDummy() { + test( + "{ids:true, bin:[{id:1, method:equi-height, numbins:10},{id:2, method:equi-height, numbins:10},{id:3, method:equi-height, numbins:40}], dummycode:[1,2,3] }"); + } + + public void test(String spec) { + try { + + FrameBlock meta = null; + MultiColumnEncoder encoderCompressed = EncoderFactory.createEncoder(spec, data.getColumnNames(), + data.getNumColumns(), meta); + + MatrixBlock outCompressed = encoderCompressed.encode(data, k, true); + FrameBlock outCompressedMD = encoderCompressed.getMetaData(null); + MultiColumnEncoder encoderNormal = EncoderFactory.createEncoder(spec, data.getColumnNames(), + data.getNumColumns(), meta); + MatrixBlock outNormal = encoderNormal.encode(data, k); + FrameBlock outNormalMD = encoderNormal.getMetaData(null); + + + TestUtils.compareMatrices(outNormal, outCompressed, 0, "Not Equal after apply"); + TestUtils.compareFrames(outNormalMD, outCompressedMD, true); + } + catch(Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } +} diff --git a/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestSingleCol.java b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestSingleCol.java new file mode 100644 index 00000000000..9855b359fa6 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestSingleCol.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.component.frame.transform; + +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.Collection; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysds.common.Types.ValueType; +import org.apache.sysds.runtime.frame.data.FrameBlock; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.transform.encode.EncoderFactory; +import org.apache.sysds.runtime.transform.encode.MultiColumnEncoder; +import org.apache.sysds.test.TestUtils; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +@RunWith(value = Parameterized.class) +public class TransformCompressedTestSingleCol { + protected static final Log LOG = LogFactory.getLog(TransformCompressedTestSingleCol.class.getName()); + + private final FrameBlock data; + private final int k; + + public TransformCompressedTestSingleCol(FrameBlock data, int k) { + this.data = data; + this.k = k; + } + + @Parameters + public static Collection data() { + final ArrayList tests = new ArrayList<>(); + final int[] threads = new int[] {1, 4}; + try { + + FrameBlock data = TestUtils.generateRandomFrameBlock(100, new ValueType[] {ValueType.UINT4}, 231); + data.setSchema(new ValueType[] {ValueType.INT32}); + for(int k : threads) { + tests.add(new Object[] {data, k}); + } + } + catch(Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + return tests; + } + + @Test + public void testRecode() { + test("{recode:[C1]}"); + } + + @Test + public void testDummyCode() { + test("{dummycode:[C1]}"); + } + + @Test + public void testBin() { + test("{ids:true, bin:[{id:1, method:equi-width, numbins:4}]}"); + } + + @Test + public void testBin2() { + test("{ids:true, bin:[{id:1, method:equi-width, numbins:100}]}"); + } + + @Test + public void testBin3() { + test("{ids:true, bin:[{id:1, method:equi-width, numbins:2}]}"); + } + + @Test + public void testBin4() { + test("{ids:true, bin:[{id:1, method:equi-height, numbins:2}]}"); + } + + @Test + public void testBin5() { + test("{ids:true, bin:[{id:1, method:equi-height, numbins:10}]}"); + } + + @Test + public void passThrough() { + test("{ids:true}"); + } + + @Test + public void testBinToDummy() { + test("{ids:true, bin:[{id:1, method:equi-height, numbins:10}], dummycode:[1] }"); + } + + public void test(String spec) { + try { + + FrameBlock meta = null; + MultiColumnEncoder encoderCompressed = EncoderFactory.createEncoder(spec, data.getColumnNames(), + data.getNumColumns(), meta); + + MatrixBlock outCompressed = encoderCompressed.encode(data, k, true); + FrameBlock outCompressedMD = encoderCompressed.getMetaData(null); + MultiColumnEncoder encoderNormal = EncoderFactory.createEncoder(spec, data.getColumnNames(), + data.getNumColumns(), meta); + MatrixBlock outNormal = encoderNormal.encode(data, k); + FrameBlock outNormalMD = encoderNormal.getMetaData(null); + + TestUtils.compareMatrices(outNormal, outCompressed, 0, "Not Equal after apply"); + TestUtils.compareFrames(outNormalMD, outCompressedMD, true); + } + catch(Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } +} diff --git a/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestSingleColBinSpecific.java b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestSingleColBinSpecific.java new file mode 100644 index 00000000000..fffe66ab959 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCompressedTestSingleColBinSpecific.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.sysds.test.component.frame.transform; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.fail; + +import java.util.ArrayList; +import java.util.Collection; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.sysds.common.Types.ValueType; +import org.apache.sysds.runtime.frame.data.FrameBlock; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.runtime.transform.encode.EncoderFactory; +import org.apache.sysds.runtime.transform.encode.MultiColumnEncoder; +import org.apache.sysds.test.TestUtils; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.junit.runners.Parameterized.Parameters; + +@RunWith(value = Parameterized.class) +public class TransformCompressedTestSingleColBinSpecific { + protected static final Log LOG = LogFactory.getLog(TransformCompressedTestSingleColBinSpecific.class.getName()); + + private final FrameBlock data; + private final int k; + + public TransformCompressedTestSingleColBinSpecific(FrameBlock data, int k) { + this.data = data; + this.k = k; + } + + @Parameters + public static Collection data() { + final ArrayList tests = new ArrayList<>(); + final int[] threads = new int[] {1, 4}; + try { + + FrameBlock data = TestUtils.generateRandomFrameBlock(120, new ValueType[] {ValueType.FP64}, 231); + data.setSchema(new ValueType[] {ValueType.FP64}); + for(int k : threads) { + tests.add(new Object[] {data, k}); + } + + FrameBlock data2 = TestUtils.generateRandomFrameBlock(1200, new ValueType[] {ValueType.FP64}, 231); + data2.setSchema(new ValueType[] {ValueType.FP64}); + for(int k : threads) { + tests.add(new Object[] {data2, k}); + } + } + catch(Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + return tests; + } + + @Test + public void test1() { + test(1); + } + + @Test + public void test2() { + test(2); + } + + @Test + public void test3() { + test(3); + } + + @Test + public void test4() { + test(4); + } + + public void test(int bin) { + test("{ids:true, bin:[{id:1, method:equi-height, numbins:" + bin + "}], dummycode:[1] }"); + } + + public void test(String spec) { + try { + + FrameBlock meta = null; + MultiColumnEncoder encoderCompressed = EncoderFactory.createEncoder(spec, data.getColumnNames(), + data.getNumColumns(), meta); + + MatrixBlock outCompressed = encoderCompressed.encode(data, k, true); + FrameBlock outCompressedMD = encoderCompressed.getMetaData(null); + MultiColumnEncoder encoderNormal = EncoderFactory.createEncoder(spec, data.getColumnNames(), + data.getNumColumns(), meta); + MatrixBlock outNormal = encoderNormal.encode(data, k); + FrameBlock outNormalMD = encoderNormal.getMetaData(null); + + TestUtils.compareMatrices(outNormal, outCompressed, 0, "Not Equal after apply"); + TestUtils.compareFrames(outNormalMD, outCompressedMD, true); + + // Assert that each bucket has the same number of elements + MatrixBlock colSum = outNormal.colSum(); + for(int i = 0; i < colSum.getNumColumns(); i++) + assertEquals(colSum.quickGetValue(0, 0), colSum.quickGetValue(0, i), 0.001); + } + + catch(Exception e) { + e.printStackTrace(); + fail(e.getMessage()); + } + } +} diff --git a/src/test/java/org/apache/sysds/test/component/frame/transform/transformCustomTest.java b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCustomTest.java similarity index 95% rename from src/test/java/org/apache/sysds/test/component/frame/transform/transformCustomTest.java rename to src/test/java/org/apache/sysds/test/component/frame/transform/TransformCustomTest.java index 67b23dd32a0..ce7c5d17d94 100644 --- a/src/test/java/org/apache/sysds/test/component/frame/transform/transformCustomTest.java +++ b/src/test/java/org/apache/sysds/test/component/frame/transform/TransformCustomTest.java @@ -31,12 +31,12 @@ import org.apache.sysds.test.TestUtils; import org.junit.Test; -public class transformCustomTest { - protected static final Log LOG = LogFactory.getLog(transformCustomTest.class.getName()); +public class TransformCustomTest { + protected static final Log LOG = LogFactory.getLog(TransformCustomTest.class.getName()); final FrameBlock data; - public transformCustomTest() { + public TransformCustomTest() { data = TestUtils.generateRandomFrameBlock(100, new ValueType[] {ValueType.UINT8}, 231); data.setSchema(new ValueType[] {ValueType.INT32}); } diff --git a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java index c91f3ddfc5c..77b6078c215 100644 --- a/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java +++ b/src/test/java/org/apache/sysds/test/functions/pipelines/BuiltinTopkCleaningClassificationTest.java @@ -83,7 +83,7 @@ private void runtopkCleaning(String data, String meta, Double sample, int topk, "metaData="+meta, "primitives="+PRIMITIVES, "parameters="+PARAM, "topk="+ topk, "rv="+ resources, "expectedIncrease="+inc, "max_iter="+10, "sample="+sample, "testCV="+cv, "cvk="+cvk, "split="+split, "output="+OUTPUT, "O="+output("O")}; - runTest(true, EXCEPTION_NOT_EXPECTED, null, -1); + runTest(null); //expected loss smaller than default invocation Assert.assertTrue(TestUtils.readDMLBoolean(output("O"))); diff --git a/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeApplyTest.java b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeApplyTest.java index 5aa586df366..ecf29502ab2 100644 --- a/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeApplyTest.java +++ b/src/test/java/org/apache/sysds/test/functions/transform/TransformFrameEncodeApplyTest.java @@ -19,8 +19,8 @@ package org.apache.sysds.test.functions.transform; -import org.junit.Assert; -import org.junit.Test; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; import org.apache.sysds.api.DMLScript; import org.apache.sysds.common.Types.ExecMode; import org.apache.sysds.common.Types.FileFormat; @@ -30,8 +30,12 @@ import org.apache.sysds.test.TestConfiguration; import org.apache.sysds.test.TestUtils; import org.apache.sysds.utils.Statistics; +import org.junit.Assert; +import org.junit.Test; public class TransformFrameEncodeApplyTest extends AutomatedTestBase { + protected static final Log LOG = LogFactory.getLog(TransformFrameEncodeApplyTest.class.getName()); + private final static String TEST_NAME1 = "TransformFrameEncodeApply"; private final static String TEST_DIR = "functions/transform/"; private final static String TEST_CLASS_DIR = TEST_DIR + TransformFrameEncodeApplyTest.class.getSimpleName() + "/"; @@ -462,14 +466,17 @@ private void runTransformTest( ExecMode rt, String ofmt, TransformType type, boo } } else if (type == TransformType.BIN_HEIGHT_DUMMY) { Assert.assertEquals(14, R1[0].length); - for(int i=0; i<7; i++) { - for(int j=0; j<4; j++) { //check dummy coded - Assert.assertEquals((j==BIN_HEIGHT_col3[i]-1)? - 1:0, R1[i][2+j], 1e-8); + // LOG.error(DataConverter.convertToMatrixBlock(R1).slice(0,6)); + for(int i = 0; i < 7; i++) {// first 7 rows. + // LOG.error(Arrays.toString(BIN_HEIGHT_col3)); + // Check the columns specifically that were dummy coded. + for(int j = 0; j < 4; j++) { // check dummy coded + double val = j == BIN_HEIGHT_col3[i] - 1 ? 1 : 0; + Assert.assertEquals(val, R1[i][2 + j], 1e-8); } - for(int j=0; j<3; j++) { //check dummy coded - Assert.assertEquals((j==BIN_HEIGHT_col8[i]-1)? - 1:0, R1[i][10+j], 1e-8); + for(int j = 0; j < 3; j++) { // check dummy coded + double val = j == BIN_HEIGHT_col8[i] - 1 ? 1 : 0; + Assert.assertEquals(val, R1[i][10 + j], 1e-8); } } } else if (type == TransformType.IMPUTE){ diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv deleted file mode 100644 index 5db997f9a8b..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/applyFunc.csv +++ /dev/null @@ -1,3 +0,0 @@ -outlierBySdApply,winsorizeApply,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -normalizeApply,NA,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -imputeByMeanApply,outlierBySdApply,dummycodingApply,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv b/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv deleted file mode 100644 index 39fcddd2e11..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/bestAcc.csv +++ /dev/null @@ -1,3 +0,0 @@ -74.87179487179488 -74.87179487179488 -74.87179487179488 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv b/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv deleted file mode 100644 index fae86940b1f..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/dirtyScore.csv +++ /dev/null @@ -1 +0,0 @@ -74.87179487179488 \ No newline at end of file diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv deleted file mode 100644 index ec2047209de..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/evalHp.csv +++ /dev/null @@ -1 +0,0 @@ -2.0,10.0,0.001 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/featureFrame.csv b/src/test/scripts/functions/pipelines/intermediates/classification/featureFrame.csv deleted file mode 100644 index e9aacae6a5a..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/featureFrame.csv +++ /dev/null @@ -1 +0,0 @@ -#MissingValues,MinVla,MaxVal,AverageMin,AverageMax,#CategoricalFeatures,#NumericFeatures,Mean,#Outliers,#OHEfeatures,#Classes,Imbalance,#rows,#cols,pipelines,accuracy,execution time in ms,CV time in ms diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv deleted file mode 100644 index 49cc34605f8..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/hp.csv +++ /dev/null @@ -1,3 +0,0 @@ -27.0,3.0,3.0,2.0,1.0,0,0,0,1.0,0,2.0,0.05,0.95,0,0,0,0,1.0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -21.0,0,0,0,0,0,0,0,1.0,0.75,0,0,1.0,1.0,2.0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -27.0,0,0,0,0,1.0,0,0,0,2.0,3.0,3.0,2.0,1.0,0,0,0,1.0,0,0,0,0,0,1.0,0,0,0,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv b/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv deleted file mode 100644 index a7151853e90..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/lp.csv +++ /dev/null @@ -1,37 +0,0 @@ -outlierBySd,imputeByMedian,flipLabels,0 -imputeByMedian,outlierBySd,flipLabels,0 -abstain,0,0,0 -imputeByFd,abstain,0,0 -abstain,forward_fill,0,0 -flipLabels,0,0,0 -imputeByMedian,flipLabels,0,0 -flipLabels,forward_fill,0,0 -imputeByFd,flipLabels,forward_fill,0 -imputeByFd,forward_fill,flipLabels,0 -imputeByFd,flipLabels,forward_fill,0 -imputeByFd,flipLabels,0,0 -imputeByFd,imputeByFd,flipLabels,0 -imputeByFd,flipLabels,0,0 -imputeByFd,imputeByMean,flipLabels,0 -tomeklink,imputeByFd,abstain,0 -winsorize,0,0,0 -normalize,winsorize,0,0 -abstain,flipLabels,forward_fill,0 -imputeByMedian,0,0,0 -imputeByFd,0,0,0 -imputeByMean,0,0,0 -mice,0,0,0 -forward_fill,0,0,0 -fillDefault,0,0,0 -SMOTE,0,0,0 -scale,0,0,0 -fillDefault,imputeByMedian,0,0 -imputeByFd,imputeByFd,0,0 -imputeByFd,imputeByMean,0,0 -scale,imputeByFd,imputeByMean,0 -imputeByFd,imputeByMean,0,0 -imputeByFd,imputeByFd,imputeByMean,0 -imputeByMean,imputeByFd,imputeByFd,0 -imputeByFd,imputeByFd,0,0 -imputeByFd,forward_fill,imputeByFd,0 -forward_fill,imputeByFd,imputeByFd,0 diff --git a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv b/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv deleted file mode 100644 index 3fa4259115a..00000000000 --- a/src/test/scripts/functions/pipelines/intermediates/classification/pip.csv +++ /dev/null @@ -1,3 +0,0 @@ -outlierBySd,winsorize,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -normalize,abstain,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -imputeByMean,outlierBySd,dummycoding,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0